It's not necessary to do rounding for alloca operations when the requested
alignment is equal to the stack alignment.
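
One way to read this: the extra rounding of a dynamic alloca is only needed when
the requested alignment exceeds what the stack already guarantees.  A minimal
sketch of that check (names and placement are assumed for illustration only,
not taken from this patch):

    // Only perform the extra rounding when the alloca is over-aligned
    // relative to the stack; otherwise the stack pointer is already
    // sufficiently aligned.
    if (Align > StackAlign)
      Ptr = Ptr & ~(uintptr_t)(Align - 1);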


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@40004 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
new file mode 100644
index 0000000..8134dcc
--- /dev/null
+++ b/lib/Target/ARM/ARM.h
@@ -0,0 +1,109 @@
+//===-- ARM.h - Top-level interface for ARM representation ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the "Instituto Nokia de Tecnologia" and
+// is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// ARM back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_ARM_H
+#define TARGET_ARM_H
+
+#include <iosfwd>
+#include <cassert>
+
+namespace llvm {
+
+class ARMTargetMachine;
+class FunctionPass;
+class MachineCodeEmitter;
+
+// Enums corresponding to ARM condition codes
+namespace ARMCC {
+  enum CondCodes {
+    EQ,
+    NE,
+    HS,
+    LO,
+    MI,
+    PL,
+    VS,
+    VC,
+    HI,
+    LS,
+    GE,
+    LT,
+    GT,
+    LE,
+    AL
+  };
+  
+  inline static CondCodes getOppositeCondition(CondCodes CC){
+    switch (CC) {
+    default: assert(0 && "Unknown condition code");
+    case EQ: return NE;
+    case NE: return EQ;
+    case HS: return LO;
+    case LO: return HS;
+    case MI: return PL;
+    case PL: return MI;
+    case VS: return VC;
+    case VC: return VS;
+    case HI: return LS;
+    case LS: return HI;
+    case GE: return LT;
+    case LT: return GE;
+    case GT: return LE;
+    case LE: return GT;
+    }
+  }
+}
+
+inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
+  switch (CC) {
+  default: assert(0 && "Unknown condition code");
+  case ARMCC::EQ:  return "eq";
+  case ARMCC::NE:  return "ne";
+  case ARMCC::HS:  return "hs";
+  case ARMCC::LO:  return "lo";
+  case ARMCC::MI:  return "mi";
+  case ARMCC::PL:  return "pl";
+  case ARMCC::VS:  return "vs";
+  case ARMCC::VC:  return "vc";
+  case ARMCC::HI:  return "hi";
+  case ARMCC::LS:  return "ls";
+  case ARMCC::GE:  return "ge";
+  case ARMCC::LT:  return "lt";
+  case ARMCC::GT:  return "gt";
+  case ARMCC::LE:  return "le";
+  case ARMCC::AL:  return "al";
+  }
+}
+
+FunctionPass *createARMISelDag(ARMTargetMachine &TM);
+FunctionPass *createARMCodePrinterPass(std::ostream &O, ARMTargetMachine &TM);
+FunctionPass *createARMCodeEmitterPass(ARMTargetMachine &TM,
+                                       MachineCodeEmitter &MCE);
+FunctionPass *createARMLoadStoreOptimizationPass();
+FunctionPass *createARMConstantIslandPass();
+
+} // end namespace llvm;
+
+// Defines symbolic names for ARM registers.  This defines a mapping from
+// register name to register number.
+//
+#include "ARMGenRegisterNames.inc"
+
+// Defines symbolic names for the ARM instructions.
+//
+#include "ARMGenInstrNames.inc"
+
+
+#endif
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
new file mode 100644
index 0000000..0272004
--- /dev/null
+++ b/lib/Target/ARM/ARM.td
@@ -0,0 +1,119 @@
+//===- ARM.td - Describe the ARM Target Machine -------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the "Instituto Nokia de Tecnologia" and
+// is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the ARM target description.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "../Target.td"
+
+//===----------------------------------------------------------------------===//
+// ARM Subtarget features.
+//
+
+def ArchV4T     : SubtargetFeature<"v4t", "ARMArchVersion", "V4T",
+                                   "ARM v4T">;
+def ArchV5T     : SubtargetFeature<"v5t", "ARMArchVersion", "V5T",
+                                   "ARM v5T">;
+def ArchV5TE    : SubtargetFeature<"v5te", "ARMArchVersion", "V5TE",
+                                   "ARM v5TE, v5TEj, v5TExp">;
+def ArchV6      : SubtargetFeature<"v6", "ARMArchVersion", "V6",
+                                   "ARM v6">;
+def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFP2", "true",
+                                   "Enable VFP2 instructions ">;
+
+//===----------------------------------------------------------------------===//
+// ARM Processors supported.
+//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+// V4 Processors.
+def : Proc<"generic",         []>;
+def : Proc<"arm8",            []>;
+def : Proc<"arm810",          []>;
+def : Proc<"strongarm",       []>;
+def : Proc<"strongarm110",    []>;
+def : Proc<"strongarm1100",   []>;
+def : Proc<"strongarm1110",   []>;
+
+// V4T Processors.
+def : Proc<"arm7tdmi",        [ArchV4T]>;
+def : Proc<"arm7tdmi-s",      [ArchV4T]>;
+def : Proc<"arm710t",         [ArchV4T]>;
+def : Proc<"arm720t",         [ArchV4T]>;
+def : Proc<"arm9",            [ArchV4T]>;
+def : Proc<"arm9tdmi",        [ArchV4T]>;
+def : Proc<"arm920",          [ArchV4T]>;
+def : Proc<"arm920t",         [ArchV4T]>;
+def : Proc<"arm922t",         [ArchV4T]>;
+def : Proc<"arm940t",         [ArchV4T]>;
+def : Proc<"ep9312",          [ArchV4T]>;
+
+// V5T Processors.
+def : Proc<"arm10tdmi",       [ArchV5T]>;
+def : Proc<"arm1020t",        [ArchV5T]>;
+
+// V5TE Processors.
+def : Proc<"arm9e",           [ArchV5TE]>;
+def : Proc<"arm926ej-s",      [ArchV5TE]>;
+def : Proc<"arm946e-s",       [ArchV5TE]>;
+def : Proc<"arm966e-s",       [ArchV5TE]>;
+def : Proc<"arm968e-s",       [ArchV5TE]>;
+def : Proc<"arm10e",          [ArchV5TE]>;
+def : Proc<"arm1020e",        [ArchV5TE]>;
+def : Proc<"arm1022e",        [ArchV5TE]>;
+def : Proc<"xscale",          [ArchV5TE]>;
+def : Proc<"iwmmxt",          [ArchV5TE]>;
+
+// V6 Processors.
+def : Proc<"arm1136j-s",      [ArchV6]>;
+def : Proc<"arm1136jf-s",     [ArchV6, FeatureVFP2]>;
+def : Proc<"arm1176jz-s",     [ArchV6]>;
+def : Proc<"arm1176jzf-s",    [ArchV6, FeatureVFP2]>;
+def : Proc<"mpcorenovfp",     [ArchV6]>;
+def : Proc<"mpcore",          [ArchV6, FeatureVFP2]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "ARMRegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "ARMInstrInfo.td"
+
+def ARMInstrInfo : InstrInfo {
+  // Define how we want to layout our target-specific information field.
+  let TSFlagsFields = ["AddrModeBits",
+                       "SizeFlag",
+                       "IndexModeBits",
+                       "Opcode"];
+  let TSFlagsShifts = [0,
+                       4,
+                       7,
+                       9];
+}
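+
+// For illustration: with the shift amounts above, AddrModeBits occupies bits
+// 0-3 of TSFlags, SizeFlag bits 4-6, IndexModeBits bits 7-8, and Opcode the
+// bits from 9 upwards.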
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def ARM : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = ARMInstrInfo;
+}
diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/ARMAddressingModes.h
new file mode 100644
index 0000000..3f47a69
--- /dev/null
+++ b/lib/Target/ARM/ARMAddressingModes.h
@@ -0,0 +1,394 @@
+//===- ARMAddressingModes.h - ARM Addressing Modes --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM addressing mode implementation stuff.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
+#define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
+
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+
+namespace llvm {
+  
+/// ARM_AM - ARM Addressing Mode Stuff
+namespace ARM_AM {
+  enum ShiftOpc {
+    no_shift = 0,
+    asr,
+    lsl,
+    lsr,
+    ror,
+    rrx
+  };
+  
+  enum AddrOpc {
+    add = '+', sub = '-'
+  };
+  
+  static inline const char *getShiftOpcStr(ShiftOpc Op) {
+    switch (Op) {
+    default: assert(0 && "Unknown shift opc!");
+    case ARM_AM::asr: return "asr";
+    case ARM_AM::lsl: return "lsl";
+    case ARM_AM::lsr: return "lsr";
+    case ARM_AM::ror: return "ror";
+    case ARM_AM::rrx: return "rrx";
+    }
+  }
+  
+  static inline ShiftOpc getShiftOpcForNode(SDOperand N) {
+    switch (N.getOpcode()) {
+    default:          return ARM_AM::no_shift;
+    case ISD::SHL:    return ARM_AM::lsl;
+    case ISD::SRL:    return ARM_AM::lsr;
+    case ISD::SRA:    return ARM_AM::asr;
+    case ISD::ROTR:   return ARM_AM::ror;
+    //case ISD::ROTL:  // Only if imm -> turn into ROTR.
+    // Can't handle RRX here, because it would require folding a flag into
+    // the addressing mode.  :(  This causes us to miss certain things.
+    //case ARMISD::RRX: return ARM_AM::rrx;
+    }
+  }
+
+  enum AMSubMode {
+    bad_am_submode = 0,
+    ia,
+    ib,
+    da,
+    db
+  };
+
+  static inline const char *getAMSubModeStr(AMSubMode Mode) {
+    switch (Mode) {
+    default: assert(0 && "Unknown addressing sub-mode!");
+    case ARM_AM::ia: return "ia";
+    case ARM_AM::ib: return "ib";
+    case ARM_AM::da: return "da";
+    case ARM_AM::db: return "db";
+    }
+  }
+
+  static inline const char *getAMSubModeAltStr(AMSubMode Mode, bool isLD) {
+    switch (Mode) {
+    default: assert(0 && "Unknown addressing sub-mode!");
+    case ARM_AM::ia: return isLD ? "fd" : "ea";
+    case ARM_AM::ib: return isLD ? "ed" : "fa";
+    case ARM_AM::da: return isLD ? "fa" : "ed";
+    case ARM_AM::db: return isLD ? "ea" : "fd";
+    }
+  }
+
+  /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits.
+  ///
+  static inline unsigned rotr32(unsigned Val, unsigned Amt) {
+    assert(Amt < 32 && "Invalid rotate amount");
+    return (Val >> Amt) | (Val << ((32-Amt)&31));
+  }
+  
+  /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits.
+  ///
+  static inline unsigned rotl32(unsigned Val, unsigned Amt) {
+    assert(Amt < 32 && "Invalid rotate amount");
+    return (Val << Amt) | (Val >> ((32-Amt)&31));
+  }
+  
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #1: shift_operand with registers
+  //===--------------------------------------------------------------------===//
+  //
+  // This 'addressing mode' is used for arithmetic instructions.  It can
+  // represent things like:
+  //   reg
+  //   reg [asr|lsl|lsr|ror|rrx] reg
+  //   reg [asr|lsl|lsr|ror|rrx] imm
+  //
+  // This is stored as three operands [rega, regb, opc].  The first is the base
+  // reg, the second is the shift-amount register (or reg0 if the shift amount
+  // is an immediate or absent).  The third operand encodes the shift opcode
+  // and the immediate shift amount when a register isn't present.
+  //
+  static inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) {
+    return ShOp | (Imm << 3);
+  }
+  static inline unsigned getSORegOffset(unsigned Op) {
+    return Op >> 3;
+  }
+  static inline ShiftOpc getSORegShOp(unsigned Op) {
+    return (ShiftOpc)(Op & 7);
+  }
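+
+  // For example (illustrative values): a shifter operand like "r5, lsl #3" is
+  // encoded as getSORegOpc(lsl, 3) == (2 | (3 << 3)) == 0x1a, and decodes back
+  // via getSORegShOp(0x1a) == lsl and getSORegOffset(0x1a) == 3.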
+
+  /// getSOImmValImm - Given an encoded imm field for the reg/imm form, return
+  /// the 8-bit imm value.
+  static inline unsigned getSOImmValImm(unsigned Imm) {
+    return Imm & 0xFF;
+  }
+  /// getSOImmValRot - Given an encoded imm field for the reg/imm form, return
+  /// the rotate amount.
+  static inline unsigned getSOImmValRot(unsigned Imm) {
+    return (Imm >> 8) * 2;
+  }
+  
+  /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand,
+  /// computing the rotate amount to use.  If this immediate value cannot be
+  /// handled with a single shifter-op, determine a good rotate amount that will
+  /// take a maximal chunk of bits out of the immediate.
+  static inline unsigned getSOImmValRotate(unsigned Imm) {
+    // 8-bit (or less) immediates are trivially shifter_operands with a rotate
+    // of zero.
+    if ((Imm & ~255U) == 0) return 0;
+    
+    // Use CTZ to compute the rotate amount.
+    unsigned TZ = CountTrailingZeros_32(Imm);
+    
+    // Rotate amount must be even.  Something like 0x200 must be rotated 8 bits,
+    // not 9.
+    unsigned RotAmt = TZ & ~1;
+    
+    // If we can handle this spread, return it.
+    if ((rotr32(Imm, RotAmt) & ~255U) == 0)
+      return (32-RotAmt)&31;  // HW rotates right, not left.
+
+    // For values like 0xF000000F, we should skip the first run of ones, then
+    // retry the hunt.
+    if (Imm & 1) {
+      unsigned TrailingOnes = CountTrailingZeros_32(~Imm);
+      if (TrailingOnes != 32) {  // Avoid overflow on 0xFFFFFFFF
+        // Restart the search for a high-order bit after the initial run of
+        // ones.
+        unsigned TZ2 = CountTrailingZeros_32(Imm & ~((1 << TrailingOnes)-1));
+      
+        // Rotate amount must be even.
+        unsigned RotAmt2 = TZ2 & ~1;
+        
+        // If this fits, use it.
+        if (RotAmt2 != 32 && (rotr32(Imm, RotAmt2) & ~255U) == 0)
+          return (32-RotAmt2)&31;  // HW rotates right, not left.
+      }
+    }
+    
+    // Otherwise, we have no way to cover this span of bits with a single
+    // shifter_op immediate.  Return a chunk of bits that will be useful to
+    // handle.
+    return (32-RotAmt)&31;  // HW rotates right, not left.
+  }
+
+  /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit
+  /// into a shifter_operand immediate operand, return the 12-bit encoding for
+  /// it.  If not, return -1.
+  static inline int getSOImmVal(unsigned Arg) {
+    // 8-bit (or less) immediates are trivially shifter_operands with a rotate
+    // of zero.
+    if ((Arg & ~255U) == 0) return Arg;
+    
+    unsigned RotAmt = getSOImmValRotate(Arg);
+
+    // If this cannot be handled with a single shifter_op, bail out.
+    if (rotr32(~255U, RotAmt) & Arg)
+      return -1;
+      
+    // Encode this correctly.
+    return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8);
+  }
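+
+  // Worked example (illustrative): getSOImmVal(0x200) computes a rotate amount
+  // of 24 and returns 2 | (12 << 8) == 0xC02, i.e. the 8-bit value 2 rotated
+  // right by 24 bits; getSOImmValImm/getSOImmValRot and rotr32(2, 24) recover
+  // the original 0x200.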
+  
+  /// isSOImmTwoPartVal - Return true if the specified value can be obtained by
+  /// or'ing together two SOImmVal's.
+  static inline bool isSOImmTwoPartVal(unsigned V) {
+    // If this can be handled with a single shifter_op, bail out.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+    if (V == 0)
+      return false;
+    
+    // If this can be handled with two shifter_op's, accept.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+    return V == 0;
+  }
+  
+  /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal,
+  /// return the first chunk of it.
+  static inline unsigned getSOImmTwoPartFirst(unsigned V) {
+    return rotr32(255U, getSOImmValRotate(V)) & V;
+  }
+
+  /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal,
+  /// return the second chunk of it.
+  static inline unsigned getSOImmTwoPartSecond(unsigned V) {
+    // Mask out the first chunk.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+    
+    // Take what's left.
+    assert(V == (rotr32(255U, getSOImmValRotate(V)) & V));
+    return V;
+  }
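+
+  // Illustrative example: 0x00FF00FF is not a single so_imm (getSOImmVal
+  // returns -1), but isSOImmTwoPartVal(0x00FF00FF) is true;
+  // getSOImmTwoPartFirst returns 0xFF and getSOImmTwoPartSecond returns
+  // 0x00FF0000, so the value can be built with a mov of the first chunk
+  // followed by an orr of the second.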
+  
+  /// getThumbImmValShift - Try to handle Imm with an 8-bit immediate followed
+  /// by a left shift. Returns the shift amount to use.
+  static inline unsigned getThumbImmValShift(unsigned Imm) {
+    // 8-bit (or less) immediates are trivially immediate operands with a shift
+    // of zero.
+    if ((Imm & ~255U) == 0) return 0;
+
+    // Use CTZ to compute the shift amount.
+    return CountTrailingZeros_32(Imm);
+  }
+
+  /// isThumbImmShiftedVal - Return true if the specified value can be obtained
+  /// by left shifting an 8-bit immediate.
+  static inline bool isThumbImmShiftedVal(unsigned V) {
+    // If V can be formed by left-shifting an 8-bit immediate, masking off that
+    // shifted immediate leaves nothing behind.
+    V = (~255U << getThumbImmValShift(V)) & V;
+    return V == 0;
+  }
+
+  /// getThumbImmNonShiftedVal - If V is a value that satisfies
+  /// isThumbImmShiftedVal, return the non-shifted value.
+  static inline unsigned getThumbImmNonShiftedVal(unsigned V) {
+    return V >> getThumbImmValShift(V);
+  }
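+
+  // For example (illustrative value): 0x500 == 5 << 8, so
+  // getThumbImmValShift(0x500) == 8, isThumbImmShiftedVal(0x500) is true, and
+  // getThumbImmNonShiftedVal(0x500) == 5.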
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #2
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for most simple load/store instructions.
+  //
+  // addrmode2 := reg +/- reg shop imm
+  // addrmode2 := reg +/- imm12
+  //
+  // The first operand is always a Reg.  The second operand is a reg if in
+  // reg/reg form, otherwise it's reg#0.  The third field encodes the operation
+  // in bit 12, the immediate in bits 0-11, and the shift op in 13-15.
+  //
+  // If this addressing mode is a frame index (before prolog/epilog insertion
+  // and code rewriting), this operand will have the form:  FI#, reg0, <offs>
+  // with no shift amount for the frame offset.
+  // 
+  static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO) {
+    assert(Imm12 < (1 << 12) && "Imm too large!");
+    bool isSub = Opc == sub;
+    return Imm12 | ((int)isSub << 12) | (SO << 13);
+  }
+  static inline unsigned getAM2Offset(unsigned AM2Opc) {
+    return AM2Opc & ((1 << 12)-1);
+  }
+  static inline AddrOpc getAM2Op(unsigned AM2Opc) {
+    return ((AM2Opc >> 12) & 1) ? sub : add;
+  }
+  static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) {
+    return (ShiftOpc)(AM2Opc >> 13);
+  }
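+
+  // Worked example (illustrative): an operand like [Rn, #-4] is encoded as
+  // getAM2Opc(sub, 4, no_shift) == 4 | (1 << 12) == 0x1004, and decodes back
+  // via getAM2Offset() == 4, getAM2Op() == sub, getAM2ShiftOpc() == no_shift.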
+  
+  
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #3
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for sign-extending loads, and load/store-pair instructions.
+  //
+  // addrmode3 := reg +/- reg
+  // addrmode3 := reg +/- imm8
+  //
+  // The first operand is always a Reg.  The second operand is a reg if in
+  // reg/reg form, otherwise it's reg#0.  The third field encodes the operation
+  // in bit 8, the immediate in bits 0-7.
+  
+  /// getAM3Opc - This function encodes the addrmode3 opc field.
+  static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset) {
+    bool isSub = Opc == sub;
+    return ((int)isSub << 8) | Offset;
+  }
+  static inline unsigned char getAM3Offset(unsigned AM3Opc) {
+    return AM3Opc & 0xFF;
+  }
+  static inline AddrOpc getAM3Op(unsigned AM3Opc) {
+    return ((AM3Opc >> 8) & 1) ? sub : add;
+  }
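+
+  // For example (illustrative): [Rn, #-4] is encoded as getAM3Opc(sub, 4) ==
+  // (1 << 8) | 4 == 0x104, with getAM3Offset() == 4 and getAM3Op() == sub.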
+  
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #4
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for load / store multiple instructions.
+  //
+  // addrmode4 := reg, <mode>
+  //
+  // The four modes are:
+  //    IA - Increment after
+  //    IB - Increment before
+  //    DA - Decrement after
+  //    DB - Decrement before
+  //
+  // If the 4th bit (writeback) is set, then the base register is updated after
+  // the memory transfer.
+
+  static inline AMSubMode getAM4SubMode(unsigned Mode) {
+    return (AMSubMode)(Mode & 0x7);
+  }
+
+  static inline unsigned getAM4ModeImm(AMSubMode SubMode, bool WB = false) {
+    return (int)SubMode | ((int)WB << 3);
+  }
+
+  static inline bool getAM4WBFlag(unsigned Mode) {
+    return (Mode >> 3) & 1;
+  }
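+
+  // Illustrative example: an "ia" mode with base writeback (e.g. ldmia sp!) is
+  // encoded as getAM4ModeImm(ia, true) == 1 | (1 << 3) == 9, and decodes back
+  // via getAM4SubMode(9) == ia and getAM4WBFlag(9) == true.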
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #5
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for coprocessor instructions, such as FP load/stores.
+  //
+  // addrmode5 := reg +/- imm8*4
+  //
+  // The first operand is always a Reg.  The second operand encodes the
+  // operation in bit 8 and the immediate in bits 0-7.
+  //
+  // This can also be used for FP load/store multiple ops.  In that case the
+  // second operand encodes the writeback mode in bit 8 and the number of
+  // registers (or 2 times the number of registers for DPR ops) in bits 0-7.
+  // In addition, bits 9-11 encode one of the following two sub-modes:
+  //
+  //    IA - Increment after
+  //    DB - Decrement before
+  
+  /// getAM5Opc - This function encodes the addrmode5 opc field.
+  static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
+    bool isSub = Opc == sub;
+    return ((int)isSub << 8) | Offset;
+  }
+  static inline unsigned char getAM5Offset(unsigned AM5Opc) {
+    return AM5Opc & 0xFF;
+  }
+  static inline AddrOpc getAM5Op(unsigned AM5Opc) {
+    return ((AM5Opc >> 8) & 1) ? sub : add;
+  }
+
+  /// getAM5Opc - This function encodes the addrmode5 opc field for FLDM and
+  /// FSTM instructions.
+  static inline unsigned getAM5Opc(AMSubMode SubMode, bool WB,
+                                   unsigned char Offset) {
+    assert((SubMode == ia || SubMode == db) &&
+           "Illegal addressing mode 5 sub-mode!");
+    return ((int)SubMode << 9) | ((int)WB << 8) | Offset;
+  }
+  static inline AMSubMode getAM5SubMode(unsigned AM5Opc) {
+    return (AMSubMode)((AM5Opc >> 9) & 0x7);
+  }
+  static inline bool getAM5WBFlag(unsigned AM5Opc) {
+    return ((AM5Opc >> 8) & 1);
+  }
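+
+  // Worked examples (illustrative): [Rn, #-8] is encoded as getAM5Opc(sub, 2)
+  // == (1 << 8) | 2 == 0x102 (the offset is in words, scaled by 4 when
+  // printed).  For FLDM/FSTM, getAM5Opc(db, true, 4) == (4 << 9) | (1 << 8) |
+  // 4 == 0x904, i.e. a decrement-before transfer with writeback and a count of
+  // 4 in the register field.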
+  
+} // end namespace ARM_AM
+} // end namespace llvm
+
+#endif
+
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
new file mode 100644
index 0000000..5e65226
--- /dev/null
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -0,0 +1,1029 @@
+//===-- ARMAsmPrinter.cpp - ARM LLVM assembly writer ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the "Instituto Nokia de Tecnologia" and
+// is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format ARM assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "ARM.h"
+#include "ARMTargetMachine.h"
+#include "ARMAddressingModes.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/MathExtras.h"
+#include <cctype>
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+namespace {
+  struct VISIBILITY_HIDDEN ARMAsmPrinter : public AsmPrinter {
+    ARMAsmPrinter(std::ostream &O, TargetMachine &TM, const TargetAsmInfo *T)
+      : AsmPrinter(O, TM, T), DW(O, this, T), AFI(NULL), InCPMode(false) {
+      Subtarget = &TM.getSubtarget<ARMSubtarget>();
+    }
+
+    DwarfWriter DW;
+
+    /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+    /// make the right decision when printing asm code for different targets.
+    const ARMSubtarget *Subtarget;
+
+    /// AFI - Keep a pointer to ARMFunctionInfo for the current
+    /// MachineFunction
+    ARMFunctionInfo *AFI;
+
+    /// We name each basic block in a Function with a unique number, so
+    /// that we can consistently refer to them later. This is cleared
+    /// at the beginning of each call to runOnMachineFunction().
+    ///
+    typedef std::map<const Value *, unsigned> ValueMapTy;
+    ValueMapTy NumberForBB;
+
+    /// Keeps the set of GlobalValues that require non-lazy-pointers for
+    /// indirect access.
+    std::set<std::string> GVNonLazyPtrs;
+
+    /// Keeps the set of external function GlobalAddresses that the asm
+    /// printer should generate stubs for.
+    std::set<std::string> FnStubs;
+
+    /// True if asm printer is printing a series of CONSTPOOL_ENTRY.
+    bool InCPMode;
+    
+    virtual const char *getPassName() const {
+      return "ARM Assembly Printer";
+    }
+
+    void printOperand(const MachineInstr *MI, int opNum,
+                      const char *Modifier = 0);
+    void printSOImmOperand(const MachineInstr *MI, int opNum);
+    void printSOImm2PartOperand(const MachineInstr *MI, int opNum);
+    void printSORegOperand(const MachineInstr *MI, int opNum);
+    void printAddrMode2Operand(const MachineInstr *MI, int OpNo);
+    void printAddrMode2OffsetOperand(const MachineInstr *MI, int OpNo);
+    void printAddrMode3Operand(const MachineInstr *MI, int OpNo);
+    void printAddrMode3OffsetOperand(const MachineInstr *MI, int OpNo);
+    void printAddrMode4Operand(const MachineInstr *MI, int OpNo,
+                               const char *Modifier = 0);
+    void printAddrMode5Operand(const MachineInstr *MI, int OpNo,
+                               const char *Modifier = 0);
+    void printAddrModePCOperand(const MachineInstr *MI, int OpNo,
+                                const char *Modifier = 0);
+    void printThumbAddrModeRROperand(const MachineInstr *MI, int OpNo);
+    void printThumbAddrModeRI5Operand(const MachineInstr *MI, int OpNo,
+                                      unsigned Scale);
+    void printThumbAddrModeS1Operand(const MachineInstr *MI, int OpNo);
+    void printThumbAddrModeS2Operand(const MachineInstr *MI, int OpNo);
+    void printThumbAddrModeS4Operand(const MachineInstr *MI, int OpNo);
+    void printThumbAddrModeSPOperand(const MachineInstr *MI, int OpNo);
+    void printPredicateOperand(const MachineInstr *MI, int opNum);
+    void printSBitModifierOperand(const MachineInstr *MI, int opNum);
+    void printPCLabel(const MachineInstr *MI, int opNum);
+    void printRegisterList(const MachineInstr *MI, int opNum);
+    void printCPInstOperand(const MachineInstr *MI, int opNum,
+                            const char *Modifier);
+    void printJTBlockOperand(const MachineInstr *MI, int opNum);
+
+    virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                 unsigned AsmVariant, const char *ExtraCode);
+
+    bool printInstruction(const MachineInstr *MI);  // autogenerated.
+    void printMachineInstruction(const MachineInstr *MI);
+    bool runOnMachineFunction(MachineFunction &F);
+    bool doInitialization(Module &M);
+    bool doFinalization(Module &M);
+
+    virtual void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+      printDataDirective(MCPV->getType());
+
+      ARMConstantPoolValue *ACPV = (ARMConstantPoolValue*)MCPV;
+      GlobalValue *GV = ACPV->getGV();
+      std::string Name = GV ? Mang->getValueName(GV) : TAI->getGlobalPrefix();
+      if (!GV)
+        Name += ACPV->getSymbol();
+      if (ACPV->isNonLazyPointer()) {
+        GVNonLazyPtrs.insert(Name);
+        O << TAI->getPrivateGlobalPrefix() << Name << "$non_lazy_ptr";
+      } else if (ACPV->isStub()) {
+        FnStubs.insert(Name);
+        O << TAI->getPrivateGlobalPrefix() << Name << "$stub";
+      } else
+        O << Name;
+      if (ACPV->hasModifier()) O << "(" << ACPV->getModifier() << ")";
+      if (ACPV->getPCAdjustment() != 0) {
+        O << "-(" << TAI->getPrivateGlobalPrefix() << "PC"
+          << utostr(ACPV->getLabelId())
+          << "+" << (unsigned)ACPV->getPCAdjustment();
+         if (ACPV->mustAddCurrentAddress())
+           O << "-.";
+         O << ")";
+      }
+      O << "\n";
+
+      // If the constant pool value is an extern weak symbol, remember to emit
+      // the weak reference.
+      if (GV && GV->hasExternalWeakLinkage())
+        ExtWeakSymbols.insert(GV);
+    }
+    
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+      AU.addRequired<MachineModuleInfo>();
+    }
+  };
+} // end of anonymous namespace
+
+#include "ARMGenAsmWriter.inc"
+
+/// createARMCodePrinterPass - Returns a pass that prints the ARM
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description.  This should work
+/// regardless of whether the function is in SSA form.
+///
+FunctionPass *llvm::createARMCodePrinterPass(std::ostream &o,
+                                             ARMTargetMachine &tm) {
+  return new ARMAsmPrinter(o, tm, tm.getTargetAsmInfo());
+}
+
+/// runOnMachineFunction - This uses the printInstruction()
+/// method to print assembly for each instruction.
+///
+bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  AFI = MF.getInfo<ARMFunctionInfo>();
+
+  DW.SetModuleInfo(&getAnalysis<MachineModuleInfo>());
+
+  SetupMachineFunction(MF);
+  O << "\n";
+
+  // NOTE: we don't print out constant pools here; they are handled as
+  // instructions.
+
+  O << "\n";
+  // Print out labels for the function.
+  const Function *F = MF.getFunction();
+  switch (F->getLinkage()) {
+  default: assert(0 && "Unknown linkage type!");
+  case Function::InternalLinkage:
+    SwitchToTextSection("\t.text", F);
+    break;
+  case Function::ExternalLinkage:
+    SwitchToTextSection("\t.text", F);
+    O << "\t.globl\t" << CurrentFnName << "\n";
+    break;
+  case Function::WeakLinkage:
+  case Function::LinkOnceLinkage:
+    if (Subtarget->isTargetDarwin()) {
+      SwitchToTextSection(
+                ".section __TEXT,__textcoal_nt,coalesced,pure_instructions", F);
+      O << "\t.globl\t" << CurrentFnName << "\n";
+      O << "\t.weak_definition\t" << CurrentFnName << "\n";
+    } else {
+      O << TAI->getWeakRefDirective() << CurrentFnName << "\n";
+    }
+    break;
+  }
+
+  const char *VisibilityDirective = NULL;
+  if (F->hasHiddenVisibility())
+    VisibilityDirective = TAI->getHiddenDirective();
+  else if (F->hasProtectedVisibility())
+    VisibilityDirective = TAI->getProtectedDirective();
+
+  if (VisibilityDirective)
+    O << VisibilityDirective << CurrentFnName << "\n";
+
+  if (AFI->isThumbFunction()) {
+    EmitAlignment(1, F, AFI->getAlign());
+    O << "\t.code\t16\n";
+    O << "\t.thumb_func";
+    if (Subtarget->isTargetDarwin())
+      O << "\t" << CurrentFnName;
+    O << "\n";
+    InCPMode = false;
+  } else
+    EmitAlignment(2, F);
+
+  O << CurrentFnName << ":\n";
+  // Emit pre-function debug information.
+  DW.BeginFunction(&MF);
+
+  // Print out code for the function.
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    // Print a label for the basic block.
+    if (I != MF.begin()) {
+      printBasicBlockLabel(I, true);
+      O << '\n';
+    }
+    for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+         II != E; ++II) {
+      // Print the assembly for the instruction.
+      printMachineInstruction(II);
+    }
+  }
+
+  if (TAI->hasDotTypeDotSizeDirective())
+    O << "\t.size " << CurrentFnName << ", .-" << CurrentFnName << "\n";
+
+  // Emit post-function debug information.
+  DW.EndFunction();
+
+  return false;
+}
+
+void ARMAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                 const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    if (MRegisterInfo::isPhysicalRegister(MO.getReg()))
+      O << TM.getRegisterInfo()->get(MO.getReg()).Name;
+    else
+      assert(0 && "not implemented");
+    break;
+  case MachineOperand::MO_Immediate: {
+    if (!Modifier || strcmp(Modifier, "no_hash") != 0)
+      O << "#";
+
+    O << (int)MO.getImmedValue();
+    break;
+  }
+  case MachineOperand::MO_MachineBasicBlock:
+    printBasicBlockLabel(MO.getMachineBasicBlock());
+    return;
+  case MachineOperand::MO_GlobalAddress: {
+    bool isCallOp = Modifier && !strcmp(Modifier, "call");
+    GlobalValue *GV = MO.getGlobal();
+    std::string Name = Mang->getValueName(GV);
+    bool isExt = (GV->isDeclaration() || GV->hasWeakLinkage() ||
+                  GV->hasLinkOnceLinkage());
+    if (isExt && isCallOp && Subtarget->isTargetDarwin() &&
+        TM.getRelocationModel() != Reloc::Static) {
+      O << TAI->getPrivateGlobalPrefix() << Name << "$stub";
+      FnStubs.insert(Name);
+    } else
+      O << Name;
+    
+    if (MO.getOffset() > 0)
+      O << '+' << MO.getOffset();
+    else if (MO.getOffset() < 0)
+      O << MO.getOffset();
+    
+    if (isCallOp && Subtarget->isTargetELF() &&
+        TM.getRelocationModel() == Reloc::PIC_)
+      O << "(PLT)";
+    if (GV->hasExternalWeakLinkage())
+      ExtWeakSymbols.insert(GV);
+    break;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    bool isCallOp = Modifier && !strcmp(Modifier, "call");
+    std::string Name(TAI->getGlobalPrefix());
+    Name += MO.getSymbolName();
+    if (isCallOp && Subtarget->isTargetDarwin() &&
+        TM.getRelocationModel() != Reloc::Static) {
+      O << TAI->getPrivateGlobalPrefix() << Name << "$stub";
+      FnStubs.insert(Name);
+    } else
+      O << Name;
+    if (isCallOp && Subtarget->isTargetELF() &&
+        TM.getRelocationModel() == Reloc::PIC_)
+      O << "(PLT)";
+    break;
+  }
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+      << '_' << MO.getConstantPoolIndex();
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+      << '_' << MO.getJumpTableIndex();
+    break;
+  default:
+    O << "<unknown operand type>"; abort (); break;
+  }
+}
+
+static void printSOImm(std::ostream &O, int64_t V, const TargetAsmInfo *TAI) {
+  assert(V < (1 << 12) && "Not a valid so_imm value!");
+  unsigned Imm = ARM_AM::getSOImmValImm(V);
+  unsigned Rot = ARM_AM::getSOImmValRot(V);
+  
+  // Print low-level immediate formation info, per
+  // A5.1.3: "Data-processing operands - Immediate".
+  if (Rot) {
+    O << "#" << Imm << ", " << Rot;
+    // Pretty printed version.
+    O << ' ' << TAI->getCommentString() << ' ' << (int)ARM_AM::rotr32(Imm, Rot);
+  } else {
+    O << "#" << Imm;
+  }
+}
+
+/// printSOImmOperand - SOImm is a 4-bit rotate amount in bits 8-11 with an
+/// 8-bit immediate in bits 0-7.
+void ARMAsmPrinter::printSOImmOperand(const MachineInstr *MI, int OpNum) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isImmediate() && "Not a valid so_imm value!");
+  printSOImm(O, MO.getImmedValue(), TAI);
+}
+
+/// printSOImm2PartOperand - SOImm is broken into two pieces using a mov
+/// followed by an orr to materialize it.
+void ARMAsmPrinter::printSOImm2PartOperand(const MachineInstr *MI, int OpNum) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isImmediate() && "Not a valid so_imm value!");
+  unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO.getImmedValue());
+  unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO.getImmedValue());
+  printSOImm(O, ARM_AM::getSOImmVal(V1), TAI);
+  O << "\n\torr";
+  printPredicateOperand(MI, 2);
+  O << " ";
+  printOperand(MI, 0); 
+  O << ", ";
+  printOperand(MI, 0); 
+  O << ", ";
+  printSOImm(O, ARM_AM::getSOImmVal(V2), TAI);
+}
+
+// so_reg is a 3-operand unit corresponding to register forms of the A5.1
+// "Addressing Mode 1 - Data-processing operands" forms.  This includes:
+//    REG 0   0    - e.g. R5
+//    REG REG 0,SH_OPC     - e.g. R5, ROR R3
+//    REG 0   IMM,SH_OPC  - e.g. R5, LSL #3
+void ARMAsmPrinter::printSORegOperand(const MachineInstr *MI, int Op) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+  const MachineOperand &MO3 = MI->getOperand(Op+2);
+
+  assert(MRegisterInfo::isPhysicalRegister(MO1.getReg()));
+  O << TM.getRegisterInfo()->get(MO1.getReg()).Name;
+
+  // Print the shift opc.
+  O << ", "
+    << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImmedValue()))
+    << " ";
+
+  if (MO2.getReg()) {
+    assert(MRegisterInfo::isPhysicalRegister(MO2.getReg()));
+    O << TM.getRegisterInfo()->get(MO2.getReg()).Name;
+    assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+  } else {
+    O << "#" << ARM_AM::getSORegOffset(MO3.getImm());
+  }
+}
+
+void ARMAsmPrinter::printAddrMode2Operand(const MachineInstr *MI, int Op) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+  const MachineOperand &MO3 = MI->getOperand(Op+2);
+
+  if (!MO1.isRegister()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op);
+    return;
+  }
+
+  O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).Name;
+
+  if (!MO2.getReg()) {
+    if (ARM_AM::getAM2Offset(MO3.getImm()))  // Don't print +0.
+      O << ", #"
+        << (char)ARM_AM::getAM2Op(MO3.getImm())
+        << ARM_AM::getAM2Offset(MO3.getImm());
+    O << "]";
+    return;
+  }
+
+  O << ", "
+    << (char)ARM_AM::getAM2Op(MO3.getImm())
+    << TM.getRegisterInfo()->get(MO2.getReg()).Name;
+  
+  if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm()))
+    O << ", "
+      << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImmedValue()))
+      << " #" << ShImm;
+  O << "]";
+}
+
+void ARMAsmPrinter::printAddrMode2OffsetOperand(const MachineInstr *MI, int Op){
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+
+  if (!MO1.getReg()) {
+    unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm());
+    assert(ImmOffs && "Malformed indexed load / store!");
+    O << "#"
+      << (char)ARM_AM::getAM2Op(MO2.getImm())
+      << ImmOffs;
+    return;
+  }
+
+  O << (char)ARM_AM::getAM2Op(MO2.getImm())
+    << TM.getRegisterInfo()->get(MO1.getReg()).Name;
+  
+  if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm()))
+    O << ", "
+      << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImmedValue()))
+      << " #" << ShImm;
+}
+
+void ARMAsmPrinter::printAddrMode3Operand(const MachineInstr *MI, int Op) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+  const MachineOperand &MO3 = MI->getOperand(Op+2);
+  
+  assert(MRegisterInfo::isPhysicalRegister(MO1.getReg()));
+  O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).Name;
+
+  if (MO2.getReg()) {
+    O << ", "
+      << (char)ARM_AM::getAM3Op(MO3.getImm())
+      << TM.getRegisterInfo()->get(MO2.getReg()).Name
+      << "]";
+    return;
+  }
+  
+  if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()))
+    O << ", #"
+      << (char)ARM_AM::getAM3Op(MO3.getImm())
+      << ImmOffs;
+  O << "]";
+}
+
+void ARMAsmPrinter::printAddrMode3OffsetOperand(const MachineInstr *MI, int Op){
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+
+  if (MO1.getReg()) {
+    O << (char)ARM_AM::getAM3Op(MO2.getImm())
+      << TM.getRegisterInfo()->get(MO1.getReg()).Name;
+    return;
+  }
+
+  unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm());
+  assert(ImmOffs && "Malformed indexed load / store!");
+  O << "#"
+    << (char)ARM_AM::getAM3Op(MO2.getImm())
+    << ImmOffs;
+}
+  
+void ARMAsmPrinter::printAddrMode4Operand(const MachineInstr *MI, int Op,
+                                          const char *Modifier) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+  ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm());
+  if (Modifier && strcmp(Modifier, "submode") == 0) {
+    if (MO1.getReg() == ARM::SP) {
+      bool isLDM = (MI->getOpcode() == ARM::LDM ||
+                    MI->getOpcode() == ARM::LDM_RET);
+      O << ARM_AM::getAMSubModeAltStr(Mode, isLDM);
+    } else
+      O << ARM_AM::getAMSubModeStr(Mode);
+  } else {
+    printOperand(MI, Op);
+    if (ARM_AM::getAM4WBFlag(MO2.getImm()))
+      O << "!";
+  }
+}
+
+void ARMAsmPrinter::printAddrMode5Operand(const MachineInstr *MI, int Op,
+                                          const char *Modifier) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+
+  if (!MO1.isRegister()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op);
+    return;
+  }
+  
+  assert(MRegisterInfo::isPhysicalRegister(MO1.getReg()));
+
+  if (Modifier && strcmp(Modifier, "submode") == 0) {
+    ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MO2.getImm());
+    if (MO1.getReg() == ARM::SP) {
+      bool isFLDM = (MI->getOpcode() == ARM::FLDMD ||
+                     MI->getOpcode() == ARM::FLDMS);
+      O << ARM_AM::getAMSubModeAltStr(Mode, isFLDM);
+    } else
+      O << ARM_AM::getAMSubModeStr(Mode);
+    return;
+  } else if (Modifier && strcmp(Modifier, "base") == 0) {
+    // Used for FSTM{D|S} and LSTM{D|S} operations.
+    O << TM.getRegisterInfo()->get(MO1.getReg()).Name;
+    if (ARM_AM::getAM5WBFlag(MO2.getImm()))
+      O << "!";
+    return;
+  }
+  
+  O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).Name;
+  
+  if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) {
+    O << ", #"
+      << (char)ARM_AM::getAM5Op(MO2.getImm())
+      << ImmOffs*4;
+  }
+  O << "]";
+}
+
+void ARMAsmPrinter::printAddrModePCOperand(const MachineInstr *MI, int Op,
+                                           const char *Modifier) {
+  if (Modifier && strcmp(Modifier, "label") == 0) {
+    printPCLabel(MI, Op+1);
+    return;
+  }
+
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  assert(MRegisterInfo::isPhysicalRegister(MO1.getReg()));
+  O << "[pc, +" << TM.getRegisterInfo()->get(MO1.getReg()).Name << "]";
+}
+
+void
+ARMAsmPrinter::printThumbAddrModeRROperand(const MachineInstr *MI, int Op) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+  O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).Name;
+  O << ", " << TM.getRegisterInfo()->get(MO2.getReg()).Name << "]";
+}
+
+void
+ARMAsmPrinter::printThumbAddrModeRI5Operand(const MachineInstr *MI, int Op,
+                                            unsigned Scale) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+  const MachineOperand &MO3 = MI->getOperand(Op+2);
+
+  if (!MO1.isRegister()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op);
+    return;
+  }
+
+  O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).Name;
+  if (MO3.getReg())
+    O << ", " << TM.getRegisterInfo()->get(MO3.getReg()).Name;
+  else if (unsigned ImmOffs = MO2.getImm()) {
+    O << ", #" << ImmOffs;
+    if (Scale > 1)
+      O << " * " << Scale;
+  }
+  O << "]";
+}
+
+void
+ARMAsmPrinter::printThumbAddrModeS1Operand(const MachineInstr *MI, int Op) {
+  printThumbAddrModeRI5Operand(MI, Op, 1);
+}
+void
+ARMAsmPrinter::printThumbAddrModeS2Operand(const MachineInstr *MI, int Op) {
+  printThumbAddrModeRI5Operand(MI, Op, 2);
+}
+void
+ARMAsmPrinter::printThumbAddrModeS4Operand(const MachineInstr *MI, int Op) {
+  printThumbAddrModeRI5Operand(MI, Op, 4);
+}
+
+void ARMAsmPrinter::printThumbAddrModeSPOperand(const MachineInstr *MI,int Op) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+  O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).Name;
+  if (unsigned ImmOffs = MO2.getImm())
+    O << ", #" << ImmOffs << " * 4";
+  O << "]";
+}
+
+void ARMAsmPrinter::printPredicateOperand(const MachineInstr *MI, int opNum) {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(opNum).getImmedValue();
+  if (CC != ARMCC::AL)
+    O << ARMCondCodeToString(CC);
+}
+
+void ARMAsmPrinter::printSBitModifierOperand(const MachineInstr *MI, int opNum){
+  unsigned Reg = MI->getOperand(opNum).getReg();
+  if (Reg) {
+    assert(Reg == ARM::CPSR && "Expect ARM CPSR register!");
+    O << 's';
+  }
+}
+
+void ARMAsmPrinter::printPCLabel(const MachineInstr *MI, int opNum) {
+  int Id = (int)MI->getOperand(opNum).getImmedValue();
+  O << TAI->getPrivateGlobalPrefix() << "PC" << Id;
+}
+
+void ARMAsmPrinter::printRegisterList(const MachineInstr *MI, int opNum) {
+  O << "{";
+  for (unsigned i = opNum, e = MI->getNumOperands(); i != e; ++i) {
+    printOperand(MI, i);
+    if (i != e-1) O << ", ";
+  }
+  O << "}";
+}
+
+void ARMAsmPrinter::printCPInstOperand(const MachineInstr *MI, int OpNo,
+                                       const char *Modifier) {
+  assert(Modifier && "This operand only works with a modifier!");
+  // There are two aspects to a CONSTANTPOOL_ENTRY operand, the label and the
+  // data itself.
+  if (!strcmp(Modifier, "label")) {
+    unsigned ID = MI->getOperand(OpNo).getImm();
+    O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+      << '_' << ID << ":\n";
+  } else {
+    assert(!strcmp(Modifier, "cpentry") && "Unknown modifier for CPE");
+    unsigned CPI = MI->getOperand(OpNo).getConstantPoolIndex();
+
+    const MachineConstantPoolEntry &MCPE =  // Chasing pointers is fun?
+      MI->getParent()->getParent()->getConstantPool()->getConstants()[CPI];
+    
+    if (MCPE.isMachineConstantPoolEntry())
+      EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+    else {
+      EmitGlobalConstant(MCPE.Val.ConstVal);
+      // remember to emit the weak reference
+      if (const GlobalValue *GV = dyn_cast<GlobalValue>(MCPE.Val.ConstVal))
+        if (GV->hasExternalWeakLinkage())
+          ExtWeakSymbols.insert(GV);
+    }
+  }
+}
+
+void ARMAsmPrinter::printJTBlockOperand(const MachineInstr *MI, int OpNo) {
+  const MachineOperand &MO1 = MI->getOperand(OpNo);
+  const MachineOperand &MO2 = MI->getOperand(OpNo+1); // Unique Id
+  unsigned JTI = MO1.getJumpTableIndex();
+  O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+    << '_' << JTI << '_' << MO2.getImmedValue() << ":\n";
+
+  const char *JTEntryDirective = TAI->getJumpTableDirective();
+  if (!JTEntryDirective)
+    JTEntryDirective = TAI->getData32bitsDirective();
+
+  const MachineFunction *MF = MI->getParent()->getParent();
+  MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+  bool UseSet= TAI->getSetDirective() && TM.getRelocationModel() == Reloc::PIC_;
+  std::set<MachineBasicBlock*> JTSets;
+  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = JTBBs[i];
+    if (UseSet && JTSets.insert(MBB).second)
+      printSetLabel(JTI, MO2.getImmedValue(), MBB);
+
+    O << JTEntryDirective << ' ';
+    if (UseSet)
+      O << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+        << '_' << JTI << '_' << MO2.getImmedValue()
+        << "_set_" << MBB->getNumber();
+    else if (TM.getRelocationModel() == Reloc::PIC_) {
+      printBasicBlockLabel(MBB, false, false);
+      // If the arch uses custom Jump Table directives, don't calc relative to JT
+      if (!TAI->getJumpTableDirective()) 
+        O << '-' << TAI->getPrivateGlobalPrefix() << "JTI"
+          << getFunctionNumber() << '_' << JTI << '_' << MO2.getImmedValue();
+    } else
+      printBasicBlockLabel(MBB, false, false);
+    if (i != e-1)
+      O << '\n';
+  }
+}
+
+
+bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                    unsigned AsmVariant, const char *ExtraCode){
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+    
+    switch (ExtraCode[0]) {
+    default: return true;  // Unknown modifier.
+    case 'c': // Don't print "$" before a global var name or constant.
+    case 'P': // Print a VFP double precision register.
+      printOperand(MI, OpNo);
+      return false;
+    case 'Q':
+      if (TM.getTargetData()->isLittleEndian())
+        break;
+      // Fallthrough
+    case 'R':
+      if (TM.getTargetData()->isBigEndian())
+        break;
+      // Fallthrough
+    case 'H': // Write second word of DI / DF reference.  
+      // Verify that this operand has two consecutive registers.
+      if (!MI->getOperand(OpNo).isRegister() ||
+          OpNo+1 == MI->getNumOperands() ||
+          !MI->getOperand(OpNo+1).isRegister())
+        return true;
+      ++OpNo;   // Return the high-part.
+    }
+  }
+  
+  printOperand(MI, OpNo);
+  return false;
+}
+
+void ARMAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+  ++EmittedInsts;
+
+  int Opc = MI->getOpcode();
+  switch (Opc) {
+  case ARM::CONSTPOOL_ENTRY:
+    if (!InCPMode && AFI->isThumbFunction()) {
+      EmitAlignment(2);
+      InCPMode = true;
+    }
+    break;
+  default: {
+    if (InCPMode && AFI->isThumbFunction())
+      InCPMode = false;
+    switch (Opc) {
+    case ARM::PICADD:
+    case ARM::PICLD:
+    case ARM::PICLDZH:
+    case ARM::PICLDZB:
+    case ARM::PICLDH:
+    case ARM::PICLDB:
+    case ARM::PICLDSH:
+    case ARM::PICLDSB:
+    case ARM::PICSTR:
+    case ARM::PICSTRH:
+    case ARM::PICSTRB:
+    case ARM::tPICADD:
+      break;
+    default:
+      O << "\t";
+      break;
+    }
+  }}
+
+  // Call the autogenerated instruction printer routines.
+  printInstruction(MI);
+}
+
+bool ARMAsmPrinter::doInitialization(Module &M) {
+  // Emit initial debug information.
+  DW.BeginModule(&M);
+  
+  AsmPrinter::doInitialization(M);
+
+  // Darwin wants symbols to be quoted if they have complex names.
+  if (Subtarget->isTargetDarwin())
+    Mang->setUseQuotes(true);
+
+  return false;
+}
+
+bool ARMAsmPrinter::doFinalization(Module &M) {
+  const TargetData *TD = TM.getTargetData();
+
+  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+    if (!I->hasInitializer())   // External globals require no code
+      continue;
+
+    if (EmitSpecialLLVMGlobal(I)) {
+      if (Subtarget->isTargetDarwin() &&
+          TM.getRelocationModel() == Reloc::Static) {
+        if (I->getName() == "llvm.global_ctors")
+          O << ".reference .constructors_used\n";
+        else if (I->getName() == "llvm.global_dtors")
+          O << ".reference .destructors_used\n";
+      }
+      continue;
+    }
+
+    std::string name = Mang->getValueName(I);
+    Constant *C = I->getInitializer();
+    const Type *Type = C->getType();
+    unsigned Size = TD->getTypeSize(Type);
+    unsigned Align = TD->getPreferredAlignmentLog(I);
+
+    const char *VisibilityDirective = NULL;
+    if (I->hasHiddenVisibility())
+      VisibilityDirective = TAI->getHiddenDirective();
+    else if (I->hasProtectedVisibility())
+      VisibilityDirective = TAI->getProtectedDirective();
+
+    if (VisibilityDirective)
+      O << VisibilityDirective << name << "\n";
+
+    if (Subtarget->isTargetELF())
+      O << "\t.type " << name << ",%object\n";
+    
+    if (C->isNullValue()) {
+      if (I->hasExternalLinkage()) {
+        if (const char *Directive = TAI->getZeroFillDirective()) {
+          O << "\t.globl\t" << name << "\n";
+          O << Directive << "__DATA__, __common, " << name << ", "
+            << Size << ", " << Align << "\n";
+          continue;
+        }
+      }
+
+      if (!I->hasSection() &&
+          (I->hasInternalLinkage() || I->hasWeakLinkage() ||
+           I->hasLinkOnceLinkage())) {
+        if (Size == 0) Size = 1;   // .comm Foo, 0 is undefined, avoid it.
+        if (!NoZerosInBSS && TAI->getBSSSection())
+          SwitchToDataSection(TAI->getBSSSection(), I);
+        else
+          SwitchToDataSection(TAI->getDataSection(), I);
+        if (TAI->getLCOMMDirective() != NULL) {
+          if (I->hasInternalLinkage()) {
+            O << TAI->getLCOMMDirective() << name << "," << Size;
+            if (Subtarget->isTargetDarwin())
+              O << "," << Align;
+          } else
+            O << TAI->getCOMMDirective()  << name << "," << Size;
+        } else {
+          if (I->hasInternalLinkage())
+            O << "\t.local\t" << name << "\n";
+          O << TAI->getCOMMDirective()  << name << "," << Size;
+          if (TAI->getCOMMDirectiveTakesAlignment())
+            O << "," << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+        }
+        O << "\t\t" << TAI->getCommentString() << " " << I->getName() << "\n";
+        continue;
+      }
+    }
+
+    switch (I->getLinkage()) {
+    case GlobalValue::LinkOnceLinkage:
+    case GlobalValue::WeakLinkage:
+      if (Subtarget->isTargetDarwin()) {
+        O << "\t.globl " << name << "\n"
+          << "\t.weak_definition " << name << "\n";
+        SwitchToDataSection("\t.section __DATA,__const_coal,coalesced", I);
+      } else {
+        std::string SectionName("\t.section\t.llvm.linkonce.d." +
+                                name +
+                                ",\"aw\",%progbits");
+        SwitchToDataSection(SectionName.c_str(), I);
+        O << "\t.weak " << name << "\n";
+      }
+      break;
+    case GlobalValue::AppendingLinkage:
+      // FIXME: appending linkage variables should go into a section of
+      // their name or something.  For now, just emit them as external.
+    case GlobalValue::ExternalLinkage:
+      O << "\t.globl " << name << "\n";
+      // FALL THROUGH
+    case GlobalValue::InternalLinkage: {
+      if (I->isConstant()) {
+        const ConstantArray *CVA = dyn_cast<ConstantArray>(C);
+        if (TAI->getCStringSection() && CVA && CVA->isCString()) {
+          SwitchToDataSection(TAI->getCStringSection(), I);
+          break;
+        }
+      }
+      // FIXME: special handling for ".ctors" & ".dtors" sections
+      if (I->hasSection() &&
+          (I->getSection() == ".ctors" ||
+           I->getSection() == ".dtors")) {
+        assert(!Subtarget->isTargetDarwin());
+        std::string SectionName = ".section " + I->getSection();
+        SectionName += ",\"aw\",%progbits";
+        SwitchToDataSection(SectionName.c_str());
+      } else {
+        if (C->isNullValue() && !NoZerosInBSS && TAI->getBSSSection())
+          SwitchToDataSection(I->isThreadLocal() ? TAI->getTLSBSSSection() :
+                              TAI->getBSSSection(), I);
+        else if (!I->isConstant())
+          SwitchToDataSection(I->isThreadLocal() ? TAI->getTLSDataSection() :
+                              TAI->getDataSection(), I);
+        else if (I->isThreadLocal())
+          SwitchToDataSection(TAI->getTLSDataSection());
+        else {
+          // Read-only data.
+          bool HasReloc = C->ContainsRelocations();
+          if (HasReloc &&
+              Subtarget->isTargetDarwin() &&
+              TM.getRelocationModel() != Reloc::Static)
+            SwitchToDataSection("\t.const_data\n");
+          else if (!HasReloc && Size == 4 &&
+                   TAI->getFourByteConstantSection())
+            SwitchToDataSection(TAI->getFourByteConstantSection(), I);
+          else if (!HasReloc && Size == 8 &&
+                   TAI->getEightByteConstantSection())
+            SwitchToDataSection(TAI->getEightByteConstantSection(), I);
+          else if (!HasReloc && Size == 16 &&
+                   TAI->getSixteenByteConstantSection())
+            SwitchToDataSection(TAI->getSixteenByteConstantSection(), I);
+          else if (TAI->getReadOnlySection())
+            SwitchToDataSection(TAI->getReadOnlySection(), I);
+          else
+            SwitchToDataSection(TAI->getDataSection(), I);
+        }
+      }
+
+      break;
+    }
+    default:
+      assert(0 && "Unknown linkage type!");
+      break;
+    }
+
+    EmitAlignment(Align, I);
+    O << name << ":\t\t\t\t" << TAI->getCommentString() << " " << I->getName()
+      << "\n";
+    if (TAI->hasDotTypeDotSizeDirective())
+      O << "\t.size " << name << ", " << Size << "\n";
+    // If the initializer is an extern weak symbol, remember to emit the weak
+    // reference!
+    if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+      if (GV->hasExternalWeakLinkage())
+        ExtWeakSymbols.insert(GV);
+
+    EmitGlobalConstant(C);
+    O << '\n';
+  }
+
+  if (Subtarget->isTargetDarwin()) {
+    SwitchToDataSection("");
+
+    // Output stubs for dynamically-linked functions
+    unsigned j = 1;
+    for (std::set<std::string>::iterator i = FnStubs.begin(), e = FnStubs.end();
+         i != e; ++i, ++j) {
+      if (TM.getRelocationModel() == Reloc::PIC_)
+        SwitchToTextSection(".section __TEXT,__picsymbolstub4,symbol_stubs,"
+                            "none,16", 0);
+      else
+        SwitchToTextSection(".section __TEXT,__symbol_stub4,symbol_stubs,"
+                            "none,12", 0);
+
+      EmitAlignment(2);
+      O << "\t.code\t32\n";
+
+      O << "L" << *i << "$stub:\n";
+      O << "\t.indirect_symbol " << *i << "\n";
+      O << "\tldr ip, L" << *i << "$slp\n";
+      if (TM.getRelocationModel() == Reloc::PIC_) {
+        O << "L" << *i << "$scv:\n";
+        O << "\tadd ip, pc, ip\n";
+      }
+      O << "\tldr pc, [ip, #0]\n";
+      O << "L" << *i << "$slp:\n";
+      if (TM.getRelocationModel() == Reloc::PIC_)
+        O << "\t.long\tL" << *i << "$lazy_ptr-(L" << *i << "$scv+8)\n";
+      else
+        O << "\t.long\tL" << *i << "$lazy_ptr\n";
+      SwitchToDataSection(".lazy_symbol_pointer", 0);
+      O << "L" << *i << "$lazy_ptr:\n";
+      O << "\t.indirect_symbol " << *i << "\n";
+      O << "\t.long\tdyld_stub_binding_helper\n";
+    }
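+    // Illustrative sketch (assuming a hypothetical stub symbol "_foo" and a
+    // PIC relocation model): the loop above emits roughly
+    //   L_foo$stub:
+    //     .indirect_symbol _foo
+    //     ldr ip, L_foo$slp
+    //   L_foo$scv:
+    //     add ip, pc, ip
+    //     ldr pc, [ip, #0]
+    //   L_foo$slp:
+    //     .long L_foo$lazy_ptr-(L_foo$scv+8)
+    // followed, in the .lazy_symbol_pointer section, by
+    //   L_foo$lazy_ptr:
+    //     .indirect_symbol _foo
+    //     .long dyld_stub_binding_helper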
+    O << "\n";
+
+    // Output non-lazy-pointers for external and common global variables.
+    if (GVNonLazyPtrs.begin() != GVNonLazyPtrs.end())
+      SwitchToDataSection(".non_lazy_symbol_pointer", 0);
+    for (std::set<std::string>::iterator i = GVNonLazyPtrs.begin(),
+           e = GVNonLazyPtrs.end(); i != e; ++i) {
+      O << "L" << *i << "$non_lazy_ptr:\n";
+      O << "\t.indirect_symbol " << *i << "\n";
+      O << "\t.long\t0\n";
+    }
+
+    // Emit final debug information.
+    DW.EndModule();
+
+    // Funny Darwin hack: This flag tells the linker that no global symbols
+    // contain code that falls through to other global symbols (e.g. the obvious
+    // implementation of multiple entry points).  If this doesn't occur, the
+    // linker can safely perform dead code stripping.  Since LLVM never
+    // generates code that does this, it is always safe to set.
+    O << "\t.subsections_via_symbols\n";
+  } else {
+    // Emit final debug information for ELF.
+    DW.EndModule();
+  }
+
+  AsmPrinter::doFinalization(M);
+  return false; // success
+}
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
new file mode 100644
index 0000000..ed1d287
--- /dev/null
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -0,0 +1,92 @@
+//===-- ARM/ARMCodeEmitter.cpp - Convert ARM code to machine code ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Raul Herbster and is distributed under the
+// University of Illinois Open Source License.  See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the ARM machine instructions into
+// relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-emitter"
+#include "ARMInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "ARM.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+  class VISIBILITY_HIDDEN Emitter : public MachineFunctionPass {
+    const ARMInstrInfo  *II;
+    const TargetData    *TD;
+    TargetMachine       &TM;
+    MachineCodeEmitter  &MCE;
+  public:
+    static char ID;
+    explicit Emitter(TargetMachine &tm, MachineCodeEmitter &mce)
+      : MachineFunctionPass((intptr_t)&ID), II(0), TD(0), TM(tm), 
+      MCE(mce) {}
+    Emitter(TargetMachine &tm, MachineCodeEmitter &mce,
+            const ARMInstrInfo &ii, const TargetData &td)
+      : MachineFunctionPass((intptr_t)&ID), II(&ii), TD(&td), TM(tm), 
+      MCE(mce) {}
+
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "ARM Machine Code Emitter";
+    }
+
+    void emitInstruction(const MachineInstr &MI);
+
+  private:
+
+  };
+  char Emitter::ID = 0;
+}
+
+/// createARMCodeEmitterPass - Return a pass that emits the collected ARM code
+/// to the specified MCE object.
+FunctionPass *llvm::createARMCodeEmitterPass(ARMTargetMachine &TM,
+                                             MachineCodeEmitter &MCE) {
+  return new Emitter(TM, MCE);
+}
+
+bool Emitter::runOnMachineFunction(MachineFunction &MF) {
+  assert((MF.getTarget().getRelocationModel() == Reloc::Default ||
+          MF.getTarget().getRelocationModel() == Reloc::Static) &&
+         "JIT relocation model must be set to static or default!");
+  II = ((ARMTargetMachine&)MF.getTarget()).getInstrInfo();
+  TD = ((ARMTargetMachine&)MF.getTarget()).getTargetData();
+
+  do {
+    MCE.startFunction(MF);
+    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); 
+         MBB != E; ++MBB) {
+      MCE.StartMachineBasicBlock(MBB);
+      for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+           I != E; ++I)
+        emitInstruction(*I);
+    }
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+void Emitter::emitInstruction(const MachineInstr &MI) {
+  NumEmitted++;  // Keep track of the # of mi's emitted
+}
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
new file mode 100644
index 0000000..1b93631
--- /dev/null
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -0,0 +1,1277 @@
+//===-- ARMConstantIslandPass.cpp - ARM constant islands --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that splits the constant pool up into 'islands'
+// which are scattered through-out the function.  This is required due to the
+// limited pc-relative displacements that ARM has.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-cp-islands"
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMInstrInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumCPEs,     "Number of constpool entries");
+STATISTIC(NumSplit,    "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed, "Number of cond branches fixed");
+STATISTIC(NumUBrFixed, "Number of uncond branches fixed");
+
+namespace {
+  /// ARMConstantIslands - Due to limited PC-relative displacements, ARM
+  /// requires constant pool entries to be scattered among the instructions
+  /// inside a function.  To do this, it completely ignores the normal LLVM
+  /// constant pool; instead, it places constants wherever it feels like with
+  /// special instructions.
+  ///
+  /// The terminology used in this pass includes:
+  ///   Islands - Clumps of constants placed in the function.
+  ///   Water   - Potential places where an island could be formed.
+  ///   CPE     - A constant pool entry that has been placed somewhere, which
+  ///             tracks a list of users.
+  class VISIBILITY_HIDDEN ARMConstantIslands : public MachineFunctionPass {
+    /// NextUID - Assign unique ID's to CPE's.
+    unsigned NextUID;
+
+    /// BBSizes - The size of each MachineBasicBlock in bytes of code, indexed
+    /// by MBB Number.  The two-byte pads required for Thumb alignment are
+    /// counted as part of the following block (i.e., the offset and size for
+    /// a padded block will both be ==2 mod 4).
+    std::vector<unsigned> BBSizes;
+    
+    /// BBOffsets - the offset of each MBB in bytes, starting from 0.
+    /// The two-byte pads required for Thumb alignment are counted as part of
+    /// the following block.
+    std::vector<unsigned> BBOffsets;
+
+    /// WaterList - A sorted list of basic blocks where islands could be placed
+    /// (i.e. blocks that don't fall through to the following block, due
+    /// to a return, unreachable, or unconditional branch).
+    std::vector<MachineBasicBlock*> WaterList;
+
+    /// CPUser - One user of a constant pool, keeping the machine instruction
+    /// pointer, the constant pool being referenced, and the max displacement
+    /// allowed from the instruction to the CP.
+    struct CPUser {
+      MachineInstr *MI;
+      MachineInstr *CPEMI;
+      unsigned MaxDisp;
+      CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp)
+        : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp) {}
+    };
+    
+    /// CPUsers - Keep track of all of the machine instructions that use various
+    /// constant pools and their max displacement.
+    std::vector<CPUser> CPUsers;
+    
+    /// CPEntry - One per constant pool entry, keeping the machine instruction
+    /// pointer, the constpool index, and the number of CPUser's which
+    /// reference this entry.
+    struct CPEntry {
+      MachineInstr *CPEMI;
+      unsigned CPI;
+      unsigned RefCount;
+      CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
+        : CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
+    };
+
+    /// CPEntries - Keep track of all of the constant pool entry machine
+    /// instructions. For each original constpool index (i.e. those that
+    /// existed upon entry to this pass), it keeps a vector of entries.
+    /// Original elements are cloned as we go along; the clones are
+    /// put in the vector of the original element, but have distinct CPIs.
+    std::vector<std::vector<CPEntry> > CPEntries;
+    
+    /// ImmBranch - One per immediate branch, keeping the machine instruction
+    /// pointer, conditional or unconditional, the max displacement,
+    /// and (if isCond is true) the corresponding unconditional branch
+    /// opcode.
+    struct ImmBranch {
+      MachineInstr *MI;
+      unsigned MaxDisp : 31;
+      bool isCond : 1;
+      int UncondBr;
+      ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr)
+        : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
+    };
+
+    /// ImmBranches - Keep track of all the immediate branch instructions.
+    ///
+    std::vector<ImmBranch> ImmBranches;
+
+    /// PushPopMIs - Keep track of all the Thumb push / pop instructions.
+    ///
+    SmallVector<MachineInstr*, 4> PushPopMIs;
+
+    /// HasFarJump - True if any far jump instruction has been emitted during
+    /// the branch fix up pass.
+    bool HasFarJump;
+
+    const TargetInstrInfo *TII;
+    ARMFunctionInfo *AFI;
+    bool isThumb;
+  public:
+    static char ID;
+    ARMConstantIslands() : MachineFunctionPass((intptr_t)&ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "ARM constant island placement and branch shortening pass";
+    }
+    
+  private:
+    void DoInitialPlacement(MachineFunction &Fn,
+                            std::vector<MachineInstr*> &CPEMIs);
+    CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
+    void InitialFunctionScan(MachineFunction &Fn,
+                             const std::vector<MachineInstr*> &CPEMIs);
+    MachineBasicBlock *SplitBlockBeforeInstr(MachineInstr *MI);
+    void UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB);
+    void AdjustBBOffsetsAfter(MachineBasicBlock *BB, int delta);
+    bool DecrementOldEntry(unsigned CPI, MachineInstr* CPEMI);
+    int LookForExistingCPEntry(CPUser& U, unsigned UserOffset);
+    bool LookForWater(CPUser&U, unsigned UserOffset, 
+                      MachineBasicBlock** NewMBB);
+    MachineBasicBlock* AcceptWater(MachineBasicBlock *WaterBB, 
+                        std::vector<MachineBasicBlock*>::iterator IP);
+    void CreateNewWater(unsigned CPUserIndex, unsigned UserOffset,
+                      MachineBasicBlock** NewMBB);
+    bool HandleConstantPoolUser(MachineFunction &Fn, unsigned CPUserIndex);
+    void RemoveDeadCPEMI(MachineInstr *CPEMI);
+    bool RemoveUnusedCPEntries();
+    bool CPEIsInRange(MachineInstr *MI, unsigned UserOffset, 
+                      MachineInstr *CPEMI, unsigned Disp,
+                      bool DoDump);
+    bool WaterIsInRange(unsigned UserOffset, MachineBasicBlock *Water,
+                        CPUser &U);
+    bool OffsetIsInRange(unsigned UserOffset, unsigned TrialOffset,
+                        unsigned Disp, bool NegativeOK);
+    bool BBIsInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+    bool FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br);
+    bool FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br);
+    bool FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br);
+    bool UndoLRSpillRestore();
+
+    unsigned GetOffsetOf(MachineInstr *MI) const;
+    void dumpBBs();
+    void verify(MachineFunction &Fn);
+  };
+  char ARMConstantIslands::ID = 0;
+}
+
+/// verify - check BBOffsets, BBSizes, alignment of islands
+void ARMConstantIslands::verify(MachineFunction &Fn) {
+  assert(BBOffsets.size() == BBSizes.size());
+  for (unsigned i = 1, e = BBOffsets.size(); i != e; ++i)
+    assert(BBOffsets[i-1]+BBSizes[i-1] == BBOffsets[i]);
+  if (isThumb) {
+    for (MachineFunction::iterator MBBI = Fn.begin(), E = Fn.end();
+         MBBI != E; ++MBBI) {
+      MachineBasicBlock *MBB = MBBI;
+      if (!MBB->empty() &&
+          MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY)
+        assert((BBOffsets[MBB->getNumber()]%4 == 0 &&
+                BBSizes[MBB->getNumber()]%4 == 0) ||
+               (BBOffsets[MBB->getNumber()]%4 != 0 &&
+                BBSizes[MBB->getNumber()]%4 != 0));
+    }
+  }
+}
+
+/// print block size and offset information - debugging
+void ARMConstantIslands::dumpBBs() {
+  for (unsigned J = 0, E = BBOffsets.size(); J !=E; ++J) {
+    DOUT << "block " << J << " offset " << BBOffsets[J] << 
+                            " size " << BBSizes[J] << "\n";
+  }
+}
+
+/// createARMConstantIslandPass - returns an instance of the constpool
+/// island pass.
+FunctionPass *llvm::createARMConstantIslandPass() {
+  return new ARMConstantIslands();
+}
+
+bool ARMConstantIslands::runOnMachineFunction(MachineFunction &Fn) {
+  MachineConstantPool &MCP = *Fn.getConstantPool();
+  
+  TII = Fn.getTarget().getInstrInfo();
+  AFI = Fn.getInfo<ARMFunctionInfo>();
+  isThumb = AFI->isThumbFunction();
+
+  HasFarJump = false;
+
+  // Renumber all of the machine basic blocks in the function, guaranteeing that
+  // the numbers agree with the position of the block in the function.
+  Fn.RenumberBlocks();
+
+  // Thumb functions containing constant pools get 2-byte alignment.  This is
+  // so we can keep exact track of where the alignment padding goes.  Set default.
+  AFI->setAlign(isThumb ? 1U : 2U);
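+  // Note (an assumption, not stated in this file): the alignment value is a
+  // log2 byte count, so 1U means 2-byte and 2U means 4-byte alignment.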
+
+  // Perform the initial placement of the constant pool entries.  To start with,
+  // we put them all at the end of the function.
+  std::vector<MachineInstr*> CPEMIs;
+  if (!MCP.isEmpty()) {
+    DoInitialPlacement(Fn, CPEMIs);
+    if (isThumb)
+      AFI->setAlign(2U);
+  }
+  
+  // The next UID to take is the first unused one.
+  NextUID = CPEMIs.size();
+  
+  // Do the initial scan of the function, building up information about the
+  // sizes of each block, the location of all the water, and finding all of the
+  // constant pool users.
+  InitialFunctionScan(Fn, CPEMIs);
+  CPEMIs.clear();
+  
+  // Remove dead constant pool entries.
+  RemoveUnusedCPEntries();
+
+  // Iteratively place constant pool entries and fix up branches until there
+  // is no change.
+  bool MadeChange = false;
+  while (true) {
+    bool Change = false;
+    for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
+      Change |= HandleConstantPoolUser(Fn, i);
+    DEBUG(dumpBBs());
+    for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
+      Change |= FixUpImmediateBr(Fn, ImmBranches[i]);
+    DEBUG(dumpBBs());
+    if (!Change)
+      break;
+    MadeChange = true;
+  }
+
+  // After a while, this might be made debug-only, but it is not expensive.
+  verify(Fn);
+
+  // If LR has been force-spilled and no far jump (i.e. BL) has been issued,
+  // undo the spill / restore of LR if possible.
+  if (!HasFarJump && AFI->isLRSpilledForFarJump() && isThumb)
+    MadeChange |= UndoLRSpillRestore();
+
+  BBSizes.clear();
+  BBOffsets.clear();
+  WaterList.clear();
+  CPUsers.clear();
+  CPEntries.clear();
+  ImmBranches.clear();
+  PushPopMIs.clear();
+
+  return MadeChange;
+}
+
+/// DoInitialPlacement - Perform the initial placement of the constant pool
+/// entries.  To start with, we put them all at the end of the function.
+void ARMConstantIslands::DoInitialPlacement(MachineFunction &Fn,
+                                        std::vector<MachineInstr*> &CPEMIs){
+  // Create the basic block to hold the CPE's.
+  MachineBasicBlock *BB = new MachineBasicBlock();
+  Fn.getBasicBlockList().push_back(BB);
+  
+  // Add all of the constants from the constant pool to the end block, use an
+  // identity mapping of CPI's to CPE's.
+  const std::vector<MachineConstantPoolEntry> &CPs =
+    Fn.getConstantPool()->getConstants();
+  
+  const TargetData &TD = *Fn.getTarget().getTargetData();
+  for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
+    unsigned Size = TD.getTypeSize(CPs[i].getType());
+    // Verify that all constant pool entries are a multiple of 4 bytes.  If not,
+    // we would have to pad them out or something so that instructions stay
+    // aligned.
+    assert((Size & 3) == 0 && "CP Entry not multiple of 4 bytes!");
+    MachineInstr *CPEMI =
+      BuildMI(BB, TII->get(ARM::CONSTPOOL_ENTRY))
+                           .addImm(i).addConstantPoolIndex(i).addImm(Size);
+    CPEMIs.push_back(CPEMI);
+
+    // Add a new CPEntry, but no corresponding CPUser yet.
+    std::vector<CPEntry> CPEs;
+    CPEs.push_back(CPEntry(CPEMI, i));
+    CPEntries.push_back(CPEs);
+    NumCPEs++;
+    DOUT << "Moved CPI#" << i << " to end of function as #" << i << "\n";
+  }
+}
+
+/// BBHasFallthrough - Return true if the specified basic block can fallthrough
+/// into the block immediately after it.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+  // Get the next machine basic block in the function.
+  MachineFunction::iterator MBBI = MBB;
+  if (next(MBBI) == MBB->getParent()->end())  // Can't fall off end of function.
+    return false;
+  
+  MachineBasicBlock *NextBB = next(MBBI);
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I)
+    if (*I == NextBB)
+      return true;
+  
+  return false;
+}
+
+/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI,
+/// look up the corresponding CPEntry.
+ARMConstantIslands::CPEntry
+*ARMConstantIslands::findConstPoolEntry(unsigned CPI,
+                                        const MachineInstr *CPEMI) {
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  // Number of entries per constpool index should be small, just do a
+  // linear search.
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    if (CPEs[i].CPEMI == CPEMI)
+      return &CPEs[i];
+  }
+  return NULL;
+}
+
+/// InitialFunctionScan - Do the initial scan of the function, building up
+/// information about the sizes of each block, the location of all the water,
+/// and finding all of the constant pool users.
+void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn,
+                                 const std::vector<MachineInstr*> &CPEMIs) {
+  unsigned Offset = 0;
+  for (MachineFunction::iterator MBBI = Fn.begin(), E = Fn.end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock &MBB = *MBBI;
+    
+    // If this block doesn't fall through into the next MBB, then this is
+    // 'water' where a constant pool island could be placed.
+    if (!BBHasFallthrough(&MBB))
+      WaterList.push_back(&MBB);
+    
+    unsigned MBBSize = 0;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      // Add instruction size to MBBSize.
+      MBBSize += ARM::GetInstSize(I);
+
+      int Opc = I->getOpcode();
+      if (TII->isBranch(Opc)) {
+        bool isCond = false;
+        unsigned Bits = 0;
+        unsigned Scale = 1;
+        int UOpc = Opc;
+        switch (Opc) {
+        case ARM::tBR_JTr:
+          // A Thumb table jump may involve padding; for the offsets to
+          // be right, functions containing these must be 4-byte aligned.
+          AFI->setAlign(2U);
+          if ((Offset+MBBSize)%4 != 0)
+            MBBSize += 2;           // padding
+          continue;   // Does not get an entry in ImmBranches
+        default:
+          continue;  // Ignore other JT branches
+        case ARM::Bcc:
+          isCond = true;
+          UOpc = ARM::B;
+          // Fallthrough
+        case ARM::B:
+          Bits = 24;
+          Scale = 4;
+          break;
+        case ARM::tBcc:
+          isCond = true;
+          UOpc = ARM::tB;
+          Bits = 8;
+          Scale = 2;
+          break;
+        case ARM::tB:
+          Bits = 11;
+          Scale = 2;
+          break;
+        }
+
+        // Record this immediate branch.
+        unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
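+        // Worked example (illustrative): ARM::B has Bits=24, Scale=4, so
+        // MaxOffs = ((1<<23)-1)*4 = 33554428 bytes (just under 32MB); ARM::tB
+        // has Bits=11, Scale=2, so MaxOffs = ((1<<10)-1)*2 = 2046 bytes.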
+        ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc));
+      }
+
+      if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET)
+        PushPopMIs.push_back(I);
+
+      // Scan the instructions for constant pool operands.
+      for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
+        if (I->getOperand(op).isConstantPoolIndex()) {
+          // We found one.  The addressing mode tells us the max displacement
+          // from the PC that this instruction permits.
+          
+          // Basic size info comes from the TSFlags field.
+          unsigned Bits = 0;
+          unsigned Scale = 1;
+          unsigned TSFlags = I->getInstrDescriptor()->TSFlags;
+          switch (TSFlags & ARMII::AddrModeMask) {
+          default: 
+            // Constant pool entries can reach anything.
+            if (I->getOpcode() == ARM::CONSTPOOL_ENTRY)
+              continue;
+            if (I->getOpcode() == ARM::tLEApcrel) {
+              Bits = 8;  // Taking the address of a CP entry.
+              break;
+            }
+            assert(0 && "Unknown addressing mode for CP reference!");
+          case ARMII::AddrMode1: // AM1: 8 bits << 2
+            Bits = 8;
+            Scale = 4;  // Taking the address of a CP entry.
+            break;
+          case ARMII::AddrMode2:
+            Bits = 12;  // +-offset_12
+            break;
+          case ARMII::AddrMode3:
+            Bits = 8;   // +-offset_8
+            break;
+            // addrmode4 has no immediate offset.
+          case ARMII::AddrMode5:
+            Bits = 8;
+            Scale = 4;  // +-(offset_8*4)
+            break;
+          case ARMII::AddrModeT1:
+            Bits = 5;  // +offset_5
+            break;
+          case ARMII::AddrModeT2:
+            Bits = 5;
+            Scale = 2;  // +(offset_5*2)
+            break;
+          case ARMII::AddrModeT4:
+            Bits = 5;
+            Scale = 4;  // +(offset_5*4)
+            break;
+          case ARMII::AddrModeTs:
+            Bits = 8;
+            Scale = 4;  // +(offset_8*4)
+            break;
+          }
+
+          // Remember that this is a user of a CP entry.
+          unsigned CPI = I->getOperand(op).getConstantPoolIndex();
+          MachineInstr *CPEMI = CPEMIs[CPI];
+          unsigned MaxOffs = ((1 << Bits)-1) * Scale;          
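+          // Worked example (illustrative): AddrMode2 has Bits=12, Scale=1, so
+          // MaxOffs = 4095 bytes; AddrModeTs has Bits=8, Scale=4, so
+          // MaxOffs = ((1<<8)-1)*4 = 1020 bytes.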
+          CPUsers.push_back(CPUser(I, CPEMI, MaxOffs));
+
+          // Increment corresponding CPEntry reference count.
+          CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+          assert(CPE && "Cannot find a corresponding CPEntry!");
+          CPE->RefCount++;
+          
+          // Instructions can only use one CP entry, don't bother scanning the
+          // rest of the operands.
+          break;
+        }
+    }
+
+    // In Thumb mode, if this block is a constpool island, we may need padding
+    // so it's aligned on a 4-byte boundary.
+    if (isThumb &&
+        !MBB.empty() &&
+        MBB.begin()->getOpcode() == ARM::CONSTPOOL_ENTRY &&
+        (Offset%4) != 0)
+      MBBSize += 2;
+
+    BBSizes.push_back(MBBSize);
+    BBOffsets.push_back(Offset);
+    Offset += MBBSize;
+  }
+}
+
+/// GetOffsetOf - Return the current offset of the specified machine instruction
+/// from the start of the function.  This offset changes as stuff is moved
+/// around inside the function.
+unsigned ARMConstantIslands::GetOffsetOf(MachineInstr *MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+  
+  // The offset is composed of two things: the sum of the sizes of all MBB's
+  // before this instruction's block, and the offset from the start of the block
+  // it is in.
+  unsigned Offset = BBOffsets[MBB->getNumber()];
+
+  // If we're looking for a CONSTPOOL_ENTRY in Thumb, see if this block has
+  // alignment padding, and compensate if so.
+  if (isThumb && 
+      MI->getOpcode() == ARM::CONSTPOOL_ENTRY && 
+      Offset%4 != 0)
+    Offset += 2;
+
+  // Sum instructions before MI in MBB.
+  for (MachineBasicBlock::iterator I = MBB->begin(); ; ++I) {
+    assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+    if (&*I == MI) return Offset;
+    Offset += ARM::GetInstSize(I);
+  }
+}
+
+/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB
+/// ID.
+static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
+                              const MachineBasicBlock *RHS) {
+  return LHS->getNumber() < RHS->getNumber();
+}
+
+/// UpdateForInsertedWaterBlock - When a block is newly inserted into the
+/// machine function, it upsets all of the block numbers.  Renumber the blocks
+/// and update the arrays that parallel this numbering.
+void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
+  // Renumber the MBB's to keep them consecutive.
+  NewBB->getParent()->RenumberBlocks(NewBB);
+  
+  // Insert a size into BBSizes to align it properly with the (newly
+  // renumbered) block numbers.
+  BBSizes.insert(BBSizes.begin()+NewBB->getNumber(), 0);
+
+  // Likewise for BBOffsets.
+  BBOffsets.insert(BBOffsets.begin()+NewBB->getNumber(), 0);
+  
+  // Next, update WaterList.  Specifically, we need to add NewBB as having
+  // available water after it.
+  std::vector<MachineBasicBlock*>::iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
+                     CompareMBBNumbers);
+  WaterList.insert(IP, NewBB);
+}
+
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch.  Update datastructures and renumber blocks to
+/// account for this change and returns the newly created block.
+MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
+  MachineBasicBlock *OrigBB = MI->getParent();
+
+  // Create a new MBB for the code after the OrigBB.
+  MachineBasicBlock *NewBB = new MachineBasicBlock(OrigBB->getBasicBlock());
+  MachineFunction::iterator MBBI = OrigBB; ++MBBI;
+  OrigBB->getParent()->getBasicBlockList().insert(MBBI, NewBB);
+  
+  // Splice the instructions starting with MI over to NewBB.
+  NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+  
+  // Add an unconditional branch from OrigBB to NewBB.
+  // Note the new unconditional branch is not being recorded.
+  BuildMI(OrigBB, TII->get(isThumb ? ARM::tB : ARM::B)).addMBB(NewBB);
+  NumSplit++;
+  
+  // Update the CFG.  All succs of OrigBB are now succs of NewBB.
+  while (!OrigBB->succ_empty()) {
+    MachineBasicBlock *Succ = *OrigBB->succ_begin();
+    OrigBB->removeSuccessor(Succ);
+    NewBB->addSuccessor(Succ);
+    
+    // This pass should be run after register allocation, so there should be no
+    // PHI nodes to update.
+    assert((Succ->empty() || Succ->begin()->getOpcode() != TargetInstrInfo::PHI)
+           && "PHI nodes should be eliminated by now!");
+  }
+  
+  // OrigBB branches to NewBB.
+  OrigBB->addSuccessor(NewBB);
+  
+  // Update internal data structures to account for the newly inserted MBB.
+  // This is almost the same as UpdateForInsertedWaterBlock, except that
+  // the Water goes after OrigBB, not NewBB.
+  NewBB->getParent()->RenumberBlocks(NewBB);
+  
+  // Insert a size into BBSizes to align it properly with the (newly
+  // renumbered) block numbers.
+  BBSizes.insert(BBSizes.begin()+NewBB->getNumber(), 0);
+  
+  // Likewise for BBOffsets.
+  BBOffsets.insert(BBOffsets.begin()+NewBB->getNumber(), 0);
+
+  // Next, update WaterList.  Specifically, we need to add OrigBB as having
+  // available water after it (but not if it's already there, which happens
+  // when splitting before a conditional branch that is followed by an
+  // unconditional branch - in that case we want to insert NewBB).
+  std::vector<MachineBasicBlock*>::iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
+                     CompareMBBNumbers);
+  MachineBasicBlock* WaterBB = *IP;
+  if (WaterBB == OrigBB)
+    WaterList.insert(next(IP), NewBB);
+  else
+    WaterList.insert(IP, OrigBB);
+
+  // Figure out how large NewBB is.  (It cannot
+  // contain a constpool_entry or tablejump.)
+  unsigned NewBBSize = 0;
+  for (MachineBasicBlock::iterator I = NewBB->begin(), E = NewBB->end();
+       I != E; ++I)
+    NewBBSize += ARM::GetInstSize(I);
+  
+  unsigned OrigBBI = OrigBB->getNumber();
+  unsigned NewBBI = NewBB->getNumber();
+  // Set the size of NewBB in BBSizes.
+  BBSizes[NewBBI] = NewBBSize;
+  
+  // We removed instructions from OrigBB, subtract that off from its size.
+  // Add 2 or 4 to the block to count the unconditional branch we added to it.
+  unsigned delta = isThumb ? 2 : 4;
+  BBSizes[OrigBBI] -= NewBBSize - delta;
+
+  // ...and adjust BBOffsets for NewBB accordingly.
+  BBOffsets[NewBBI] = BBOffsets[OrigBBI] + BBSizes[OrigBBI];
+
+  // All BBOffsets following these blocks must be modified.
+  AdjustBBOffsetsAfter(NewBB, delta);
+
+  return NewBB;
+}
+
+/// OffsetIsInRange - Checks whether UserOffset (the location of a constant pool
+/// reference) is within MaxDisp of TrialOffset (a proposed location of a 
+/// constant pool entry).
+bool ARMConstantIslands::OffsetIsInRange(unsigned UserOffset, 
+                      unsigned TrialOffset, unsigned MaxDisp, bool NegativeOK) {
+  // On Thumb offsets==2 mod 4 are rounded down by the hardware for 
+  // purposes of the displacement computation; compensate for that here.  
+  // Effectively, the valid range of displacements is 2 bytes smaller for such
+  // references.
+  if (isThumb && UserOffset%4 !=0)
+    UserOffset -= 2;
+  // CPEs will be rounded up to a multiple of 4.
+  if (isThumb && TrialOffset%4 != 0)
+    TrialOffset += 2;
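+  // Illustrative example: on Thumb, UserOffset==6 (==2 mod 4) is treated as 4,
+  // and a proposed TrialOffset==10 is treated as 12, matching the rounding the
+  // hardware and the CPE alignment will apply.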
+
+  if (UserOffset <= TrialOffset) {
+    // User before the Trial.
+    if (TrialOffset-UserOffset <= MaxDisp)
+      return true;
+  } else if (NegativeOK) {
+    if (UserOffset-TrialOffset <= MaxDisp)
+      return true;
+  }
+  return false;
+}
+
+/// WaterIsInRange - Returns true if a CPE placed after the specified
+/// Water (a basic block) will be in range for the specific MI.
+
+bool ARMConstantIslands::WaterIsInRange(unsigned UserOffset,
+                         MachineBasicBlock* Water, CPUser &U)
+{
+  unsigned MaxDisp = U.MaxDisp;
+  MachineFunction::iterator I = next(MachineFunction::iterator(Water));
+  unsigned CPEOffset = BBOffsets[Water->getNumber()] + 
+                       BBSizes[Water->getNumber()];
+
+  // If the CPE is to be inserted before the instruction, that will raise
+  // the offset of the instruction.  (Currently applies only to ARM, so
+  // no alignment compensation attempted here.)
+  if (CPEOffset < UserOffset)
+    UserOffset += U.CPEMI->getOperand(2).getImm();
+
+  return OffsetIsInRange (UserOffset, CPEOffset, MaxDisp, !isThumb);
+}
+
+/// CPEIsInRange - Returns true if the distance between specific MI and
+/// specific ConstPool entry instruction can fit in MI's displacement field.
+bool ARMConstantIslands::CPEIsInRange(MachineInstr *MI, unsigned UserOffset,
+                                      MachineInstr *CPEMI,
+                                      unsigned MaxDisp, bool DoDump) {
+  unsigned CPEOffset  = GetOffsetOf(CPEMI);
+  assert(CPEOffset%4 == 0 && "Misaligned CPE");
+
+  if (DoDump) {
+    DOUT << "User of CPE#" << CPEMI->getOperand(0).getImm()
+         << " max delta=" << MaxDisp
+         << " insn address=" << UserOffset
+         << " CPE address=" << CPEOffset
+         << " offset=" << int(CPEOffset-UserOffset) << "\t" << *MI;
+  }
+
+  return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, !isThumb);
+}
+
+/// BBIsJumpedOver - Return true if the specified basic block's only predecessor
+/// unconditionally branches to its only successor.
+static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
+  if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+    return false;
+
+  MachineBasicBlock *Succ = *MBB->succ_begin();
+  MachineBasicBlock *Pred = *MBB->pred_begin();
+  MachineInstr *PredMI = &Pred->back();
+  if (PredMI->getOpcode() == ARM::B || PredMI->getOpcode() == ARM::tB)
+    return PredMI->getOperand(0).getMBB() == Succ;
+  return false;
+}
+
+void ARMConstantIslands::AdjustBBOffsetsAfter(MachineBasicBlock *BB, 
+                                              int delta) {
+  MachineFunction::iterator MBBI = BB; MBBI = next(MBBI);
+  for(unsigned i=BB->getNumber()+1; i<BB->getParent()->getNumBlockIDs(); i++) {
+    BBOffsets[i] += delta;
+    // If some existing blocks have padding, adjust the padding as needed; this
+    // is a bit tricky because delta can be negative, so don't use % on it.
+    if (isThumb) {
+      MachineBasicBlock *MBB = MBBI;
+      if (!MBB->empty()) {
+        // Constant pool entries require padding.
+        if (MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) {
+          unsigned oldOffset = BBOffsets[i] - delta;
+          if (oldOffset%4==0 && BBOffsets[i]%4!=0) {
+            // add new padding
+            BBSizes[i] += 2;
+            delta += 2;
+          } else if (oldOffset%4!=0 && BBOffsets[i]%4==0) {
+            // remove existing padding
+            BBSizes[i] -=2;
+            delta -= 2;
+          }
+        }
+        // Thumb jump tables require padding.  They should be at the end;
+        // following unconditional branches are removed by AnalyzeBranch.
+        MachineInstr *ThumbJTMI = NULL;
+        if (prior(MBB->end())->getOpcode() == ARM::tBR_JTr)
+          ThumbJTMI = prior(MBB->end());
+        if (ThumbJTMI) {
+          unsigned newMIOffset = GetOffsetOf(ThumbJTMI);
+          unsigned oldMIOffset = newMIOffset - delta;
+          if (oldMIOffset%4 == 0 && newMIOffset%4 != 0) {
+            // remove existing padding
+            BBSizes[i] -= 2;
+            delta -= 2;
+          } else if (oldMIOffset%4 != 0 && newMIOffset%4 == 0) {
+            // add new padding
+            BBSizes[i] += 2;
+            delta += 2;
+          }
+        }
+        if (delta==0)
+          return;
+      }
+      MBBI = next(MBBI);
+    }
+  }
+}
+
+/// DecrementOldEntry - find the constant pool entry with index CPI
+/// and instruction CPEMI, and decrement its refcount.  If the refcount
+/// becomes 0 remove the entry and instruction.  Returns true if we removed 
+/// the entry, false if we didn't.
+
+bool ARMConstantIslands::DecrementOldEntry(unsigned CPI, MachineInstr *CPEMI) {
+  // Find the old entry. Eliminate it if it is no longer used.
+  CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+  assert(CPE && "Unexpected!");
+  if (--CPE->RefCount == 0) {
+    RemoveDeadCPEMI(CPEMI);
+    CPE->CPEMI = NULL;
+    NumCPEs--;
+    return true;
+  }
+  return false;
+}
+
+/// LookForExistingCPEntry - see if the currently referenced CPE is in range;
+/// if not, see if an existing clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone.  Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset)
+{
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+
+  // Check to see if the CPE is already in-range.
+  if (CPEIsInRange(UserMI, UserOffset, CPEMI, U.MaxDisp, true)) {
+    DOUT << "In range\n";
+    return 1;
+  }
+
+  // No.  Look for previously created clones of the CPE that are in range.
+  unsigned CPI = CPEMI->getOperand(1).getConstantPoolIndex();
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    // We already tried this one
+    if (CPEs[i].CPEMI == CPEMI)
+      continue;
+    // Removing CPEs can leave empty entries, skip
+    if (CPEs[i].CPEMI == NULL)
+      continue;
+    if (CPEIsInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.MaxDisp, false)) {
+      DOUT << "Replacing CPE#" << CPI << " with CPE#" << CPEs[i].CPI << "\n";
+      // Point the CPUser node to the replacement
+      U.CPEMI = CPEs[i].CPEMI;
+      // Change the CPI in the instruction operand to refer to the clone.
+      for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+        if (UserMI->getOperand(j).isConstantPoolIndex()) {
+          UserMI->getOperand(j).setConstantPoolIndex(CPEs[i].CPI);
+          break;
+        }
+      // Adjust the refcount of the clone...
+      CPEs[i].RefCount++;
+      // ...and the original.  If we didn't remove the old entry, none of the
+      // addresses changed, so we don't need another pass.
+      return DecrementOldEntry(CPI, CPEMI) ? 2 : 1;
+    }
+  }
+  return 0;
+}
+
+/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in
+/// the specific unconditional branch instruction.
+static inline unsigned getUnconditionalBrDisp(int Opc) {
+  return (Opc == ARM::tB) ? ((1<<10)-1)*2 : ((1<<23)-1)*4;
+}
+
+/// AcceptWater - Small amount of common code factored out of the following.
+
+MachineBasicBlock* ARMConstantIslands::AcceptWater(MachineBasicBlock *WaterBB, 
+                          std::vector<MachineBasicBlock*>::iterator IP) {
+  DOUT << "found water in range\n";
+  // Remove the original WaterList entry; we want subsequent
+  // insertions in this vicinity to go after the one we're
+  // about to insert.  This considerably reduces the number
+  // of times we have to move the same CPE more than once.
+  WaterList.erase(IP);
+  // CPE goes before following block (NewMBB).
+  return next(MachineFunction::iterator(WaterBB));
+}
+
+/// LookForWater - look for an existing entry in the WaterList in which
+/// we can place the CPE referenced from U so it's within range of U's MI.
+/// Returns true if found, false if not.  If it returns true, *NewMBB
+/// is set to the WaterList entry.
+/// For ARM, we prefer the water that's farthest away.  For Thumb, prefer
+/// water that will not introduce padding to water that will; within each
+/// group, prefer the water that's farthest away.
+
+bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset,
+                                      MachineBasicBlock** NewMBB) {
+  std::vector<MachineBasicBlock*>::iterator IPThatWouldPad;
+  MachineBasicBlock* WaterBBThatWouldPad = NULL;
+  if (!WaterList.empty()) {
+    for (std::vector<MachineBasicBlock*>::iterator IP = prior(WaterList.end()),
+        B = WaterList.begin();; --IP) {
+      MachineBasicBlock* WaterBB = *IP;
+      if (WaterIsInRange(UserOffset, WaterBB, U)) {
+        if (isThumb &&
+            (BBOffsets[WaterBB->getNumber()] + 
+             BBSizes[WaterBB->getNumber()])%4 != 0) {
+          // This is valid Water, but would introduce padding.  Remember
+          // it in case we don't find any Water that doesn't do this.
+          if (!WaterBBThatWouldPad) {
+            WaterBBThatWouldPad = WaterBB;
+            IPThatWouldPad = IP;
+          }
+        } else {
+          *NewMBB = AcceptWater(WaterBB, IP);
+          return true;
+        }
+      }
+      if (IP == B)
+        break;
+    }
+  }
+  if (isThumb && WaterBBThatWouldPad) {
+    *NewMBB = AcceptWater(WaterBBThatWouldPad, IPThatWouldPad);
+    return true;
+  }
+  return false;
+}
+
+/// CreateNewWater - No existing WaterList entry will work for 
+/// CPUsers[CPUserIndex], so create a place to put the CPE.  The end of the
+/// block is used if in range, and the conditional branch munged so control
+/// flow is correct.  Otherwise the block is split to create a hole with an
+/// unconditional branch around it.  In either case *NewMBB is set to a
+/// block following which the new island can be inserted (the WaterList
+/// is not adjusted).
+
+void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, 
+                        unsigned UserOffset, MachineBasicBlock** NewMBB) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  MachineBasicBlock *UserMBB = UserMI->getParent();
+  unsigned OffsetOfNextBlock = BBOffsets[UserMBB->getNumber()] + 
+                               BBSizes[UserMBB->getNumber()];
+  assert(OffsetOfNextBlock== BBOffsets[UserMBB->getNumber()+1]);
+
+  // If the use is at the end of the block, or the end of the block
+  // is within range, make new water there.  (The addition below is
+  // for the unconditional branch we will be adding:  4 bytes on ARM,
+  // 2 on Thumb.  Possible Thumb alignment padding is accounted for
+  // inside OffsetIsInRange.
+  // If the block ends in an unconditional branch already, it is water, 
+  // and is known to be out of range, so we'll always be adding a branch.)
+  if (&UserMBB->back() == UserMI ||
+      OffsetIsInRange(UserOffset, OffsetOfNextBlock + (isThumb ? 2: 4),
+           U.MaxDisp, !isThumb)) {
+    DOUT << "Split at end of block\n";
+    if (&UserMBB->back() == UserMI)
+      assert(BBHasFallthrough(UserMBB) && "Expected a fallthrough BB!");
+    *NewMBB = next(MachineFunction::iterator(UserMBB));
+    // Add an unconditional branch from UserMBB to fallthrough block.
+    // Record it for branch lengthening; this new branch will not get out of
+    // range, but if the preceding conditional branch is out of range, the
+    // targets will be exchanged, and the altered branch may be out of
+    // range, so the machinery has to know about it.
+    int UncondBr = isThumb ? ARM::tB : ARM::B;
+    BuildMI(UserMBB, TII->get(UncondBr)).addMBB(*NewMBB);
+    unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
+    ImmBranches.push_back(ImmBranch(&UserMBB->back(), 
+                          MaxDisp, false, UncondBr));
+    int delta = isThumb ? 2 : 4;
+    BBSizes[UserMBB->getNumber()] += delta;
+    AdjustBBOffsetsAfter(UserMBB, delta);
+  } else {
+    // What a big block.  Find a place within the block to split it.
+    // This is a little tricky on Thumb since instructions are 2 bytes
+    // and constant pool entries are 4 bytes: if instruction I references
+    // island CPE, and instruction I+1 references CPE', it will
+    // not work well to put CPE as far forward as possible, since then
+    // CPE' cannot immediately follow it (that location is 2 bytes
+    // farther away from I+1 than CPE was from I) and we'd need to create
+    // a new island.  So, we make a first guess, then walk through the
+    // instructions between the one currently being looked at and the
+    // possible insertion point, and make sure any other instructions
+    // that reference CPEs will be able to use the same island area;
+    // if not, we back up the insertion point.
+
+    // The 4 in the following is for the unconditional branch we'll be
+    // inserting (allows for long branch on Thumb).  Alignment of the
+    // island is handled inside OffsetIsInRange.
+    unsigned BaseInsertOffset = UserOffset + U.MaxDisp -4;
+    // This could point off the end of the block if we've already got
+    // constant pool entries following this block; only the last one is
+    // in the water list.  Back past any possible branches (allow for a
+    // conditional and a maximally long unconditional).
+    if (BaseInsertOffset >= BBOffsets[UserMBB->getNumber()+1])
+      BaseInsertOffset = BBOffsets[UserMBB->getNumber()+1] - 
+                              (isThumb ? 6 : 8);
+    unsigned EndInsertOffset = BaseInsertOffset +
+           CPEMI->getOperand(2).getImm();
+    MachineBasicBlock::iterator MI = UserMI;
+    ++MI;
+    unsigned CPUIndex = CPUserIndex+1;
+    for (unsigned Offset = UserOffset+ARM::GetInstSize(UserMI);
+         Offset < BaseInsertOffset;
+         Offset += ARM::GetInstSize(MI),
+            MI = next(MI)) {
+      if (CPUIndex < CPUsers.size() && CPUsers[CPUIndex].MI == MI) {
+        if (!OffsetIsInRange(Offset, EndInsertOffset, 
+              CPUsers[CPUIndex].MaxDisp, !isThumb)) {
+          BaseInsertOffset -= (isThumb ? 2 : 4);
+          EndInsertOffset -= (isThumb ? 2 : 4);
+        }
+        // This is overly conservative, as we don't account for CPEMIs
+        // being reused within the block, but it doesn't matter much.
+        EndInsertOffset += CPUsers[CPUIndex].CPEMI->getOperand(2).getImm();
+        CPUIndex++;
+      }
+    }
+    DOUT << "Split in middle of big block\n";
+    *NewMBB = SplitBlockBeforeInstr(prior(MI));
+  }
+}
+
+/// HandleConstantPoolUser - Analyze the specified user, checking to see if it
+/// is out-of-range.  If so, pick up the constant pool value and move it some
+/// place in-range.  Return true if we changed any addresses (thus must run
+/// another pass of branch lengthening), false otherwise.
+bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &Fn, 
+                                                unsigned CPUserIndex){
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  unsigned CPI = CPEMI->getOperand(1).getConstantPoolIndex();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  MachineBasicBlock *NewMBB;
+  // Compute this only once, it's expensive.  The 4 or 8 is the value the
+  //  hardware keeps in the PC (2 insns ahead of the reference).
+  unsigned UserOffset = GetOffsetOf(UserMI) + (isThumb ? 4 : 8);
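+  // E.g. (illustrative): an ARM-mode user at function offset 0x100 is treated
+  // as being at 0x108, since the PC reads as the instruction address plus 8 in
+  // ARM mode (plus 4 in Thumb mode).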
+
+  // Special case: a tLEApcrel MI is really two instructions; the actual user
+  // is the second instruction.
+  if (UserMI->getOpcode() == ARM::tLEApcrel)
+    UserOffset += 2;
+ 
+  // See if the current entry is within range, or there is a clone of it
+  // in range.
+  int result = LookForExistingCPEntry(U, UserOffset);
+  if (result==1) return false;
+  else if (result==2) return true;
+
+  // No existing clone of this CPE is within range.
+  // We will be generating a new clone.  Get a UID for it.
+  unsigned ID  = NextUID++;
+
+  // Look for water where we can place this CPE.  We look for the farthest one
+  // away that will work.  Forward references only for now (although later
+  // we might find some that are backwards).
+
+  if (!LookForWater(U, UserOffset, &NewMBB)) {
+    // No water found.
+    DOUT << "No water found\n";
+    CreateNewWater(CPUserIndex, UserOffset, &NewMBB);
+  }
+
+  // Okay, we know we can put an island before NewMBB now, do it!
+  MachineBasicBlock *NewIsland = new MachineBasicBlock();
+  Fn.getBasicBlockList().insert(NewMBB, NewIsland);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  UpdateForInsertedWaterBlock(NewIsland);
+
+  // Decrement the old entry, and remove it if refcount becomes 0.
+  DecrementOldEntry(CPI, CPEMI);
+
+  // Now that we have an island to add the CPE to, clone the original CPE and
+  // add it to the island.
+  U.CPEMI = BuildMI(NewIsland, TII->get(ARM::CONSTPOOL_ENTRY))
+                .addImm(ID).addConstantPoolIndex(CPI).addImm(Size);
+  CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
+  NumCPEs++;
+
+  BBOffsets[NewIsland->getNumber()] = BBOffsets[NewMBB->getNumber()];
+  // Compensate for .align 2 in thumb mode.
+  if (isThumb && BBOffsets[NewIsland->getNumber()]%4 != 0) 
+    Size += 2;
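+  // (An assumption for illustration: ".align 2" here means 2^2 = 4-byte
+  // alignment, so a CPE landing at an offset ==2 mod 4 needs 2 bytes of pad.)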
+  // Increase the size of the island block to account for the new entry.
+  BBSizes[NewIsland->getNumber()] += Size;
+  AdjustBBOffsetsAfter(NewIsland, Size);
+  
+  // Finally, change the CPI in the instruction operand to be ID.
+  for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
+    if (UserMI->getOperand(i).isConstantPoolIndex()) {
+      UserMI->getOperand(i).setConstantPoolIndex(ID);
+      break;
+    }
+      
+  DOUT << "  Moved CPE to #" << ID << " CPI=" << CPI << "\t" << *UserMI;
+      
+  return true;
+}
+
+/// RemoveDeadCPEMI - Remove a dead constant pool entry instruction. Update
+/// sizes and offsets of impacted basic blocks.
+void ARMConstantIslands::RemoveDeadCPEMI(MachineInstr *CPEMI) {
+  MachineBasicBlock *CPEBB = CPEMI->getParent();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  CPEMI->eraseFromParent();
+  BBSizes[CPEBB->getNumber()] -= Size;
+  // All succeeding offsets have the current size value added in, fix this.
+  if (CPEBB->empty()) {
+    // In Thumb mode, the size of the island may have been padded by two to
+    // compensate for the alignment requirement; it will then be 2 even though
+    // the block is now empty, so fix this.
+    if (BBSizes[CPEBB->getNumber()] != 0) {
+      Size += BBSizes[CPEBB->getNumber()];
+      BBSizes[CPEBB->getNumber()] = 0;
+    }
+  }
+  AdjustBBOffsetsAfter(CPEBB, -Size);
+  // An island has only one predecessor BB and one successor BB. Check if
+  // this BB's predecessor jumps directly to this BB's successor. This
+  // shouldn't happen currently.
+  assert(!BBIsJumpedOver(CPEBB) && "How did this happen?");
+  // FIXME: remove the empty blocks after all the work is done?
+}
+
+/// RemoveUnusedCPEntries - Remove constant pool entries whose refcounts
+/// are zero.
+bool ARMConstantIslands::RemoveUnusedCPEntries() {
+  bool MadeChange = false;
+  for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+    std::vector<CPEntry> &CPEs = CPEntries[i];
+    for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) {
+      if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) {
+        RemoveDeadCPEMI(CPEs[j].CPEMI);
+        CPEs[j].CPEMI = NULL;
+        MadeChange = true;
+      }
+    }
+  }
+  return MadeChange;
+}
+
+/// BBIsInRange - Returns true if the distance between specific MI and
+/// specific BB can fit in MI's displacement field.
+bool ARMConstantIslands::BBIsInRange(MachineInstr *MI,MachineBasicBlock *DestBB,
+                                     unsigned MaxDisp) {
+  unsigned PCAdj      = isThumb ? 4 : 8;
+  unsigned BrOffset   = GetOffsetOf(MI) + PCAdj;
+  unsigned DestOffset = BBOffsets[DestBB->getNumber()];
+
+  DOUT << "Branch of destination BB#" << DestBB->getNumber()
+       << " from BB#" << MI->getParent()->getNumber()
+       << " max delta=" << MaxDisp
+       << " from " << GetOffsetOf(MI) << " to " << DestOffset
+       << " offset " << int(DestOffset-BrOffset) << "\t" << *MI;
+
+  if (BrOffset <= DestOffset) {
+    // Branch before the Dest.
+    if (DestOffset-BrOffset <= MaxDisp)
+      return true;
+  } else {
+    if (BrOffset-DestOffset <= MaxDisp)
+      return true;
+  }
+  return false;
+}
+
+/// FixUpImmediateBr - Fix up an immediate branch whose destination is too far
+/// away to fit in its displacement field.
+bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMachineBasicBlock();
+
+  // Check to see if the DestBB is already in-range.
+  if (BBIsInRange(MI, DestBB, Br.MaxDisp))
+    return false;
+
+  if (!Br.isCond)
+    return FixUpUnconditionalBr(Fn, Br);
+  return FixUpConditionalBr(Fn, Br);
+}
+
+/// FixUpUnconditionalBr - Fix up an unconditional branch whose destination is
+/// too far away to fit in its displacement field. If the LR register has been
+/// spilled in the epilogue, then we can use BL to implement a far jump.
+/// Otherwise, add an intermediate branch instruction to a branch.
+bool
+ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *MBB = MI->getParent();
+  assert(isThumb && "Expected a Thumb function!");
+
+  // Use BL to implement far jump.
+  Br.MaxDisp = (1 << 21) * 2;
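+  // (1 << 21) * 2 = 4194304 bytes, i.e. the roughly +/-4MB range of the Thumb
+  // BL pair used as a far jump here (a note added for illustration).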
+  MI->setInstrDescriptor(TII->get(ARM::tBfar));
+  BBSizes[MBB->getNumber()] += 2;
+  AdjustBBOffsetsAfter(MBB, 2);
+  HasFarJump = true;
+  NumUBrFixed++;
+
+  DOUT << "  Changed B to long jump " << *MI;
+
+  return true;
+}
+
+/// FixUpConditionalBr - Fix up a conditional branch whose destination is too
+/// far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool
+ARMConstantIslands::FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMachineBasicBlock();
+
+  // Add an unconditional branch to the destination and invert the branch
+  // condition to jump over it:
+  // blt L1
+  // =>
+  // bge L2
+  // b   L1
+  // L2:
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(1).getImmedValue();
+  CC = ARMCC::getOppositeCondition(CC);
+  unsigned CCReg = MI->getOperand(2).getReg();
+
+  // If the branch is at the end of its MBB and that has a fall-through block,
+  // direct the updated conditional branch to the fall-through block. Otherwise,
+  // split the MBB before the next instruction.
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *BMI = &MBB->back();
+  bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+
+  NumCBrFixed++;
+  if (BMI != MI) {
+    if (next(MachineBasicBlock::iterator(MI)) == MBB->back() &&
+        BMI->getOpcode() == Br.UncondBr) {
+      // Last MI in the BB is an unconditional branch. Can we simply invert the
+      // condition and swap destinations:
+      // beq L1
+      // b   L2
+      // =>
+      // bne L2
+      // b   L1
+      MachineBasicBlock *NewDest = BMI->getOperand(0).getMachineBasicBlock();
+      if (BBIsInRange(MI, NewDest, Br.MaxDisp)) {
+        DOUT << "  Invert Bcc condition and swap its destination with " << *BMI;
+        BMI->getOperand(0).setMachineBasicBlock(DestBB);
+        MI->getOperand(0).setMachineBasicBlock(NewDest);
+        MI->getOperand(1).setImm(CC);
+        return true;
+      }
+    }
+  }
+
+  if (NeedSplit) {
+    SplitBlockBeforeInstr(MI);
+    // No need for the branch to the next block. We're adding an unconditional
+    // branch to the destination.
+    int delta = ARM::GetInstSize(&MBB->back());
+    BBSizes[MBB->getNumber()] -= delta;
+    MachineBasicBlock* SplitBB = next(MachineFunction::iterator(MBB));
+    AdjustBBOffsetsAfter(SplitBB, -delta);
+    MBB->back().eraseFromParent();
+    // BBOffsets[SplitBB] is wrong temporarily, fixed below
+  }
+  MachineBasicBlock *NextBB = next(MachineFunction::iterator(MBB));
+ 
+  DOUT << "  Insert B to BB#" << DestBB->getNumber()
+       << " also invert condition and change dest. to BB#"
+       << NextBB->getNumber() << "\n";
+
+  // Insert a new conditional branch and a new unconditional branch.
+  // Also update the ImmBranch as well as adding a new entry for the new branch.
+  BuildMI(MBB, TII->get(MI->getOpcode())).addMBB(NextBB)
+    .addImm(CC).addReg(CCReg);
+  Br.MI = &MBB->back();
+  BBSizes[MBB->getNumber()] += ARM::GetInstSize(&MBB->back());
+  BuildMI(MBB, TII->get(Br.UncondBr)).addMBB(DestBB);
+  BBSizes[MBB->getNumber()] += ARM::GetInstSize(&MBB->back());
+  unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
+  ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
+
+  // Remove the old conditional branch.  It may or may not still be in MBB.
+  BBSizes[MI->getParent()->getNumber()] -= ARM::GetInstSize(MI);
+  MI->eraseFromParent();
+
+  // The net size change is an addition of one unconditional branch.
+  int delta = ARM::GetInstSize(&MBB->back());
+  AdjustBBOffsetsAfter(MBB, delta);
+  return true;
+}
+
+/// UndoLRSpillRestore - Remove Thumb push / pop instructions that only spill
+/// LR / restore LR to PC.
+bool ARMConstantIslands::UndoLRSpillRestore() {
+  bool MadeChange = false;
+  for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) {
+    MachineInstr *MI = PushPopMIs[i];
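+    // A tPOP_RET whose only register operand is pc exists solely to return via
+    // the previously spilled LR; replace it with a plain bx lr return.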
+    if (MI->getOpcode() == ARM::tPOP_RET &&
+        MI->getOperand(0).getReg() == ARM::PC &&
+        MI->getNumExplicitOperands() == 1) {
+      BuildMI(MI->getParent(), TII->get(ARM::tBX_RET));
+      MI->eraseFromParent();
+      MadeChange = true;
+    }
+  }
+  return MadeChange;
+}
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
new file mode 100644
index 0000000..30a8eaf
--- /dev/null
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -0,0 +1,90 @@
+//===- ARMConstantPoolValue.cpp - ARM constantpool value --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific constantpool value class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMConstantPoolValue.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Type.h"
+using namespace llvm;
+
+ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv, unsigned id,
+                                           ARMCP::ARMCPKind k,
+                                           unsigned char PCAdj,
+                                           const char *Modif,
+                                           bool AddCA)
+  : MachineConstantPoolValue((const Type*)gv->getType()),
+    GV(gv), S(NULL), LabelId(id), Kind(k), PCAdjust(PCAdj),
+    Modifier(Modif), AddCurrentAddress(AddCA) {}
+
+ARMConstantPoolValue::ARMConstantPoolValue(const char *s, unsigned id,
+                                           ARMCP::ARMCPKind k,
+                                           unsigned char PCAdj,
+                                           const char *Modif,
+                                           bool AddCA)
+  : MachineConstantPoolValue((const Type*)Type::Int32Ty),
+    GV(NULL), S(s), LabelId(id), Kind(k), PCAdjust(PCAdj),
+    Modifier(Modif), AddCurrentAddress(AddCA) {}
+
+ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv,
+                                           ARMCP::ARMCPKind k,
+                                           const char *Modif)
+  : MachineConstantPoolValue((const Type*)Type::Int32Ty),
+    GV(gv), S(NULL), LabelId(0), Kind(k), PCAdjust(0),
+    Modifier(Modif), AddCurrentAddress(false) {}
+
+int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                    unsigned Alignment) {
+  unsigned AlignMask = (1 << Alignment)-1;
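+  // Alignment is passed as a log2 value; only existing entries whose offset
+  // already satisfies that alignment are candidates for reuse.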
+  const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
+  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+    if (Constants[i].isMachineConstantPoolEntry() &&
+        (Constants[i].Offset & AlignMask) == 0) {
+      ARMConstantPoolValue *CPV =
+        (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+      if (CPV->GV == GV &&
+          CPV->S == S &&
+          CPV->LabelId == LabelId &&
+          CPV->Kind == Kind &&
+          CPV->PCAdjust == PCAdjust)
+        return i;
+    }
+  }
+
+  return -1;
+}
+
+void
+ARMConstantPoolValue::AddSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddPointer(GV);
+  ID.AddPointer(S);
+  ID.AddInteger(LabelId);
+  ID.AddInteger((unsigned)Kind);
+  ID.AddInteger(PCAdjust);
+}
+
+void ARMConstantPoolValue::print(std::ostream &O) const {
+  if (GV)
+    O << GV->getName();
+  else
+    O << S;
+  if (isNonLazyPointer()) O << "$non_lazy_ptr";
+  else if (isStub()) O << "$stub";
+  if (Modifier) O << "(" << Modifier << ")";
+  if (PCAdjust != 0) {
+    O << "-(LPIC" << LabelId << "+"
+      << (unsigned)PCAdjust;
+    if (AddCurrentAddress)
+      O << "-.";
+    O << ")";
+  }
+}
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
new file mode 100644
index 0000000..d71bcf0
--- /dev/null
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -0,0 +1,75 @@
+//===- ARMConstantPoolValue.h - ARM constantpool value ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific constantpool value class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
+#define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+
+namespace llvm {
+
+namespace ARMCP {
+  enum ARMCPKind {
+    CPValue,
+    CPNonLazyPtr,
+    CPStub
+  };
+}
+
+/// ARMConstantPoolValue - ARM specific constantpool value. This is used to
+/// represent the PC-relative displacement between the address of the load
+/// instruction and the global value being loaded, i.e. (&GV-(LPIC+8)).
+class ARMConstantPoolValue : public MachineConstantPoolValue {
+  GlobalValue *GV;         // GlobalValue being loaded.
+  const char *S;           // ExtSymbol being loaded.
+  unsigned LabelId;        // Label id of the load.
+  ARMCP::ARMCPKind Kind;   // non_lazy_ptr or stub?
+  unsigned char PCAdjust;  // Extra adjustment if constantpool is pc relative.
+                           // 8 for ARM, 4 for Thumb.
+  const char *Modifier;    // GV modifier i.e. (&GV(modifier)-(LPIC+8))
+  bool AddCurrentAddress;
+
+public:
+  ARMConstantPoolValue(GlobalValue *gv, unsigned id,
+                       ARMCP::ARMCPKind Kind = ARMCP::CPValue,
+                       unsigned char PCAdj = 0, const char *Modifier = NULL,
+                       bool AddCurrentAddress = false);
+  ARMConstantPoolValue(const char *s, unsigned id,
+                       ARMCP::ARMCPKind Kind = ARMCP::CPValue,
+                       unsigned char PCAdj = 0, const char *Modifier = NULL,
+                       bool AddCurrentAddress = false);
+  ARMConstantPoolValue(GlobalValue *GV, ARMCP::ARMCPKind Kind,
+                       const char *Modifier);
+
+
+  GlobalValue *getGV() const { return GV; }
+  const char *getSymbol() const { return S; }
+  const char *getModifier() const { return Modifier; }
+  bool hasModifier() const { return Modifier != NULL; }
+  bool mustAddCurrentAddress() const { return AddCurrentAddress; }
+  unsigned getLabelId() const { return LabelId; }
+  bool isNonLazyPointer() const { return Kind == ARMCP::CPNonLazyPtr; }
+  bool isStub() const { return Kind == ARMCP::CPStub; }
+  unsigned char getPCAdjustment() const { return PCAdjust; }
+
+  virtual int getExistingMachineCPValue(MachineConstantPool *CP,
+                                        unsigned Alignment);
+
+  virtual void AddSelectionDAGCSEId(FoldingSetNodeID &ID);
+
+  virtual void print(std::ostream &O) const;
+};
+  
+}
+
+#endif
diff --git a/lib/Target/ARM/ARMFrameInfo.h b/lib/Target/ARM/ARMFrameInfo.h
new file mode 100644
index 0000000..c56640a
--- /dev/null
+++ b/lib/Target/ARM/ARMFrameInfo.h
@@ -0,0 +1,33 @@
+//===-- ARMTargetFrameInfo.h - Define TargetFrameInfo for ARM ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the "Instituto Nokia de Tecnologia" and
+// is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines ARMFrameInfo, the ARM implementation of TargetFrameInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM_FRAMEINFO_H
+#define ARM_FRAMEINFO_H
+
+#include "ARM.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "ARMSubtarget.h"
+
+namespace llvm {
+
+class ARMFrameInfo : public TargetFrameInfo {
+public:
+  ARMFrameInfo(const ARMSubtarget &ST)
+    : TargetFrameInfo(StackGrowsDown, ST.getStackAlignment(), 0) {
+  }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
new file mode 100644
index 0000000..933651d
--- /dev/null
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -0,0 +1,859 @@
+//===-- ARMISelDAGToDAG.cpp - A dag to dag inst selector for ARM ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the ARM target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMISelLowering.h"
+#include "ARMTargetMachine.h"
+#include "ARMAddressingModes.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+//===--------------------------------------------------------------------===//
+/// ARMDAGToDAGISel - ARM specific code to select ARM machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+class ARMDAGToDAGISel : public SelectionDAGISel {
+  ARMTargetLowering Lowering;
+
+  /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const ARMSubtarget *Subtarget;
+
+public:
+  ARMDAGToDAGISel(ARMTargetMachine &TM)
+    : SelectionDAGISel(Lowering), Lowering(TM),
+    Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
+  }
+
+  virtual const char *getPassName() const {
+    return "ARM Instruction Selection";
+  } 
+  
+  SDNode *Select(SDOperand Op);
+  virtual void InstructionSelectBasicBlock(SelectionDAG &DAG);
+  bool SelectAddrMode2(SDOperand Op, SDOperand N, SDOperand &Base,
+                       SDOperand &Offset, SDOperand &Opc);
+  bool SelectAddrMode2Offset(SDOperand Op, SDOperand N,
+                             SDOperand &Offset, SDOperand &Opc);
+  bool SelectAddrMode3(SDOperand Op, SDOperand N, SDOperand &Base,
+                       SDOperand &Offset, SDOperand &Opc);
+  bool SelectAddrMode3Offset(SDOperand Op, SDOperand N,
+                             SDOperand &Offset, SDOperand &Opc);
+  bool SelectAddrMode5(SDOperand Op, SDOperand N, SDOperand &Base,
+                       SDOperand &Offset);
+
+  bool SelectAddrModePC(SDOperand Op, SDOperand N, SDOperand &Offset,
+                         SDOperand &Label);
+
+  bool SelectThumbAddrModeRR(SDOperand Op, SDOperand N, SDOperand &Base,
+                             SDOperand &Offset);
+  bool SelectThumbAddrModeRI5(SDOperand Op, SDOperand N, unsigned Scale,
+                              SDOperand &Base, SDOperand &OffImm,
+                              SDOperand &Offset);
+  bool SelectThumbAddrModeS1(SDOperand Op, SDOperand N, SDOperand &Base,
+                             SDOperand &OffImm, SDOperand &Offset);
+  bool SelectThumbAddrModeS2(SDOperand Op, SDOperand N, SDOperand &Base,
+                             SDOperand &OffImm, SDOperand &Offset);
+  bool SelectThumbAddrModeS4(SDOperand Op, SDOperand N, SDOperand &Base,
+                             SDOperand &OffImm, SDOperand &Offset);
+  bool SelectThumbAddrModeSP(SDOperand Op, SDOperand N, SDOperand &Base,
+                             SDOperand &OffImm);
+
+  bool SelectShifterOperandReg(SDOperand Op, SDOperand N, SDOperand &A,
+                               SDOperand &B, SDOperand &C);
+  
+  // Include the pieces autogenerated from the target description.
+#include "ARMGenDAGISel.inc"
+};
+}
+
+void ARMDAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
+  DEBUG(BB->dump());
+
+  DAG.setRoot(SelectRoot(DAG.getRoot()));
+  DAG.RemoveDeadNodes();
+
+  ScheduleAndEmitDAG(DAG);
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode2(SDOperand Op, SDOperand N,
+                                      SDOperand &Base, SDOperand &Offset,
+                                      SDOperand &Opc) {
+  if (N.getOpcode() == ISD::MUL) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      // X * [3,5,9] -> X + X * [2,4,8] etc.
+      int RHSC = (int)RHS->getValue();
+      if (RHSC & 1) {
+        RHSC = RHSC & ~1;
+        ARM_AM::AddrOpc AddSub = ARM_AM::add;
+        if (RHSC < 0) {
+          AddSub = ARM_AM::sub;
+          RHSC = - RHSC;
+        }
+        if (isPowerOf2_32(RHSC)) {
+          unsigned ShAmt = Log2_32(RHSC);
+          Base = Offset = N.getOperand(0);
+          Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt,
+                                                            ARM_AM::lsl),
+                                          MVT::i32);
+          return true;
+        }
+      }
+    }
+  }
+
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) {
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    } else if (N.getOpcode() == ARMISD::Wrapper) {
+      Base = N.getOperand(0);
+    }
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0,
+                                                      ARM_AM::no_shift),
+                                    MVT::i32);
+    return true;
+  }
+  
+  // Match simple R +/- imm12 operands.
+  if (N.getOpcode() == ISD::ADD)
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      int RHSC = (int)RHS->getValue();
+      if ((RHSC >= 0 && RHSC < 0x1000) ||
+          (RHSC < 0 && RHSC > -0x1000)) { // 12 bits.
+        Base = N.getOperand(0);
+        if (Base.getOpcode() == ISD::FrameIndex) {
+          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+          Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+        }
+        Offset = CurDAG->getRegister(0, MVT::i32);
+
+        ARM_AM::AddrOpc AddSub = ARM_AM::add;
+        if (RHSC < 0) {
+          AddSub = ARM_AM::sub;
+          RHSC = - RHSC;
+        }
+        Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, RHSC,
+                                                          ARM_AM::no_shift),
+                                        MVT::i32);
+        return true;
+      }
+    }
+  
+  // Otherwise this is R +/- [possibly shifted] R
+  ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::ADD ? ARM_AM::add:ARM_AM::sub;
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1));
+  unsigned ShAmt = 0;
+  
+  Base   = N.getOperand(0);
+  Offset = N.getOperand(1);
+  
+  if (ShOpcVal != ARM_AM::no_shift) {
+    // Check to see if the RHS of the shift is a constant; if not, we can't
+    // fold it.
+    if (ConstantSDNode *Sh =
+           dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) {
+      ShAmt = Sh->getValue();
+      Offset = N.getOperand(1).getOperand(0);
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+  
+  // Try matching (R shl C) + (R).
+  if (N.getOpcode() == ISD::ADD && ShOpcVal == ARM_AM::no_shift) {
+    ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0));
+    if (ShOpcVal != ARM_AM::no_shift) {
+      // Check to see if the RHS of the shift is a constant; if not, we can't
+      // fold it.
+      if (ConstantSDNode *Sh =
+          dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
+        ShAmt = Sh->getValue();
+        Offset = N.getOperand(0).getOperand(0);
+        Base = N.getOperand(1);
+      } else {
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    }
+  }
+  
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+                                  MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDOperand Op, SDOperand N,
+                                            SDOperand &Offset, SDOperand &Opc) {
+  unsigned Opcode = Op.getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+    int Val = (int)C->getValue();
+    if (Val >= 0 && Val < 0x1000) { // 12 bits.
+      Offset = CurDAG->getRegister(0, MVT::i32);
+      Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val,
+                                                        ARM_AM::no_shift),
+                                      MVT::i32);
+      return true;
+    }
+  }
+
+  Offset = N;
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N);
+  unsigned ShAmt = 0;
+  if (ShOpcVal != ARM_AM::no_shift) {
+    // Check to see if the RHS of the shift is a constant; if not, we can't
+    // fold it.
+    if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      ShAmt = Sh->getValue();
+      Offset = N.getOperand(0);
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+                                  MVT::i32);
+  return true;
+}
+
+
+bool ARMDAGToDAGISel::SelectAddrMode3(SDOperand Op, SDOperand N,
+                                      SDOperand &Base, SDOperand &Offset,
+                                      SDOperand &Opc) {
+  if (N.getOpcode() == ISD::SUB) {
+    // X - C is canonicalized to X + -C, so no need to handle it here.
+    Base = N.getOperand(0);
+    Offset = N.getOperand(1);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::sub, 0),MVT::i32);
+    return true;
+  }
+  
+  if (N.getOpcode() != ISD::ADD) {
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    }
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32);
+    return true;
+  }
+  
+  // If the RHS is +/- imm8, fold into addr mode.
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getValue();
+    if ((RHSC >= 0 && RHSC < 256) ||
+        (RHSC < 0 && RHSC > -256)) { // note -256 itself isn't allowed.
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      }
+      Offset = CurDAG->getRegister(0, MVT::i32);
+
+      ARM_AM::AddrOpc AddSub = ARM_AM::add;
+      if (RHSC < 0) {
+        AddSub = ARM_AM::sub;
+        RHSC = - RHSC;
+      }
+      Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, RHSC),MVT::i32);
+      return true;
+    }
+  }
+  
+  Base = N.getOperand(0);
+  Offset = N.getOperand(1);
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDOperand Op, SDOperand N,
+                                            SDOperand &Offset, SDOperand &Opc) {
+  unsigned Opcode = Op.getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+    int Val = (int)C->getValue();
+    if (Val >= 0 && Val < 256) {
+      Offset = CurDAG->getRegister(0, MVT::i32);
+      Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, Val), MVT::i32);
+      return true;
+    }
+  }
+
+  Offset = N;
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, 0), MVT::i32);
+  return true;
+}
+
+
+bool ARMDAGToDAGISel::SelectAddrMode5(SDOperand Op, SDOperand N,
+                                      SDOperand &Base, SDOperand &Offset) {
+  if (N.getOpcode() != ISD::ADD) {
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    } else if (N.getOpcode() == ARMISD::Wrapper) {
+      Base = N.getOperand(0);
+    }
+    Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+                                       MVT::i32);
+    return true;
+  }
+  
+  // If the RHS is +/- imm8, fold into addr mode.
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getValue();
+    if ((RHSC & 3) == 0) {  // The constant is implicitly multiplied by 4.
+      RHSC >>= 2;
+      if ((RHSC >= 0 && RHSC < 256) ||
+          (RHSC < 0 && RHSC > -256)) { // note -256 itself isn't allowed.
+        Base = N.getOperand(0);
+        if (Base.getOpcode() == ISD::FrameIndex) {
+          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+          Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+        }
+
+        ARM_AM::AddrOpc AddSub = ARM_AM::add;
+        if (RHSC < 0) {
+          AddSub = ARM_AM::sub;
+          RHSC = - RHSC;
+        }
+        Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
+                                           MVT::i32);
+        return true;
+      }
+    }
+  }
+  
+  Base = N;
+  Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+                                     MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrModePC(SDOperand Op, SDOperand N,
+                                        SDOperand &Offset, SDOperand &Label) {
+  if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) {
+    Offset = N.getOperand(0);
+    SDOperand N1 = N.getOperand(1);
+    Label  = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getValue(),
+                                       MVT::i32);
+    return true;
+  }
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDOperand Op, SDOperand N,
+                                            SDOperand &Base, SDOperand &Offset){
+  if (N.getOpcode() != ISD::ADD) {
+    Base = N;
+    // We must materialize a zero in a reg! Returning a constant here won't
+    // work since its node is -1 so it won't get added to the selection queue.
+    // Explicitly issue a tMOVi8 node!
+    Offset = SDOperand(CurDAG->getTargetNode(ARM::tMOVi8, MVT::i32,
+                                    CurDAG->getTargetConstant(0, MVT::i32)), 0);
+    return true;
+  }
+
+  Base = N.getOperand(0);
+  Offset = N.getOperand(1);
+  return true;
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeRI5(SDOperand Op, SDOperand N,
+                                        unsigned Scale, SDOperand &Base,
+                                        SDOperand &OffImm, SDOperand &Offset) {
+  if (Scale == 4) {
+    SDOperand TmpBase, TmpOffImm;
+    if (SelectThumbAddrModeSP(Op, N, TmpBase, TmpOffImm))
+      return false;  // We want to select tLDRspi / tSTRspi instead.
+    if (N.getOpcode() == ARMISD::Wrapper &&
+        N.getOperand(0).getOpcode() == ISD::TargetConstantPool)
+      return false;  // We want to select tLDRpci instead.
+  }
+
+  if (N.getOpcode() != ISD::ADD) {
+    Base = (N.getOpcode() == ARMISD::Wrapper) ? N.getOperand(0) : N;
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  // Thumb does not have [sp, r] address mode.
+  RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
+  RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1));
+  if ((LHSR && LHSR->getReg() == ARM::SP) ||
+      (RHSR && RHSR->getReg() == ARM::SP)) {
+    Base = N;
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  // If the RHS is + imm5 * scale, fold into addr mode.
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getValue();
+    if ((RHSC & (Scale-1)) == 0) {  // The constant is implicitly multiplied.
+      RHSC /= Scale;
+      if (RHSC >= 0 && RHSC < 32) {
+        Base = N.getOperand(0);
+        Offset = CurDAG->getRegister(0, MVT::i32);
+        OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+        return true;
+      }
+    }
+  }
+
+  Base = N.getOperand(0);
+  Offset = N.getOperand(1);
+  OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeS1(SDOperand Op, SDOperand N,
+                                            SDOperand &Base, SDOperand &OffImm,
+                                            SDOperand &Offset) {
+  return SelectThumbAddrModeRI5(Op, N, 1, Base, OffImm, Offset);
+}
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeS2(SDOperand Op, SDOperand N,
+                                            SDOperand &Base, SDOperand &OffImm,
+                                            SDOperand &Offset) {
+  return SelectThumbAddrModeRI5(Op, N, 2, Base, OffImm, Offset);
+}
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeS4(SDOperand Op, SDOperand N,
+                                            SDOperand &Base, SDOperand &OffImm,
+                                            SDOperand &Offset) {
+  return SelectThumbAddrModeRI5(Op, N, 4, Base, OffImm, Offset);
+}
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDOperand Op, SDOperand N,
+                                           SDOperand &Base, SDOperand &OffImm) {
+  if (N.getOpcode() == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  if (N.getOpcode() != ISD::ADD)
+    return false;
+
+  RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
+  if (N.getOperand(0).getOpcode() == ISD::FrameIndex ||
+      (LHSR && LHSR->getReg() == ARM::SP)) {
+    // If the RHS is + imm8 * scale, fold into addr mode.
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      int RHSC = (int)RHS->getValue();
+      if ((RHSC & 3) == 0) {  // The constant is implicitly multiplied.
+        RHSC >>= 2;
+        if (RHSC >= 0 && RHSC < 256) {
+          Base = N.getOperand(0);
+          if (Base.getOpcode() == ISD::FrameIndex) {
+            int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+            Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+          }
+          OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+          return true;
+        }
+      }
+    }
+  }
+  
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectShifterOperandReg(SDOperand Op,
+                                              SDOperand N, 
+                                              SDOperand &BaseReg,
+                                              SDOperand &ShReg,
+                                              SDOperand &Opc) {
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N);
+
+  // Don't match base register only case. That is matched to a separate
+  // lower complexity pattern with explicit register operand.
+  if (ShOpcVal == ARM_AM::no_shift) return false;
+  
+  BaseReg = N.getOperand(0);
+  unsigned ShImmVal = 0;
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    ShReg = CurDAG->getRegister(0, MVT::i32);
+    ShImmVal = RHS->getValue() & 31;
+  } else {
+    ShReg = N.getOperand(1);
+  }
+  Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal),
+                                  MVT::i32);
+  return true;
+}
+
+/// getAL - Returns an ARMCC::AL immediate node.
+static inline SDOperand getAL(SelectionDAG *CurDAG) {
+  return CurDAG->getTargetConstant((uint64_t)ARMCC::AL, MVT::i32);
+}
+
+
+SDNode *ARMDAGToDAGISel::Select(SDOperand Op) {
+  SDNode *N = Op.Val;
+  unsigned Opcode = N->getOpcode();
+
+  if (Opcode >= ISD::BUILTIN_OP_END && Opcode < ARMISD::FIRST_NUMBER)
+    return NULL;   // Already selected.
+
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::Constant: {
+    unsigned Val = cast<ConstantSDNode>(N)->getValue();
+    bool UseCP = true;
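+    // Use a constant pool load only if the immediate cannot be materialized
+    // cheaply with mov/mvn (plus a shift on Thumb, or a two-part SOImm on ARM).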
+    if (Subtarget->isThumb())
+      UseCP = (Val > 255 &&                          // MOV
+               ~Val > 255 &&                         // MOV + MVN
+               !ARM_AM::isThumbImmShiftedVal(Val));  // MOV + LSL
+    else
+      UseCP = (ARM_AM::getSOImmVal(Val) == -1 &&     // MOV
+               ARM_AM::getSOImmVal(~Val) == -1 &&    // MVN
+               !ARM_AM::isSOImmTwoPartVal(Val));     // two instrs.
+    if (UseCP) {
+      SDOperand CPIdx =
+        CurDAG->getTargetConstantPool(ConstantInt::get(Type::Int32Ty, Val),
+                                      TLI.getPointerTy());
+
+      SDNode *ResNode;
+      if (Subtarget->isThumb())
+        ResNode = CurDAG->getTargetNode(ARM::tLDRcp, MVT::i32, MVT::Other,
+                                        CPIdx, CurDAG->getEntryNode());
+      else {
+        SDOperand Ops[] = {
+          CPIdx, 
+          CurDAG->getRegister(0, MVT::i32),
+          CurDAG->getTargetConstant(0, MVT::i32),
+          getAL(CurDAG),
+          CurDAG->getRegister(0, MVT::i32),
+          CurDAG->getEntryNode()
+        };
+        ResNode=CurDAG->getTargetNode(ARM::LDRcp, MVT::i32, MVT::Other, Ops, 6);
+      }
+      ReplaceUses(Op, SDOperand(ResNode, 0));
+      return NULL;
+    }
+      
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::FrameIndex: {
+    // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm.
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    SDOperand TFI = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    if (Subtarget->isThumb())
+      return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, TFI,
+                                  CurDAG->getTargetConstant(0, MVT::i32));
+    else {
+      SDOperand Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
+                          getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+                          CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->SelectNodeTo(N, ARM::ADDri, MVT::i32, Ops, 5);
+    }
+  }
+  case ISD::ADD: {
+    // Select add sp, c to tADDhirr.
+    SDOperand N0 = Op.getOperand(0);
+    SDOperand N1 = Op.getOperand(1);
+    RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(Op.getOperand(0));
+    RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(Op.getOperand(1));
+    if (LHSR && LHSR->getReg() == ARM::SP) {
+      std::swap(N0, N1);
+      std::swap(LHSR, RHSR);
+    }
+    if (RHSR && RHSR->getReg() == ARM::SP) {
+      AddToISelQueue(N0);
+      AddToISelQueue(N1);
+      return CurDAG->SelectNodeTo(N, ARM::tADDhirr, Op.getValueType(), N0, N1);
+    }
+    break;
+  }
+  case ISD::MUL:
+    if (Subtarget->isThumb())
+      break;
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      unsigned RHSV = C->getValue();
+      if (!RHSV) break;
+      if (isPowerOf2_32(RHSV-1)) {  // 2^n+1?
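+        // x * (2^n + 1)  ==>  add Rd, x, x, lsl #n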
+        SDOperand V = Op.getOperand(0);
+        AddToISelQueue(V);
+        unsigned ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, Log2_32(RHSV-1));
+        SDOperand Ops[] = { V, V, CurDAG->getRegister(0, MVT::i32),
+                            CurDAG->getTargetConstant(ShImm, MVT::i32),
+                            getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+                            CurDAG->getRegister(0, MVT::i32) };
+        return CurDAG->SelectNodeTo(N, ARM::ADDrs, MVT::i32, Ops, 7);
+      }
+      if (isPowerOf2_32(RHSV+1)) {  // 2^n-1?
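+        // x * (2^n - 1)  ==>  rsb Rd, x, x, lsl #n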
+        SDOperand V = Op.getOperand(0);
+        AddToISelQueue(V);
+        unsigned ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, Log2_32(RHSV+1));
+        SDOperand Ops[] = { V, V, CurDAG->getRegister(0, MVT::i32),
+                            CurDAG->getTargetConstant(ShImm, MVT::i32),
+                            getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+                            CurDAG->getRegister(0, MVT::i32) };
+        return CurDAG->SelectNodeTo(N, ARM::RSBrs, MVT::i32, Ops, 7);
+      }
+    }
+    break;
+  case ARMISD::FMRRD:
+    AddToISelQueue(Op.getOperand(0));
+    return CurDAG->getTargetNode(ARM::FMRRD, MVT::i32, MVT::i32,
+                                 Op.getOperand(0), getAL(CurDAG),
+                                 CurDAG->getRegister(0, MVT::i32));
+  case ARMISD::MULHILOU: {
+    AddToISelQueue(Op.getOperand(0));
+    AddToISelQueue(Op.getOperand(1));
+    SDOperand Ops[] = { Op.getOperand(0), Op.getOperand(1),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+    return CurDAG->getTargetNode(ARM::UMULL, MVT::i32, MVT::i32, Ops, 5);
+  }
+  case ARMISD::MULHILOS: {
+    AddToISelQueue(Op.getOperand(0));
+    AddToISelQueue(Op.getOperand(1));
+    SDOperand Ops[] = { Op.getOperand(0), Op.getOperand(1),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+    return CurDAG->getTargetNode(ARM::SMULL, MVT::i32, MVT::i32, Ops, 5);
+  }
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(Op);
+    ISD::MemIndexedMode AM = LD->getAddressingMode();
+    MVT::ValueType LoadedVT = LD->getLoadedVT();
+    if (AM != ISD::UNINDEXED) {
+      SDOperand Offset, AMOpc;
+      bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+      unsigned Opcode = 0;
+      bool Match = false;
+      if (LoadedVT == MVT::i32 &&
+          SelectAddrMode2Offset(Op, LD->getOffset(), Offset, AMOpc)) {
+        Opcode = isPre ? ARM::LDR_PRE : ARM::LDR_POST;
+        Match = true;
+      } else if (LoadedVT == MVT::i16 &&
+                 SelectAddrMode3Offset(Op, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = (LD->getExtensionType() == ISD::SEXTLOAD)
+          ? (isPre ? ARM::LDRSH_PRE : ARM::LDRSH_POST)
+          : (isPre ? ARM::LDRH_PRE : ARM::LDRH_POST);
+      } else if (LoadedVT == MVT::i8 || LoadedVT == MVT::i1) {
+        if (LD->getExtensionType() == ISD::SEXTLOAD) {
+          if (SelectAddrMode3Offset(Op, LD->getOffset(), Offset, AMOpc)) {
+            Match = true;
+            Opcode = isPre ? ARM::LDRSB_PRE : ARM::LDRSB_POST;
+          }
+        } else {
+          if (SelectAddrMode2Offset(Op, LD->getOffset(), Offset, AMOpc)) {
+            Match = true;
+            Opcode = isPre ? ARM::LDRB_PRE : ARM::LDRB_POST;
+          }
+        }
+      }
+
+      if (Match) {
+        SDOperand Chain = LD->getChain();
+        SDOperand Base = LD->getBasePtr();
+        AddToISelQueue(Chain);
+        AddToISelQueue(Base);
+        AddToISelQueue(Offset);
+        SDOperand Ops[]= { Base, Offset, AMOpc, getAL(CurDAG),
+                           CurDAG->getRegister(0, MVT::i32), Chain };
+        return CurDAG->getTargetNode(Opcode, MVT::i32, MVT::i32,
+                                     MVT::Other, Ops, 6);
+      }
+    }
+    // Other cases are autogenerated.
+    break;
+  }
+  case ARMISD::BRCOND: {
+    // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+    // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc)
+    // Pattern complexity = 6  cost = 1  size = 0
+
+    // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+    // Emits: (tBcc:void (bb:Other):$dst, (imm:i32):$cc)
+    // Pattern complexity = 6  cost = 1  size = 0
+
+    unsigned Opc = Subtarget->isThumb() ? ARM::tBcc : ARM::Bcc;
+    SDOperand Chain = Op.getOperand(0);
+    SDOperand N1 = Op.getOperand(1);
+    SDOperand N2 = Op.getOperand(2);
+    SDOperand N3 = Op.getOperand(3);
+    SDOperand InFlag = Op.getOperand(4);
+    assert(N1.getOpcode() == ISD::BasicBlock);
+    assert(N2.getOpcode() == ISD::Constant);
+    assert(N3.getOpcode() == ISD::Register);
+
+    AddToISelQueue(Chain);
+    AddToISelQueue(N1);
+    AddToISelQueue(InFlag);
+    SDOperand Tmp2 = CurDAG->getTargetConstant(((unsigned)
+                               cast<ConstantSDNode>(N2)->getValue()), MVT::i32);
+    SDOperand Ops[] = { N1, Tmp2, N3, Chain, InFlag };
+    SDNode *ResNode = CurDAG->getTargetNode(Opc, MVT::Other, MVT::Flag, Ops, 5);
+    Chain = SDOperand(ResNode, 0);
+    InFlag = SDOperand(ResNode, 1);
+    ReplaceUses(SDOperand(Op.Val, 1), InFlag);
+    ReplaceUses(SDOperand(Op.Val, 0), SDOperand(Chain.Val, Chain.ResNo));
+    return NULL;
+  }
+  case ARMISD::CMOV: {
+    bool isThumb = Subtarget->isThumb();
+    MVT::ValueType VT = Op.getValueType();
+    SDOperand N0 = Op.getOperand(0);
+    SDOperand N1 = Op.getOperand(1);
+    SDOperand N2 = Op.getOperand(2);
+    SDOperand N3 = Op.getOperand(3);
+    SDOperand InFlag = Op.getOperand(4);
+    assert(N2.getOpcode() == ISD::Constant);
+    assert(N3.getOpcode() == ISD::Register);
+
+    // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
+    // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
+    // Pattern complexity = 18  cost = 1  size = 0
+    SDOperand CPTmp0;
+    SDOperand CPTmp1;
+    SDOperand CPTmp2;
+    if (!isThumb && VT == MVT::i32 &&
+        SelectShifterOperandReg(Op, N1, CPTmp0, CPTmp1, CPTmp2)) {
+      AddToISelQueue(N0);
+      AddToISelQueue(CPTmp0);
+      AddToISelQueue(CPTmp1);
+      AddToISelQueue(CPTmp2);
+      AddToISelQueue(InFlag);
+      SDOperand Tmp2 = CurDAG->getTargetConstant(((unsigned)
+                               cast<ConstantSDNode>(N2)->getValue()), MVT::i32);
+      SDOperand Ops[] = { N0, CPTmp0, CPTmp1, CPTmp2, Tmp2, N3, InFlag };
+      return CurDAG->SelectNodeTo(Op.Val, ARM::MOVCCs, MVT::i32, Ops, 7);
+    }
+
+    // Pattern: (ARMcmov:i32 GPR:i32:$false,
+    //             (imm:i32)<<P:Predicate_so_imm>><<X:so_imm_XFORM>>:$true,
+    //             (imm:i32):$cc)
+    // Emits: (MOVCCi:i32 GPR:i32:$false,
+    //           (so_imm_XFORM:i32 (imm:i32):$true), (imm:i32):$cc)
+    // Pattern complexity = 10  cost = 1  size = 0
+    if (VT == MVT::i32 &&
+        N3.getOpcode() == ISD::Constant &&
+        Predicate_so_imm(N3.Val)) {
+      AddToISelQueue(N0);
+      AddToISelQueue(InFlag);
+      SDOperand Tmp1 = CurDAG->getTargetConstant(((unsigned)
+                               cast<ConstantSDNode>(N1)->getValue()), MVT::i32);
+      Tmp1 = Transform_so_imm_XFORM(Tmp1.Val);
+      SDOperand Tmp2 = CurDAG->getTargetConstant(((unsigned)
+                               cast<ConstantSDNode>(N2)->getValue()), MVT::i32);
+      SDOperand Ops[] = { N0, Tmp1, Tmp2, N3, InFlag };
+      return CurDAG->SelectNodeTo(Op.Val, ARM::MOVCCi, MVT::i32, Ops, 5);
+    }
+
+    // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
+    // Emits: (MOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
+    // Pattern complexity = 6  cost = 1  size = 0
+    //
+    // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
+    // Emits: (tMOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
+    // Pattern complexity = 6  cost = 11  size = 0
+    //
+    // Also FCPYScc and FCPYDcc.
+    AddToISelQueue(N0);
+    AddToISelQueue(N1);
+    AddToISelQueue(InFlag);
+    SDOperand Tmp2 = CurDAG->getTargetConstant(((unsigned)
+                               cast<ConstantSDNode>(N2)->getValue()), MVT::i32);
+    SDOperand Ops[] = { N0, N1, Tmp2, N3, InFlag };
+    unsigned Opc = 0;
+    switch (VT) {
+    default: assert(false && "Illegal conditional move type!");
+      break;
+    case MVT::i32:
+      Opc = isThumb ? ARM::tMOVCCr : ARM::MOVCCr;
+      break;
+    case MVT::f32:
+      Opc = ARM::FCPYScc;
+      break;
+    case MVT::f64:
+      Opc = ARM::FCPYDcc;
+      break; 
+    }
+    return CurDAG->SelectNodeTo(Op.Val, Opc, VT, Ops, 5);
+  }
+  case ARMISD::CNEG: {
+    MVT::ValueType VT = Op.getValueType();
+    SDOperand N0 = Op.getOperand(0);
+    SDOperand N1 = Op.getOperand(1);
+    SDOperand N2 = Op.getOperand(2);
+    SDOperand N3 = Op.getOperand(3);
+    SDOperand InFlag = Op.getOperand(4);
+    assert(N2.getOpcode() == ISD::Constant);
+    assert(N3.getOpcode() == ISD::Register);
+
+    AddToISelQueue(N0);
+    AddToISelQueue(N1);
+    AddToISelQueue(InFlag);
+    SDOperand Tmp2 = CurDAG->getTargetConstant(((unsigned)
+                               cast<ConstantSDNode>(N2)->getValue()), MVT::i32);
+    SDOperand Ops[] = { N0, N1, Tmp2, N3, InFlag };
+    unsigned Opc = 0;
+    switch (VT) {
+    default: assert(false && "Illegal conditional move type!");
+      break;
+    case MVT::f32:
+      Opc = ARM::FNEGScc;
+      break;
+    case MVT::f64:
+      Opc = ARM::FNEGDcc;
+      break; 
+    }
+    return CurDAG->SelectNodeTo(Op.Val, Opc, VT, Ops, 5);
+  }
+  }
+  return SelectCode(Op);
+}
+
+/// createARMISelDag - This pass converts a legalized DAG into a
+/// ARM-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createARMISelDag(ARMTargetMachine &TM) {
+  return new ARMDAGToDAGISel(TM);
+}
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
new file mode 100644
index 0000000..6f63fbd
--- /dev/null
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -0,0 +1,1859 @@
+//===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that ARM uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMISelLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/Instruction.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
+    : TargetLowering(TM), ARMPCLabelIndex(0) {
+  Subtarget = &TM.getSubtarget<ARMSubtarget>();
+
+  if (Subtarget->isTargetDarwin()) {
+    // Don't have these.
+    setLibcallName(RTLIB::UINTTOFP_I64_F32, NULL);
+    setLibcallName(RTLIB::UINTTOFP_I64_F64, NULL);
+
+    // Uses VFP for Thumb libfuncs if available.
+    if (Subtarget->isThumb() && Subtarget->hasVFP2()) {
+      // Single-precision floating-point arithmetic.
+      setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
+      setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
+      setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp");
+      setLibcallName(RTLIB::DIV_F32, "__divsf3vfp");
+
+      // Double-precision floating-point arithmetic.
+      setLibcallName(RTLIB::ADD_F64, "__adddf3vfp");
+      setLibcallName(RTLIB::SUB_F64, "__subdf3vfp");
+      setLibcallName(RTLIB::MUL_F64, "__muldf3vfp");
+      setLibcallName(RTLIB::DIV_F64, "__divdf3vfp");
+
+      // Single-precision comparisons.
+      setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp");
+      setLibcallName(RTLIB::UNE_F32, "__nesf2vfp");
+      setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp");
+      setLibcallName(RTLIB::OLE_F32, "__lesf2vfp");
+      setLibcallName(RTLIB::OGE_F32, "__gesf2vfp");
+      setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp");
+      setLibcallName(RTLIB::UO_F32,  "__unordsf2vfp");
+      setLibcallName(RTLIB::O_F32,   "__unordsf2vfp");
+
+      setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::UO_F32,  ISD::SETNE);
+      setCmpLibcallCC(RTLIB::O_F32,   ISD::SETEQ);
+
+      // Double-precision comparisons.
+      setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp");
+      setLibcallName(RTLIB::UNE_F64, "__nedf2vfp");
+      setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp");
+      setLibcallName(RTLIB::OLE_F64, "__ledf2vfp");
+      setLibcallName(RTLIB::OGE_F64, "__gedf2vfp");
+      setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp");
+      setLibcallName(RTLIB::UO_F64,  "__unorddf2vfp");
+      setLibcallName(RTLIB::O_F64,   "__unorddf2vfp");
+
+      setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::UO_F64,  ISD::SETNE);
+      setCmpLibcallCC(RTLIB::O_F64,   ISD::SETEQ);
+
+      // Floating-point to integer conversions.
+      // i64 conversions are done via library routines even when generating VFP
+      // instructions, so use the same ones.
+      setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp");
+      setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp");
+      setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp");
+      setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp");
+
+      // Conversions between floating types.
+      setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp");
+      setLibcallName(RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp");
+
+      // Integer to floating-point conversions.
+      // i64 conversions are done via library routines even when generating VFP
+      // instructions, so use the same ones.
+      // FIXME: There appears to be some naming inconsistency in ARM libgcc: e.g.
+      // __floatunsidf vs. __floatunssidfvfp.
+      setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp");
+      setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp");
+      setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp");
+      setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp");
+    }
+  }
+
+  addRegisterClass(MVT::i32, ARM::GPRRegisterClass);
+  if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb()) {
+    addRegisterClass(MVT::f32, ARM::SPRRegisterClass);
+    addRegisterClass(MVT::f64, ARM::DPRRegisterClass);
+  }
+  computeRegisterProperties();
+
+  // ARM does not have f32 extending load.
+  setLoadXAction(ISD::EXTLOAD, MVT::f32, Expand);
+
+  // ARM supports all 4 flavors of integer indexed load / store.
+  for (unsigned im = (unsigned)ISD::PRE_INC;
+       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+    setIndexedLoadAction(im,  MVT::i1,  Legal);
+    setIndexedLoadAction(im,  MVT::i8,  Legal);
+    setIndexedLoadAction(im,  MVT::i16, Legal);
+    setIndexedLoadAction(im,  MVT::i32, Legal);
+    setIndexedStoreAction(im, MVT::i1,  Legal);
+    setIndexedStoreAction(im, MVT::i8,  Legal);
+    setIndexedStoreAction(im, MVT::i16, Legal);
+    setIndexedStoreAction(im, MVT::i32, Legal);
+  }
+
+  // i64 operation support.
+  if (Subtarget->isThumb()) {
+    setOperationAction(ISD::MUL,     MVT::i64, Expand);
+    setOperationAction(ISD::MULHU,   MVT::i32, Expand);
+    setOperationAction(ISD::MULHS,   MVT::i32, Expand);
+  } else {
+    setOperationAction(ISD::MUL,     MVT::i64, Custom);
+    setOperationAction(ISD::MULHU,   MVT::i32, Custom);
+    if (!Subtarget->hasV6Ops())
+      setOperationAction(ISD::MULHS, MVT::i32, Custom);
+  }
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRL,       MVT::i64, Custom);
+  setOperationAction(ISD::SRA,       MVT::i64, Custom);
+
+  // ARM does not have ROTL.
+  setOperationAction(ISD::ROTL,  MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ , MVT::i32, Expand);
+  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+  if (!Subtarget->hasV5TOps() || Subtarget->isThumb())
+    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+
+  // Only ARMv6 has BSWAP.
+  if (!Subtarget->hasV6Ops())
+    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+
+  // These are expanded into libcalls.
+  setOperationAction(ISD::SDIV,  MVT::i32, Expand);
+  setOperationAction(ISD::UDIV,  MVT::i32, Expand);
+  setOperationAction(ISD::SREM,  MVT::i32, Expand);
+  setOperationAction(ISD::UREM,  MVT::i32, Expand);
+  
+  // Support label based line numbers.
+  setOperationAction(ISD::LOCATION, MVT::Other, Expand);
+  setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+
+  setOperationAction(ISD::RET,           MVT::Other, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
+  setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
+  setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+
+  // Expand mem operations generically.
+  setOperationAction(ISD::MEMSET          , MVT::Other, Expand);
+  setOperationAction(ISD::MEMCPY          , MVT::Other, Custom);
+  setOperationAction(ISD::MEMMOVE         , MVT::Other, Expand);
+  
+  // Use the default implementation.
+  setOperationAction(ISD::VASTART           , MVT::Other, Expand);
+  setOperationAction(ISD::VAARG             , MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand); 
+  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Expand);
+
+  if (!Subtarget->hasV6Ops()) {
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
+  }
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb())
+    // Turn f64->i64 into FMRRD iff target supports vfp2.
+    setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom);
+  
+  setOperationAction(ISD::SETCC    , MVT::i32, Expand);
+  setOperationAction(ISD::SETCC    , MVT::f32, Expand);
+  setOperationAction(ISD::SETCC    , MVT::f64, Expand);
+  setOperationAction(ISD::SELECT   , MVT::i32, Expand);
+  setOperationAction(ISD::SELECT   , MVT::f32, Expand);
+  setOperationAction(ISD::SELECT   , MVT::f64, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+  setOperationAction(ISD::BRCOND   , MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC    , MVT::i32,   Custom);
+  setOperationAction(ISD::BR_CC    , MVT::f32,   Custom);
+  setOperationAction(ISD::BR_CC    , MVT::f64,   Custom);
+  setOperationAction(ISD::BR_JT    , MVT::Other, Custom);
+
+  setOperationAction(ISD::VASTART,       MVT::Other, Custom);
+  setOperationAction(ISD::VACOPY,        MVT::Other, Expand); 
+  setOperationAction(ISD::VAEND,         MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE,     MVT::Other, Expand); 
+  setOperationAction(ISD::STACKRESTORE,  MVT::Other, Expand);
+
+  // FP Constants can't be immediates.
+  setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+  setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+
+  // We don't support sin/cos/fmod/copysign
+  setOperationAction(ISD::FSIN     , MVT::f64, Expand);
+  setOperationAction(ISD::FSIN     , MVT::f32, Expand);
+  setOperationAction(ISD::FCOS     , MVT::f32, Expand);
+  setOperationAction(ISD::FCOS     , MVT::f64, Expand);
+  setOperationAction(ISD::FREM     , MVT::f64, Expand);
+  setOperationAction(ISD::FREM     , MVT::f32, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+  
+  // int <-> fp are custom expanded into bit_convert + ARMISD ops.
+  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+
+  setStackPointerRegisterToSaveRestore(ARM::SP);
+  setSchedulingPreference(SchedulingForRegPressure);
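+  // Thumb (pre-Thumb2) cannot predicate instructions, so if-conversion is
+  // disabled there; ARM allows blocks of up to 10 instructions (2 when
+  // duplication is required).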
+  setIfCvtBlockSizeLimit(Subtarget->isThumb() ? 0 : 10);
+  setIfCvtDupBlockSizeLimit(Subtarget->isThumb() ? 0 : 2);
+
+  maxStoresPerMemcpy = 1;   // Temporary - rewrite interface to use type.
+}
+
+
+const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return 0;
+  case ARMISD::Wrapper:       return "ARMISD::Wrapper";
+  case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
+  case ARMISD::CALL:          return "ARMISD::CALL";
+  case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
+  case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
+  case ARMISD::tCALL:         return "ARMISD::tCALL";
+  case ARMISD::BRCOND:        return "ARMISD::BRCOND";
+  case ARMISD::BR_JT:         return "ARMISD::BR_JT";
+  case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
+  case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
+  case ARMISD::CMP:           return "ARMISD::CMP";
+  case ARMISD::CMPNZ:         return "ARMISD::CMPNZ";
+  case ARMISD::CMPFP:         return "ARMISD::CMPFP";
+  case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
+  case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
+  case ARMISD::CMOV:          return "ARMISD::CMOV";
+  case ARMISD::CNEG:          return "ARMISD::CNEG";
+    
+  case ARMISD::FTOSI:         return "ARMISD::FTOSI";
+  case ARMISD::FTOUI:         return "ARMISD::FTOUI";
+  case ARMISD::SITOF:         return "ARMISD::SITOF";
+  case ARMISD::UITOF:         return "ARMISD::UITOF";
+  case ARMISD::MULHILOU:      return "ARMISD::MULHILOU";
+  case ARMISD::MULHILOS:      return "ARMISD::MULHILOS";
+
+  case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
+  case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
+  case ARMISD::RRX:           return "ARMISD::RRX";
+      
+  case ARMISD::FMRRD:         return "ARMISD::FMRRD";
+  case ARMISD::FMDRR:         return "ARMISD::FMDRR";
+
+  case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Lowering Code
+//===----------------------------------------------------------------------===//
+
+
+/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
+static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
+  switch (CC) {
+  default: assert(0 && "Unknown condition code!");
+  case ISD::SETNE:  return ARMCC::NE;
+  case ISD::SETEQ:  return ARMCC::EQ;
+  case ISD::SETGT:  return ARMCC::GT;
+  case ISD::SETGE:  return ARMCC::GE;
+  case ISD::SETLT:  return ARMCC::LT;
+  case ISD::SETLE:  return ARMCC::LE;
+  case ISD::SETUGT: return ARMCC::HI;
+  case ISD::SETUGE: return ARMCC::HS;
+  case ISD::SETULT: return ARMCC::LO;
+  case ISD::SETULE: return ARMCC::LS;
+  }
+}
+
+/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. It
+/// returns true if the operands should be inverted to form the proper
+/// comparison.
+static bool FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
+                        ARMCC::CondCodes &CondCode2) {
+  bool Invert = false;
+  CondCode2 = ARMCC::AL;
+  switch (CC) {
+  default: assert(0 && "Unknown FP condition!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
+  case ISD::SETGT:
+  case ISD::SETOGT: CondCode = ARMCC::GT; break;
+  case ISD::SETGE:
+  case ISD::SETOGE: CondCode = ARMCC::GE; break;
+  case ISD::SETOLT: CondCode = ARMCC::MI; break;
+  case ISD::SETOLE: CondCode = ARMCC::GT; Invert = true; break;
+  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
+  case ISD::SETO:   CondCode = ARMCC::VC; break;
+  case ISD::SETUO:  CondCode = ARMCC::VS; break;
+  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
+  case ISD::SETUGT: CondCode = ARMCC::HI; break;
+  case ISD::SETUGE: CondCode = ARMCC::PL; break;
+  case ISD::SETLT:
+  case ISD::SETULT: CondCode = ARMCC::LT; break;
+  case ISD::SETLE:
+  case ISD::SETULE: CondCode = ARMCC::LE; break;
+  case ISD::SETNE:
+  case ISD::SETUNE: CondCode = ARMCC::NE; break;
+  }
+  return Invert;
+}
+
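+/// HowToPassArgument - Decide how an argument of type ObjectVT is passed:
+/// how many GPRs (r0-r3) and how many bytes of stack it needs, plus any
+/// register / stack padding required to honor the argument's original
+/// alignment (e.g. i64 / f64 start in an even-numbered register or an
+/// 8-byte aligned stack slot).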
+static void
+HowToPassArgument(MVT::ValueType ObjectVT, unsigned NumGPRs,
+                  unsigned StackOffset, unsigned &NeededGPRs,
+                  unsigned &NeededStackSize, unsigned &GPRPad,
+                  unsigned &StackPad, unsigned Flags) {
+  NeededStackSize = 0;
+  NeededGPRs = 0;
+  StackPad = 0;
+  GPRPad = 0;
+  unsigned align = (Flags >> ISD::ParamFlags::OrigAlignmentOffs);
+  GPRPad = NumGPRs % ((align + 3)/4);
+  StackPad = StackOffset % align;
+  unsigned firstGPR = NumGPRs + GPRPad;
+  switch (ObjectVT) {
+  default: assert(0 && "Unhandled argument type!");
+  case MVT::i32:
+  case MVT::f32:
+    if (firstGPR < 4)
+      NeededGPRs = 1;
+    else
+      NeededStackSize = 4;
+    break;
+  case MVT::i64:
+  case MVT::f64:
+    if (firstGPR < 3)
+      NeededGPRs = 2;
+    else if (firstGPR == 3) {
+      NeededGPRs = 1;
+      NeededStackSize = 4;
+    } else
+      NeededStackSize = 8;
+  }
+}
+
+/// LowerCALL - Lower an ISD::CALL node into a callseq_start <-
+/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
+/// nodes.
+SDOperand ARMTargetLowering::LowerCALL(SDOperand Op, SelectionDAG &DAG) {
+  MVT::ValueType RetVT= Op.Val->getValueType(0);
+  SDOperand Chain    = Op.getOperand(0);
+  unsigned CallConv  = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+  assert((CallConv == CallingConv::C ||
+          CallConv == CallingConv::Fast) && "unknown calling convention");
+  SDOperand Callee   = Op.getOperand(4);
+  unsigned NumOps    = (Op.getNumOperands() - 5) / 2;
+  unsigned ArgOffset = 0;   // Frame mechanisms handle retaddr slot
+  unsigned NumGPRs = 0;     // GPRs used for parameter passing.
+
+  // Count how many bytes are to be pushed on the stack.
+  unsigned NumBytes = 0;
+
+  // Add up all the space actually used.
+  for (unsigned i = 0; i < NumOps; ++i) {
+    unsigned ObjSize;
+    unsigned ObjGPRs;
+    unsigned StackPad;
+    unsigned GPRPad;
+    MVT::ValueType ObjectVT = Op.getOperand(5+2*i).getValueType();
+    unsigned Flags = Op.getConstantOperandVal(5+2*i+1);
+    HowToPassArgument(ObjectVT, NumGPRs, NumBytes, ObjGPRs, ObjSize,
+                      GPRPad, StackPad, Flags);
+    NumBytes += ObjSize + StackPad;
+    NumGPRs += ObjGPRs + GPRPad;
+  }
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain,
+                               DAG.getConstant(NumBytes, MVT::i32));
+
+  SDOperand StackPtr = DAG.getRegister(ARM::SP, MVT::i32);
+
+  static const unsigned GPRArgRegs[] = {
+    ARM::R0, ARM::R1, ARM::R2, ARM::R3
+  };
+
+  NumGPRs = 0;
+  std::vector<std::pair<unsigned, SDOperand> > RegsToPass;
+  std::vector<SDOperand> MemOpChains;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDOperand Arg = Op.getOperand(5+2*i);
+    unsigned Flags = Op.getConstantOperandVal(5+2*i+1);
+    MVT::ValueType ArgVT = Arg.getValueType();
+
+    unsigned ObjSize;
+    unsigned ObjGPRs;
+    unsigned GPRPad;
+    unsigned StackPad;
+    HowToPassArgument(ArgVT, NumGPRs, ArgOffset, ObjGPRs,
+                      ObjSize, GPRPad, StackPad, Flags);
+    NumGPRs += GPRPad;
+    ArgOffset += StackPad;
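+    // This argument lives (at least partially) in registers; copy it into the
+    // outgoing GPRs.  If an i64 / f64 straddles the register/stack boundary,
+    // its high half is stored to the stack instead.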
+    if (ObjGPRs > 0) {
+      switch (ArgVT) {
+      default: assert(0 && "Unexpected ValueType for argument!");
+      case MVT::i32:
+        RegsToPass.push_back(std::make_pair(GPRArgRegs[NumGPRs], Arg));
+        break;
+      case MVT::f32:
+        RegsToPass.push_back(std::make_pair(GPRArgRegs[NumGPRs],
+                                 DAG.getNode(ISD::BIT_CONVERT, MVT::i32, Arg)));
+        break;
+      case MVT::i64: {
+        SDOperand Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Arg,
+                                   DAG.getConstant(0, getPointerTy()));
+        SDOperand Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Arg,
+                                   DAG.getConstant(1, getPointerTy()));
+        RegsToPass.push_back(std::make_pair(GPRArgRegs[NumGPRs], Lo));
+        if (ObjGPRs == 2)
+          RegsToPass.push_back(std::make_pair(GPRArgRegs[NumGPRs+1], Hi));
+        else {
+          SDOperand PtrOff= DAG.getConstant(ArgOffset, StackPtr.getValueType());
+          PtrOff = DAG.getNode(ISD::ADD, MVT::i32, StackPtr, PtrOff);
+          MemOpChains.push_back(DAG.getStore(Chain, Hi, PtrOff, NULL, 0));
+        }
+        break;
+      }
+      case MVT::f64: {
+        SDOperand Cvt = DAG.getNode(ARMISD::FMRRD,
+                                    DAG.getVTList(MVT::i32, MVT::i32),
+                                    &Arg, 1);
+        RegsToPass.push_back(std::make_pair(GPRArgRegs[NumGPRs], Cvt));
+        if (ObjGPRs == 2)
+          RegsToPass.push_back(std::make_pair(GPRArgRegs[NumGPRs+1],
+                                              Cvt.getValue(1)));
+        else {
+          SDOperand PtrOff= DAG.getConstant(ArgOffset, StackPtr.getValueType());
+          PtrOff = DAG.getNode(ISD::ADD, MVT::i32, StackPtr, PtrOff);
+          MemOpChains.push_back(DAG.getStore(Chain, Cvt.getValue(1), PtrOff,
+                                             NULL, 0));
+        }
+        break;
+      }
+      }
+    } else {
+      assert(ObjSize != 0);
+      SDOperand PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+      PtrOff = DAG.getNode(ISD::ADD, MVT::i32, StackPtr, PtrOff);
+      MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
+    }
+
+    NumGPRs += ObjGPRs;
+    ArgOffset += ObjSize;
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDOperand InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  bool isDirect = false;
+  bool isARMFunc = false;
+  bool isLocalARMFunc = false;
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    GlobalValue *GV = G->getGlobal();
+    isDirect = true;
+    bool isExt = (GV->isDeclaration() || GV->hasWeakLinkage() ||
+                  GV->hasLinkOnceLinkage());
+    bool isStub = (isExt && Subtarget->isTargetDarwin()) &&
+                   getTargetMachine().getRelocationModel() != Reloc::Static;
+    isARMFunc = !Subtarget->isThumb() || isStub;
+    // ARM call to a local ARM function is predicable.
+    isLocalARMFunc = !Subtarget->isThumb() && !isExt;
+    // tBX takes a register source operand.
+    if (isARMFunc && Subtarget->isThumb() && !Subtarget->hasV5TOps()) {
+      ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMPCLabelIndex,
+                                                           ARMCP::CPStub, 4);
+      SDOperand CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 2);
+      CPAddr = DAG.getNode(ARMISD::Wrapper, MVT::i32, CPAddr);
+      Callee = DAG.getLoad(getPointerTy(), DAG.getEntryNode(), CPAddr, NULL, 0); 
+      SDOperand PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
+      Callee = DAG.getNode(ARMISD::PIC_ADD, getPointerTy(), Callee, PICLabel);
+    } else
+      Callee = DAG.getTargetGlobalAddress(GV, getPointerTy());
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    isDirect = true;
+    bool isStub = Subtarget->isTargetDarwin() &&
+                  getTargetMachine().getRelocationModel() != Reloc::Static;
+    isARMFunc = !Subtarget->isThumb() || isStub;
+    // tBX takes a register source operand.
+    const char *Sym = S->getSymbol();
+    if (isARMFunc && Subtarget->isThumb() && !Subtarget->hasV5TOps()) {
+      ARMConstantPoolValue *CPV = new ARMConstantPoolValue(Sym, ARMPCLabelIndex,
+                                                           ARMCP::CPStub, 4);
+      SDOperand CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 2);
+      CPAddr = DAG.getNode(ARMISD::Wrapper, MVT::i32, CPAddr);
+      Callee = DAG.getLoad(getPointerTy(), DAG.getEntryNode(), CPAddr, NULL, 0); 
+      SDOperand PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
+      Callee = DAG.getNode(ARMISD::PIC_ADD, getPointerTy(), Callee, PICLabel);
+    } else
+      Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy());
+  }
+
+  // FIXME: handle tail calls differently.
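+  // Pick the call opcode.  Without v5T there is no BLX, so indirect calls
+  // (and, for Thumb callers, calls into ARM code) cannot branch-and-link
+  // directly; they are lowered to CALL_NOLINK and LR is handled explicitly.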
+  unsigned CallOpc;
+  if (Subtarget->isThumb()) {
+    if (!Subtarget->hasV5TOps() && (!isDirect || isARMFunc))
+      CallOpc = ARMISD::CALL_NOLINK;
+    else
+      CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
+  } else {
+    CallOpc = (isDirect || Subtarget->hasV5TOps())
+      ? (isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL)
+      : ARMISD::CALL_NOLINK;
+  }
+  if (CallOpc == ARMISD::CALL_NOLINK && !Subtarget->isThumb()) {
+    // implicit def LR - LR mustn't be allocated as GPR:$dst of CALL_NOLINK
+    Chain = DAG.getCopyToReg(Chain, ARM::LR,
+                             DAG.getNode(ISD::UNDEF, MVT::i32), InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  std::vector<MVT::ValueType> NodeTys;
+  NodeTys.push_back(MVT::Other);   // Returns a chain
+  NodeTys.push_back(MVT::Flag);    // Returns a flag for retval copy to use.
+
+  std::vector<SDOperand> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.Val)
+    Ops.push_back(InFlag);
+  Chain = DAG.getNode(CallOpc, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  SDOperand CSOps[] = { Chain, DAG.getConstant(NumBytes, MVT::i32), InFlag };
+  Chain = DAG.getNode(ISD::CALLSEQ_END, 
+                      DAG.getNodeValueTypes(MVT::Other, MVT::Flag),
+                      ((RetVT != MVT::Other) ? 2 : 1), CSOps, 3);
+  if (RetVT != MVT::Other)
+    InFlag = Chain.getValue(1);
+
+  std::vector<SDOperand> ResultVals;
+  NodeTys.clear();
+
+  // If the call has results, copy the values out of the ret val registers.
+  switch (RetVT) {
+  default: assert(0 && "Unexpected ret value!");
+  case MVT::Other:
+    break;
+  case MVT::i32:
+    Chain = DAG.getCopyFromReg(Chain, ARM::R0, MVT::i32, InFlag).getValue(1);
+    ResultVals.push_back(Chain.getValue(0));
+    if (Op.Val->getValueType(1) == MVT::i32) {
+      // Returns an i64 value.
+      Chain = DAG.getCopyFromReg(Chain, ARM::R1, MVT::i32,
+                                 Chain.getValue(2)).getValue(1);
+      ResultVals.push_back(Chain.getValue(0));
+      NodeTys.push_back(MVT::i32);
+    }
+    NodeTys.push_back(MVT::i32);
+    break;
+  case MVT::f32:
+    Chain = DAG.getCopyFromReg(Chain, ARM::R0, MVT::i32, InFlag).getValue(1);
+    ResultVals.push_back(DAG.getNode(ISD::BIT_CONVERT, MVT::f32,
+                                     Chain.getValue(0)));
+    NodeTys.push_back(MVT::f32);
+    break;
+  case MVT::f64: {
+    SDOperand Lo = DAG.getCopyFromReg(Chain, ARM::R0, MVT::i32, InFlag);
+    SDOperand Hi = DAG.getCopyFromReg(Lo, ARM::R1, MVT::i32, Lo.getValue(2));
+    ResultVals.push_back(DAG.getNode(ARMISD::FMDRR, MVT::f64, Lo, Hi));
+    NodeTys.push_back(MVT::f64);
+    break;
+  }
+  }
+
+  NodeTys.push_back(MVT::Other);
+
+  if (ResultVals.empty())
+    return Chain;
+
+  ResultVals.push_back(Chain);
+  SDOperand Res = DAG.getNode(ISD::MERGE_VALUES, NodeTys, &ResultVals[0],
+                              ResultVals.size());
+  return Res.getValue(Op.ResNo);
+}
+
+static SDOperand LowerRET(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand Copy;
+  SDOperand Chain = Op.getOperand(0);
+  switch(Op.getNumOperands()) {
+  default:
+    assert(0 && "Do not know how to return this many arguments!");
+    abort();
+  case 1: {
+    SDOperand LR = DAG.getRegister(ARM::LR, MVT::i32);
+    return DAG.getNode(ARMISD::RET_FLAG, MVT::Other, Chain);
+  }
+  case 3:
+    Op = Op.getOperand(1);
+    if (Op.getValueType() == MVT::f32) {
+      Op = DAG.getNode(ISD::BIT_CONVERT, MVT::i32, Op);
+    } else if (Op.getValueType() == MVT::f64) {
+      // Recursively legalize f64 -> i64.
+      Op = DAG.getNode(ISD::BIT_CONVERT, MVT::i64, Op);
+      return DAG.getNode(ISD::RET, MVT::Other, Chain, Op,
+                         DAG.getConstant(0, MVT::i32));
+    }
+    Copy = DAG.getCopyToReg(Chain, ARM::R0, Op, SDOperand());
+    if (DAG.getMachineFunction().liveout_empty())
+      DAG.getMachineFunction().addLiveOut(ARM::R0);
+    break;
+  case 5:
+    Copy = DAG.getCopyToReg(Chain, ARM::R1, Op.getOperand(3), SDOperand());
+    Copy = DAG.getCopyToReg(Copy, ARM::R0, Op.getOperand(1), Copy.getValue(1));
+    // If we haven't noted the R0+R1 are live out, do so now.
+    if (DAG.getMachineFunction().liveout_empty()) {
+      DAG.getMachineFunction().addLiveOut(ARM::R0);
+      DAG.getMachineFunction().addLiveOut(ARM::R1);
+    }
+    break;
+  }
+
+  // We must use RET_FLAG instead of BRIND because BRIND doesn't have a flag.
+  return DAG.getNode(ARMISD::RET_FLAG, MVT::Other, Copy, Copy.getValue(1));
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target counterparts wrapped in the ARMISD::Wrapper node. Suppose N is
+// one of the above-mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form an addressing mode. These wrapped nodes will be selected
+// into MOVi.
+static SDOperand LowerConstantPool(SDOperand Op, SelectionDAG &DAG) {
+  MVT::ValueType PtrVT = Op.getValueType();
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  SDOperand Res;
+  if (CP->isMachineConstantPoolEntry())
+    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+                                    CP->getAlignment());
+  else
+    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+                                    CP->getAlignment());
+  return DAG.getNode(ARMISD::Wrapper, MVT::i32, Res);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model
+SDOperand
+ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+                                                 SelectionDAG &DAG) {
+  MVT::ValueType PtrVT = getPointerTy();
+  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
+  ARMConstantPoolValue *CPV =
+    new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue,
+                             PCAdj, "tlsgd", true);
+  SDOperand Argument = DAG.getTargetConstantPool(CPV, PtrVT, 2);
+  Argument = DAG.getNode(ARMISD::Wrapper, MVT::i32, Argument);
+  Argument = DAG.getLoad(PtrVT, DAG.getEntryNode(), Argument, NULL, 0);
+  SDOperand Chain = Argument.getValue(1);
+
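+  // The constant pool entry holds a PC-relative value; add the current PC
+  // (via PIC_ADD) to form the actual address passed to __tls_get_addr.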
+  SDOperand PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
+  Argument = DAG.getNode(ARMISD::PIC_ADD, PtrVT, Argument, PICLabel);
+
+  // call __tls_get_addr.
+  ArgListTy Args;
+  ArgListEntry Entry;
+  Entry.Node = Argument;
+  Entry.Ty = (const Type *) Type::Int32Ty;
+  Args.push_back(Entry);
+  std::pair<SDOperand, SDOperand> CallResult =
+    LowerCallTo(Chain, (const Type *) Type::Int32Ty, false, false,
+                CallingConv::C, false,
+                DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG);
+  return CallResult.first;
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or
+// "local exec" model.
+SDOperand
+ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
+                                            SelectionDAG &DAG) {
+  GlobalValue *GV = GA->getGlobal();
+  SDOperand Offset;
+  SDOperand Chain = DAG.getEntryNode();
+  MVT::ValueType PtrVT = getPointerTy();
+  // Get the Thread Pointer
+  SDOperand ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, PtrVT);
+
+  if (GV->isDeclaration()){
+    // initial exec model
+    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
+    ARMConstantPoolValue *CPV =
+      new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue,
+                               PCAdj, "gottpoff", true);
+    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 2);
+    Offset = DAG.getNode(ARMISD::Wrapper, MVT::i32, Offset);
+    Offset = DAG.getLoad(PtrVT, Chain, Offset, NULL, 0);
+    Chain = Offset.getValue(1);
+
+    SDOperand PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
+    Offset = DAG.getNode(ARMISD::PIC_ADD, PtrVT, Offset, PICLabel);
+
+    Offset = DAG.getLoad(PtrVT, Chain, Offset, NULL, 0);
+  } else {
+    // local exec model
+    ARMConstantPoolValue *CPV =
+      new ARMConstantPoolValue(GV, ARMCP::CPValue, "tpoff");
+    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 2);
+    Offset = DAG.getNode(ARMISD::Wrapper, MVT::i32, Offset);
+    Offset = DAG.getLoad(PtrVT, Chain, Offset, NULL, 0);
+  }
+
+  // The address of the thread local variable is the add of the thread
+  // pointer with the offset of the variable.
+  return DAG.getNode(ISD::ADD, PtrVT, ThreadPointer, Offset);
+}
+
+SDOperand
+ARMTargetLowering::LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG) {
+  // TODO: implement the "local dynamic" model
+  assert(Subtarget->isTargetELF() &&
+         "TLS not implemented for non-ELF targets");
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  // If the relocation model is PIC, use the "General Dynamic" TLS Model,
+  // otherwise use the "Local Exec" TLS Model
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_)
+    return LowerToTLSGeneralDynamicModel(GA, DAG);
+  else
+    return LowerToTLSExecModels(GA, DAG);
+}
+
+SDOperand ARMTargetLowering::LowerGlobalAddressELF(SDOperand Op,
+                                                   SelectionDAG &DAG) {
+  MVT::ValueType PtrVT = getPointerTy();
+  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+  if (RelocM == Reloc::PIC_) {
+    bool UseGOTOFF = GV->hasInternalLinkage() || GV->hasHiddenVisibility();
+    ARMConstantPoolValue *CPV =
+      new ARMConstantPoolValue(GV, ARMCP::CPValue, UseGOTOFF ? "GOTOFF":"GOT");
+    SDOperand CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 2);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, MVT::i32, CPAddr);
+    SDOperand Result = DAG.getLoad(PtrVT, DAG.getEntryNode(), CPAddr, NULL, 0);
+    SDOperand Chain = Result.getValue(1);
+    SDOperand GOT = DAG.getNode(ISD::GLOBAL_OFFSET_TABLE, PtrVT);
+    Result = DAG.getNode(ISD::ADD, PtrVT, Result, GOT);
+    if (!UseGOTOFF)
+      Result = DAG.getLoad(PtrVT, Chain, Result, NULL, 0);
+    return Result;
+  } else {
+    SDOperand CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 2);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, MVT::i32, CPAddr);
+    return DAG.getLoad(PtrVT, DAG.getEntryNode(), CPAddr, NULL, 0);
+  }
+}
+
+/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol
+/// even in non-static mode.
+static bool GVIsIndirectSymbol(GlobalValue *GV, Reloc::Model RelocM) {
+  return RelocM != Reloc::Static &&
+    (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+     (GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode()));
+}
+
+SDOperand ARMTargetLowering::LowerGlobalAddressDarwin(SDOperand Op,
+                                                      SelectionDAG &DAG) {
+  MVT::ValueType PtrVT = getPointerTy();
+  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+  bool IsIndirect = GVIsIndirectSymbol(GV, RelocM);
+  SDOperand CPAddr;
+  if (RelocM == Reloc::Static)
+    CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 2);
+  else {
+    unsigned PCAdj = (RelocM != Reloc::PIC_)
+      ? 0 : (Subtarget->isThumb() ? 4 : 8);
+    ARMCP::ARMCPKind Kind = IsIndirect ? ARMCP::CPNonLazyPtr
+      : ARMCP::CPValue;
+    ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMPCLabelIndex,
+                                                         Kind, PCAdj);
+    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 2);
+  }
+  CPAddr = DAG.getNode(ARMISD::Wrapper, MVT::i32, CPAddr);
+
+  SDOperand Result = DAG.getLoad(PtrVT, DAG.getEntryNode(), CPAddr, NULL, 0);
+  SDOperand Chain = Result.getValue(1);
+
+  if (RelocM == Reloc::PIC_) {
+    SDOperand PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
+    Result = DAG.getNode(ARMISD::PIC_ADD, PtrVT, Result, PICLabel);
+  }
+  if (IsIndirect)
+    Result = DAG.getLoad(PtrVT, Chain, Result, NULL, 0);
+
+  return Result;
+}
+
+SDOperand ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDOperand Op,
+                                                      SelectionDAG &DAG){
+  assert(Subtarget->isTargetELF() &&
+         "GLOBAL OFFSET TABLE not implemented for non-ELF targets");
+  MVT::ValueType PtrVT = getPointerTy();
+  unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
+  ARMConstantPoolValue *CPV = new ARMConstantPoolValue("_GLOBAL_OFFSET_TABLE_",
+                                                       ARMPCLabelIndex,
+                                                       ARMCP::CPValue, PCAdj);
+  SDOperand CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 2);
+  CPAddr = DAG.getNode(ARMISD::Wrapper, MVT::i32, CPAddr);
+  SDOperand Result = DAG.getLoad(PtrVT, DAG.getEntryNode(), CPAddr, NULL, 0);
+  SDOperand PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
+  return DAG.getNode(ARMISD::PIC_ADD, PtrVT, Result, PICLabel);
+}
+
+static SDOperand LowerVASTART(SDOperand Op, SelectionDAG &DAG,
+                              unsigned VarArgsFrameIndex) {
+  // vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  SDOperand FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
+  SrcValueSDNode *SV = cast<SrcValueSDNode>(Op.getOperand(2));
+  return DAG.getStore(Op.getOperand(0), FR, Op.getOperand(1), SV->getValue(),
+                      SV->getOffset());
+}
+
+static SDOperand LowerFORMAL_ARGUMENT(SDOperand Op, SelectionDAG &DAG,
+                                      unsigned *vRegs, unsigned ArgNo,
+                                      unsigned &NumGPRs, unsigned &ArgOffset) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MVT::ValueType ObjectVT = Op.getValue(ArgNo).getValueType();
+  SDOperand Root = Op.getOperand(0);
+  std::vector<SDOperand> ArgValues;
+  SSARegMap *RegMap = MF.getSSARegMap();
+
+  static const unsigned GPRArgRegs[] = {
+    ARM::R0, ARM::R1, ARM::R2, ARM::R3
+  };
+
+  unsigned ObjSize;
+  unsigned ObjGPRs;
+  unsigned GPRPad;
+  unsigned StackPad;
+  unsigned Flags = Op.getConstantOperandVal(ArgNo + 3);
+  HowToPassArgument(ObjectVT, NumGPRs, ArgOffset, ObjGPRs,
+                    ObjSize, GPRPad, StackPad, Flags);
+  NumGPRs += GPRPad;
+  ArgOffset += StackPad;
+
+  SDOperand ArgValue;
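+  // Copy the part of the argument passed in r0-r3 out of the incoming
+  // physical registers into virtual registers; a two-GPR argument is
+  // rebuilt into a single i64 / f64 value.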
+  if (ObjGPRs == 1) {
+    unsigned VReg = RegMap->createVirtualRegister(&ARM::GPRRegClass);
+    MF.addLiveIn(GPRArgRegs[NumGPRs], VReg);
+    vRegs[NumGPRs] = VReg;
+    ArgValue = DAG.getCopyFromReg(Root, VReg, MVT::i32);
+    if (ObjectVT == MVT::f32)
+      ArgValue = DAG.getNode(ISD::BIT_CONVERT, MVT::f32, ArgValue);
+  } else if (ObjGPRs == 2) {
+    unsigned VReg = RegMap->createVirtualRegister(&ARM::GPRRegClass);
+    MF.addLiveIn(GPRArgRegs[NumGPRs], VReg);
+    vRegs[NumGPRs] = VReg;
+    ArgValue = DAG.getCopyFromReg(Root, VReg, MVT::i32);
+
+    VReg = RegMap->createVirtualRegister(&ARM::GPRRegClass);
+    MF.addLiveIn(GPRArgRegs[NumGPRs+1], VReg);
+    vRegs[NumGPRs+1] = VReg;
+    SDOperand ArgValue2 = DAG.getCopyFromReg(Root, VReg, MVT::i32);
+
+    if (ObjectVT == MVT::i64)
+      ArgValue = DAG.getNode(ISD::BUILD_PAIR, MVT::i64, ArgValue, ArgValue2);
+    else
+      ArgValue = DAG.getNode(ARMISD::FMDRR, MVT::f64, ArgValue, ArgValue2);
+  }
+  NumGPRs += ObjGPRs;
+
+  if (ObjSize) {
+    // If the argument is actually used, emit a load from the right stack
+    // slot.
+    if (!Op.Val->hasNUsesOfValue(0, ArgNo)) {
+      MachineFrameInfo *MFI = MF.getFrameInfo();
+      int FI = MFI->CreateFixedObject(ObjSize, ArgOffset);
+      SDOperand FIN = DAG.getFrameIndex(FI, MVT::i32);
+      if (ObjGPRs == 0)
+        ArgValue = DAG.getLoad(ObjectVT, Root, FIN, NULL, 0);
+      else {
+        SDOperand ArgValue2 =
+          DAG.getLoad(MVT::i32, Root, FIN, NULL, 0);
+        if (ObjectVT == MVT::i64)
+          ArgValue= DAG.getNode(ISD::BUILD_PAIR, MVT::i64, ArgValue, ArgValue2);
+        else
+          ArgValue= DAG.getNode(ARMISD::FMDRR, MVT::f64, ArgValue, ArgValue2);
+      }
+    } else {
+      // Don't emit a dead load.
+      ArgValue = DAG.getNode(ISD::UNDEF, ObjectVT);
+    }
+
+    ArgOffset += ObjSize;   // Move on to the next argument.
+  }
+
+  return ArgValue;
+}
+
+SDOperand
+ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG) {
+  std::vector<SDOperand> ArgValues;
+  SDOperand Root = Op.getOperand(0);
+  unsigned ArgOffset = 0;   // Frame mechanisms handle retaddr slot
+  unsigned NumGPRs = 0;     // GPRs used for parameter passing.
+  unsigned VRegs[4];
+
+  unsigned NumArgs = Op.Val->getNumValues()-1;
+  for (unsigned ArgNo = 0; ArgNo < NumArgs; ++ArgNo)
+    ArgValues.push_back(LowerFORMAL_ARGUMENT(Op, DAG, VRegs, ArgNo,
+                                             NumGPRs, ArgOffset));
+
+  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+  if (isVarArg) {
+    static const unsigned GPRArgRegs[] = {
+      ARM::R0, ARM::R1, ARM::R2, ARM::R3
+    };
+
+    MachineFunction &MF = DAG.getMachineFunction();
+    SSARegMap *RegMap = MF.getSSARegMap();
+    MachineFrameInfo *MFI = MF.getFrameInfo();
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+    unsigned VARegSize = (4 - NumGPRs) * 4;
+    unsigned VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1);
+    if (VARegSaveSize) {
+      // If this function is vararg, store any remaining integer argument regs
+      // to their spots on the stack so that they may be loaded by dereferencing
+      // the result of va_next.
+      AFI->setVarArgsRegSaveSize(VARegSaveSize);
+      VarArgsFrameIndex = MFI->CreateFixedObject(VARegSaveSize, ArgOffset +
+                                                 VARegSaveSize - VARegSize);
+      SDOperand FIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+
+      SmallVector<SDOperand, 4> MemOps;
+      for (; NumGPRs < 4; ++NumGPRs) {
+        unsigned VReg = RegMap->createVirtualRegister(&ARM::GPRRegClass);
+        MF.addLiveIn(GPRArgRegs[NumGPRs], VReg);
+        SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::i32);
+        SDOperand Store = DAG.getStore(Val.getValue(1), Val, FIN, NULL, 0);
+        MemOps.push_back(Store);
+        FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
+                          DAG.getConstant(4, getPointerTy()));
+      }
+      if (!MemOps.empty())
+        Root = DAG.getNode(ISD::TokenFactor, MVT::Other,
+                           &MemOps[0], MemOps.size());
+    } else
+      // This will point to the next argument passed via stack.
+      VarArgsFrameIndex = MFI->CreateFixedObject(4, ArgOffset);
+  }
+
+  ArgValues.push_back(Root);
+
+  // Return the new list of results.
+  std::vector<MVT::ValueType> RetVT(Op.Val->value_begin(),
+                                    Op.Val->value_end());
+  return DAG.getNode(ISD::MERGE_VALUES, RetVT, &ArgValues[0], ArgValues.size());
+}
+
+/// isFloatingPointZero - Return true if this is +0.0.
+static bool isFloatingPointZero(SDOperand Op) {
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
+    return CFP->isExactlyValue(0.0);
+  else if (ISD::isEXTLoad(Op.Val) || ISD::isNON_EXTLoad(Op.Val)) {
+    // Maybe this has already been legalized into the constant pool?
+    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
+      SDOperand WrapperOp = Op.getOperand(1).getOperand(0);
+      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
+        if (ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
+          return CFP->isExactlyValue(0.0);
+    }
+  }
+  return false;
+}
+
+static bool isLegalCmpImmediate(unsigned C, bool isThumb) {
+  return ( isThumb && (C & ~255U) == 0) ||
+         (!isThumb && ARM_AM::getSOImmVal(C) != -1);
+}
+
+/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
+/// the given operands.
+static SDOperand getARMCmp(SDOperand LHS, SDOperand RHS, ISD::CondCode CC,
+                           SDOperand &ARMCC, SelectionDAG &DAG, bool isThumb) {
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.Val)) {
+    unsigned C = RHSC->getValue();
+    if (!isLegalCmpImmediate(C, isThumb)) {
+      // The constant doesn't fit in the immediate field; try adjusting it by
+      // one and flipping the comparison (e.g. x < C becomes x <= C-1) so the
+      // new constant may fit.
+      switch (CC) {
+      default: break;
+      case ISD::SETLT:
+      case ISD::SETGE:
+        if (isLegalCmpImmediate(C-1, isThumb)) {
+          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
+          RHS = DAG.getConstant(C-1, MVT::i32);
+        }
+        break;
+      case ISD::SETULT:
+      case ISD::SETUGE:
+        if (C > 0 && isLegalCmpImmediate(C-1, isThumb)) {
+          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
+          RHS = DAG.getConstant(C-1, MVT::i32);
+        }
+        break;
+      case ISD::SETLE:
+      case ISD::SETGT:
+        if (isLegalCmpImmediate(C+1, isThumb)) {
+          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
+          RHS = DAG.getConstant(C+1, MVT::i32);
+        }
+        break;
+      case ISD::SETULE:
+      case ISD::SETUGT:
+        if (C < 0xffffffff && isLegalCmpImmediate(C+1, isThumb)) {
+          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+          RHS = DAG.getConstant(C+1, MVT::i32);
+        }
+        break;
+      }
+    }
+  }
+
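+  // EQ, NE, MI and PL only read the N and Z flags, so the comparison can be
+  // emitted as a CMPNZ node; all other conditions need a full CMP.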
+  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+  ARMISD::NodeType CompareType;
+  switch (CondCode) {
+  default:
+    CompareType = ARMISD::CMP;
+    break;
+  case ARMCC::EQ:
+  case ARMCC::NE:
+  case ARMCC::MI:
+  case ARMCC::PL:
+    // Uses only N and Z Flags
+    CompareType = ARMISD::CMPNZ;
+    break;
+  }
+  ARMCC = DAG.getConstant(CondCode, MVT::i32);
+  return DAG.getNode(CompareType, MVT::Flag, LHS, RHS);
+}
+
+/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
+static SDOperand getVFPCmp(SDOperand LHS, SDOperand RHS, SelectionDAG &DAG) {
+  SDOperand Cmp;
+  if (!isFloatingPointZero(RHS))
+    Cmp = DAG.getNode(ARMISD::CMPFP, MVT::Flag, LHS, RHS);
+  else
+    Cmp = DAG.getNode(ARMISD::CMPFPw0, MVT::Flag, LHS);
+  return DAG.getNode(ARMISD::FMSTAT, MVT::Flag, Cmp);
+}
+
+static SDOperand LowerSELECT_CC(SDOperand Op, SelectionDAG &DAG,
+                                const ARMSubtarget *ST) {
+  MVT::ValueType VT = Op.getValueType();
+  SDOperand LHS = Op.getOperand(0);
+  SDOperand RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDOperand TrueVal = Op.getOperand(2);
+  SDOperand FalseVal = Op.getOperand(3);
+
+  if (LHS.getValueType() == MVT::i32) {
+    SDOperand ARMCC;
+    SDOperand CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+    SDOperand Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb());
+    return DAG.getNode(ARMISD::CMOV, VT, FalseVal, TrueVal, ARMCC, CCR, Cmp);
+  }
+
+  ARMCC::CondCodes CondCode, CondCode2;
+  if (FPCCToARMCC(CC, CondCode, CondCode2))
+    std::swap(TrueVal, FalseVal);
+
+  SDOperand ARMCC = DAG.getConstant(CondCode, MVT::i32);
+  SDOperand CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDOperand Cmp = getVFPCmp(LHS, RHS, DAG);
+  SDOperand Result = DAG.getNode(ARMISD::CMOV, VT, FalseVal, TrueVal,
+                                 ARMCC, CCR, Cmp);
+  if (CondCode2 != ARMCC::AL) {
+    SDOperand ARMCC2 = DAG.getConstant(CondCode2, MVT::i32);
+    // FIXME: Needs another CMP because flag can have but one use.
+    SDOperand Cmp2 = getVFPCmp(LHS, RHS, DAG);
+    Result = DAG.getNode(ARMISD::CMOV, VT, Result, TrueVal, ARMCC2, CCR, Cmp2);
+  }
+  return Result;
+}
+
+static SDOperand LowerBR_CC(SDOperand Op, SelectionDAG &DAG,
+                            const ARMSubtarget *ST) {
+  SDOperand  Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDOperand    LHS = Op.getOperand(2);
+  SDOperand    RHS = Op.getOperand(3);
+  SDOperand   Dest = Op.getOperand(4);
+
+  if (LHS.getValueType() == MVT::i32) {
+    SDOperand ARMCC;
+    SDOperand CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+    SDOperand Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb());
+    return DAG.getNode(ARMISD::BRCOND, MVT::Other, Chain, Dest, ARMCC, CCR,Cmp);
+  }
+
+  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+  ARMCC::CondCodes CondCode, CondCode2;
+  if (FPCCToARMCC(CC, CondCode, CondCode2))
+    // Swap the LHS/RHS of the comparison if needed.
+    std::swap(LHS, RHS);
+  
+  SDOperand Cmp = getVFPCmp(LHS, RHS, DAG);
+  SDOperand ARMCC = DAG.getConstant(CondCode, MVT::i32);
+  SDOperand CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag);
+  SDOperand Ops[] = { Chain, Dest, ARMCC, CCR, Cmp };
+  SDOperand Res = DAG.getNode(ARMISD::BRCOND, VTList, Ops, 5);
+  if (CondCode2 != ARMCC::AL) {
+    ARMCC = DAG.getConstant(CondCode2, MVT::i32);
+    SDOperand Ops[] = { Res, Dest, ARMCC, CCR, Res.getValue(1) };
+    Res = DAG.getNode(ARMISD::BRCOND, VTList, Ops, 5);
+  }
+  return Res;
+}
+
+SDOperand ARMTargetLowering::LowerBR_JT(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand Table = Op.getOperand(1);
+  SDOperand Index = Op.getOperand(2);
+
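+  // Jump table entries are 4 bytes each, hence the Index * 4 scaling below.
+  // With PIC, the entries hold offsets relative to the table itself, so the
+  // loaded value is added back to the table address.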
+  MVT::ValueType PTy = getPointerTy();
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
+  ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
+  SDOperand UId =  DAG.getConstant(AFI->createJumpTableUId(), PTy);
+  SDOperand JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
+  Table = DAG.getNode(ARMISD::WrapperJT, MVT::i32, JTI, UId);
+  Index = DAG.getNode(ISD::MUL, PTy, Index, DAG.getConstant(4, PTy));
+  SDOperand Addr = DAG.getNode(ISD::ADD, PTy, Index, Table);
+  bool isPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+  Addr = DAG.getLoad(isPIC ? (MVT::ValueType)MVT::i32 : PTy,
+                     Chain, Addr, NULL, 0);
+  Chain = Addr.getValue(1);
+  if (isPIC)
+    Addr = DAG.getNode(ISD::ADD, PTy, Addr, Table);
+  return DAG.getNode(ARMISD::BR_JT, MVT::Other, Chain, Addr, JTI, UId);
+}
+
+static SDOperand LowerFP_TO_INT(SDOperand Op, SelectionDAG &DAG) {
+  unsigned Opc =
+    Op.getOpcode() == ISD::FP_TO_SINT ? ARMISD::FTOSI : ARMISD::FTOUI;
+  Op = DAG.getNode(Opc, MVT::f32, Op.getOperand(0));
+  return DAG.getNode(ISD::BIT_CONVERT, MVT::i32, Op);
+}
+
+static SDOperand LowerINT_TO_FP(SDOperand Op, SelectionDAG &DAG) {
+  MVT::ValueType VT = Op.getValueType();
+  unsigned Opc =
+    Op.getOpcode() == ISD::SINT_TO_FP ? ARMISD::SITOF : ARMISD::UITOF;
+
+  Op = DAG.getNode(ISD::BIT_CONVERT, MVT::f32, Op.getOperand(0));
+  return DAG.getNode(Opc, VT, Op);
+}
+
+static SDOperand LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG) {
+  // Implement fcopysign with a fabs and a conditional fneg.
+  SDOperand Tmp0 = Op.getOperand(0);
+  SDOperand Tmp1 = Op.getOperand(1);
+  MVT::ValueType VT = Op.getValueType();
+  MVT::ValueType SrcVT = Tmp1.getValueType();
+  SDOperand AbsVal = DAG.getNode(ISD::FABS, VT, Tmp0);
+  SDOperand Cmp = getVFPCmp(Tmp1, DAG.getConstantFP(0.0, SrcVT), DAG);
+  SDOperand ARMCC = DAG.getConstant(ARMCC::LT, MVT::i32);
+  SDOperand CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  return DAG.getNode(ARMISD::CNEG, VT, AbsVal, AbsVal, ARMCC, CCR, Cmp);
+}
+
+static SDOperand LowerBIT_CONVERT(SDOperand Op, SelectionDAG &DAG) {
+  // Turn f64->i64 into FMRRD.
+  assert(Op.getValueType() == MVT::i64 &&
+         Op.getOperand(0).getValueType() == MVT::f64);
+
+  Op = Op.getOperand(0);
+  SDOperand Cvt = DAG.getNode(ARMISD::FMRRD, DAG.getVTList(MVT::i32, MVT::i32),
+                              &Op, 1);
+  
+  // Merge the pieces into a single i64 value.
+  return DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Cvt, Cvt.getValue(1));
+}
+
+static SDOperand LowerMUL(SDOperand Op, SelectionDAG &DAG) {
+  // FIXME: All this code is target-independent.  Create a new target-indep
+  // MULHILO node and move this code to the legalizer.
+  //
+  assert(Op.getValueType() == MVT::i64 && "Only handles i64 expand right now!");
+  
+  SDOperand LL = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op.getOperand(0),
+                             DAG.getConstant(0, MVT::i32));
+  SDOperand RL = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op.getOperand(1),
+                             DAG.getConstant(0, MVT::i32));
+
+  unsigned LHSSB = DAG.ComputeNumSignBits(Op.getOperand(0));
+  unsigned RHSSB = DAG.ComputeNumSignBits(Op.getOperand(1));
+  
+  SDOperand Lo, Hi;
+  // Figure out how to lower this multiply.
+  if (LHSSB >= 33 && RHSSB >= 33) {
+    // If the input values are both sign extended, we can emit a mulhs+mul.
+    Lo = DAG.getNode(ISD::MUL, MVT::i32, LL, RL);
+    Hi = DAG.getNode(ISD::MULHS, MVT::i32, LL, RL);
+  } else if (LHSSB == 32 && RHSSB == 32 &&
+             DAG.MaskedValueIsZero(Op.getOperand(0), 0xFFFFFFFF00000000ULL) &&
+             DAG.MaskedValueIsZero(Op.getOperand(1), 0xFFFFFFFF00000000ULL)) {
+    // If the inputs are zero extended, use mulhu.
+    Lo = DAG.getNode(ISD::MUL, MVT::i32, LL, RL);
+    Hi = DAG.getNode(ISD::MULHU, MVT::i32, LL, RL);
+  } else {
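+    // General case: split both operands into 32-bit halves.  The low 64 bits
+    // of the product are umull(LL, RL) plus the two cross products
+    // (LL*RH + LH*RL) added into the high word.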
+    SDOperand LH = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op.getOperand(0),
+                               DAG.getConstant(1, MVT::i32));
+    SDOperand RH = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op.getOperand(1),
+                               DAG.getConstant(1, MVT::i32));
+  
+    // Lo,Hi = umul LHS, RHS.
+    SDOperand Ops[] = { LL, RL };
+    SDOperand UMul64 = DAG.getNode(ARMISD::MULHILOU,
+                                   DAG.getVTList(MVT::i32, MVT::i32), Ops, 2);
+    Lo = UMul64;
+    Hi = UMul64.getValue(1);
+    RH = DAG.getNode(ISD::MUL, MVT::i32, LL, RH);
+    LH = DAG.getNode(ISD::MUL, MVT::i32, LH, RL);
+    Hi = DAG.getNode(ISD::ADD, MVT::i32, Hi, RH);
+    Hi = DAG.getNode(ISD::ADD, MVT::i32, Hi, LH);
+  }
+  
+  // Merge the pieces into a single i64 value.
+  return DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Lo, Hi);
+}
+
+static SDOperand LowerMULHU(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand Ops[] = { Op.getOperand(0), Op.getOperand(1) };
+  return DAG.getNode(ARMISD::MULHILOU,
+                     DAG.getVTList(MVT::i32, MVT::i32), Ops, 2).getValue(1);
+}
+
+static SDOperand LowerMULHS(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand Ops[] = { Op.getOperand(0), Op.getOperand(1) };
+  return DAG.getNode(ARMISD::MULHILOS,
+                     DAG.getVTList(MVT::i32, MVT::i32), Ops, 2).getValue(1);
+}
+
+static SDOperand LowerSRx(SDOperand Op, SelectionDAG &DAG,
+                          const ARMSubtarget *ST) {
+  assert(Op.getValueType() == MVT::i64 &&
+         (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) &&
+         "Unknown shift to lower!");
+  
+  // We only lower SRA, SRL of 1 here, all others use generic lowering.
+  if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
+      cast<ConstantSDNode>(Op.getOperand(1))->getValue() != 1)
+    return SDOperand();
+  
+  // If we are in thumb mode, we don't have RRX.
+  if (ST->isThumb()) return SDOperand();
+  
+  // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
+  SDOperand Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op.getOperand(0),
+                             DAG.getConstant(0, MVT::i32));
+  SDOperand Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op.getOperand(0),
+                             DAG.getConstant(1, MVT::i32));
+
+  // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
+  // captures the result into a carry flag.
+  unsigned Opc = Op.getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
+  Hi = DAG.getNode(Opc, DAG.getVTList(MVT::i32, MVT::Flag), &Hi, 1);
+  
+  // The low part is an ARMISD::RRX operand, which shifts the carry in.
+  Lo = DAG.getNode(ARMISD::RRX, MVT::i32, Lo, Hi.getValue(1));
+  
+  // Merge the pieces into a single i64 value.
+  return DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Lo, Hi);
+}
+
+SDOperand ARMTargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand Dest = Op.getOperand(1);
+  SDOperand Src = Op.getOperand(2);
+  SDOperand Count = Op.getOperand(3);
+  unsigned Align =
+    (unsigned)cast<ConstantSDNode>(Op.getOperand(4))->getValue();
+  if (Align == 0) Align = 1;
+
+  ConstantSDNode *I = dyn_cast<ConstantSDNode>(Count);
+  // Just call memcpy if:
+  //   - the operands are not 4-byte aligned,
+  //   - the size is unknown or not a multiple of 4, or
+  //   - the size is >= the threshold (64 bytes).
+  if ((Align & 3) != 0 || 
+       !I ||
+       I->getValue() >= 64 ||
+       (I->getValue() & 3) != 0) {
+    MVT::ValueType IntPtr = getPointerTy();
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = getTargetData()->getIntPtrType();
+    Entry.Node = Op.getOperand(1); Args.push_back(Entry);
+    Entry.Node = Op.getOperand(2); Args.push_back(Entry);
+    Entry.Node = Op.getOperand(3); Args.push_back(Entry);
+    std::pair<SDOperand,SDOperand> CallResult =
+      LowerCallTo(Chain, Type::VoidTy, false, false, CallingConv::C, false,
+                  DAG.getExternalSymbol("memcpy", IntPtr), Args, DAG);
+    return CallResult.second;
+  }
+
+  // Otherwise do repeated 4-byte loads and stores.  To be improved.
+  assert((I->getValue() & 3) == 0);
+  assert((Align & 3) == 0);
+  unsigned NumMemOps = I->getValue() >> 2;
+  unsigned EmittedNumMemOps = 0;
+  unsigned SrcOff = 0, DstOff = 0;
+  MVT::ValueType VT = MVT::i32;
+  unsigned VTSize = 4;
+  const unsigned MAX_LOADS_IN_LDM = 6;
+  SDOperand LoadChains[MAX_LOADS_IN_LDM];
+  SDOperand Loads[MAX_LOADS_IN_LDM];
+
+  // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
+  // same number of stores.  The loads and stores will get combined into
+  // ldm/stm later on.
+  while(EmittedNumMemOps < NumMemOps) {
+    unsigned i;
+    for (i=0; i<MAX_LOADS_IN_LDM && EmittedNumMemOps+i < NumMemOps; i++) {
+      Loads[i] = DAG.getLoad(VT, Chain,
+                             DAG.getNode(ISD::ADD, VT, Src, 
+                                         DAG.getConstant(SrcOff, VT)),
+                             NULL, 0);
+      LoadChains[i] = Loads[i].getValue(1);
+      SrcOff += VTSize;
+    }
+
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &LoadChains[0], i);
+
+    for (i=0; i<MAX_LOADS_IN_LDM && EmittedNumMemOps+i < NumMemOps; i++) {
+      Chain = DAG.getStore(Chain, Loads[i],
+                           DAG.getNode(ISD::ADD, VT, Dest, 
+                                       DAG.getConstant(DstOff, VT)),
+                           NULL, 0);
+      DstOff += VTSize;
+    }
+    EmittedNumMemOps += i;
+  }
+
+  return Chain;
+}
+
+SDOperand ARMTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
+  switch (Op.getOpcode()) {
+  default: assert(0 && "Don't know how to custom lower this!"); abort();
+  case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
+  case ISD::GlobalAddress:
+    return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) :
+      LowerGlobalAddressELF(Op, DAG);
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::CALL:          return LowerCALL(Op, DAG);
+  case ISD::RET:           return LowerRET(Op, DAG);
+  case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG, Subtarget);
+  case ISD::BR_CC:         return LowerBR_CC(Op, DAG, Subtarget);
+  case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
+  case ISD::VASTART:       return LowerVASTART(Op, DAG, VarArgsFrameIndex);
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
+  case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
+  case ISD::BIT_CONVERT:   return LowerBIT_CONVERT(Op, DAG);
+  case ISD::MUL:           return LowerMUL(Op, DAG);
+  case ISD::MULHU:         return LowerMULHU(Op, DAG);
+  case ISD::MULHS:         return LowerMULHS(Op, DAG);
+  case ISD::SRL:
+  case ISD::SRA:           return LowerSRx(Op, DAG, Subtarget);
+  case ISD::FORMAL_ARGUMENTS:
+    return LowerFORMAL_ARGUMENTS(Op, DAG);
+  case ISD::RETURNADDR:    break;
+  case ISD::FRAMEADDR:     break;
+  case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
+  case ISD::MEMCPY:        return LowerMEMCPY(Op, DAG);
+  }
+  return SDOperand();
+}
+
+//===----------------------------------------------------------------------===//
+//                           ARM Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+ARMTargetLowering::InsertAtEndOfBasicBlock(MachineInstr *MI,
+                                           MachineBasicBlock *BB) {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  switch (MI->getOpcode()) {
+  default: assert(false && "Unexpected instr type to insert");
+  case ARM::tMOVCCr: {
+    // To "insert" a SELECT_CC instruction, we actually have to insert the
+    // diamond control-flow pattern.  The incoming instruction knows the
+    // destination vreg to set, the condition code register to branch on, the
+    // true/false values to select between, and a branch opcode to use.
+    const BasicBlock *LLVM_BB = BB->getBasicBlock();
+    ilist<MachineBasicBlock>::iterator It = BB;
+    ++It;
+
+    //  thisMBB:
+    //  ...
+    //   TrueVal = ...
+    //   cmpTY ccX, r1, r2
+    //   bCC copy1MBB
+    //   fallthrough --> copy0MBB
+    MachineBasicBlock *thisMBB  = BB;
+    MachineBasicBlock *copy0MBB = new MachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *sinkMBB  = new MachineBasicBlock(LLVM_BB);
+    BuildMI(BB, TII->get(ARM::tBcc)).addMBB(sinkMBB)
+      .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg());
+    MachineFunction *F = BB->getParent();
+    F->getBasicBlockList().insert(It, copy0MBB);
+    F->getBasicBlockList().insert(It, sinkMBB);
+    // Update machine-CFG edges by first adding all successors of the current
+    // block to the new block which will contain the Phi node for the select.
+    for(MachineBasicBlock::succ_iterator i = BB->succ_begin(),
+        e = BB->succ_end(); i != e; ++i)
+      sinkMBB->addSuccessor(*i);
+    // Next, remove all successors of the current block, and add the true
+    // and fallthrough blocks as its successors.
+    while(!BB->succ_empty())
+      BB->removeSuccessor(BB->succ_begin());
+    BB->addSuccessor(copy0MBB);
+    BB->addSuccessor(sinkMBB);
+
+    //  copy0MBB:
+    //   %FalseValue = ...
+    //   # fallthrough to sinkMBB
+    BB = copy0MBB;
+
+    // Update machine-CFG edges
+    BB->addSuccessor(sinkMBB);
+
+    //  sinkMBB:
+    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+    //  ...
+    BB = sinkMBB;
+    BuildMI(BB, TII->get(ARM::PHI), MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+    delete MI;   // The pseudo instruction is gone now.
+    return BB;
+  }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           ARM Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+/// isLegalAddressImmediate - Return true if the integer value can be used
+/// as the offset of the target addressing mode for load / store of the
+/// given type.
+static bool isLegalAddressImmediate(int64_t V, MVT::ValueType VT,
+                                    const ARMSubtarget *Subtarget) {
+  if (V == 0)
+    return true;
+
+  if (Subtarget->isThumb()) {
+    if (V < 0)
+      return false;
+
+    unsigned Scale = 1;
+    switch (VT) {
+    default: return false;
+    case MVT::i1:
+    case MVT::i8:
+      // Scale == 1;
+      break;
+    case MVT::i16:
+      // Scale == 2;
+      Scale = 2;
+      break;
+    case MVT::i32:
+      // Scale == 4;
+      Scale = 4;
+      break;
+    }
+
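+    // Thumb load/store immediates are 5-bit offsets scaled by the access
+    // size, so the value must be a multiple of Scale and fit in 5 bits
+    // after scaling.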
+    if ((V & (Scale - 1)) != 0)
+      return false;
+    V /= Scale;
+    return V == (V & ((1LL << 5) - 1));
+  }
+
+  if (V < 0)
+    V = -V;
+  switch (VT) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i32:
+    // +- imm12
+    return V == (V & ((1LL << 12) - 1));
+  case MVT::i16:
+    // +- imm8
+    return V == (V & ((1LL << 8) - 1));
+  case MVT::f32:
+  case MVT::f64:
+    if (!Subtarget->hasVFP2())
+      return false;
+    if ((V & 3) != 0)
+      return false;
+    V >>= 2;
+    return V == (V & ((1LL << 8) - 1));
+  }
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM, 
+                                              const Type *Ty) const {
+  if (!isLegalAddressImmediate(AM.BaseOffs, getValueType(Ty), Subtarget))
+    return false;
+  
+  // Can never fold addr of global into load/store.
+  if (AM.BaseGV) 
+    return false;
+  
+  switch (AM.Scale) {
+  case 0:  // no scale reg, must be "r+i" or "r", or "i".
+    break;
+  case 1:
+    if (Subtarget->isThumb())
+      return false;
+    // FALL THROUGH.
+  default:
+    // ARM doesn't support any R+R*scale+imm addr modes.
+    if (AM.BaseOffs)
+      return false;
+    
+    int Scale = AM.Scale;
+    switch (getValueType(Ty)) {
+    default: return false;
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i32:
+    case MVT::i64:
+      // This assumes i64 is legalized to a pair of i32s. If not (i.e. ldrd /
+      // strd are used), then its address mode is the same as for i16.
+      // r + r
+      if (Scale < 0) Scale = -Scale;
+      if (Scale == 1)
+        return true;
+      // r + r << imm
+      return isPowerOf2_32(Scale & ~1);
+    case MVT::i16:
+      // r + r
+      if (((unsigned)AM.HasBaseReg + Scale) <= 2)
+        return true;
+      return false;
+      
+    case MVT::isVoid:
+      // Note, we allow "void" uses (basically, uses that aren't loads or
+      // stores), because ARM allows folding a scale into many arithmetic
+      // operations.  This should be made more precise and revisited later.
+      
+      // Allow r << imm, but the imm has to be a multiple of two.
+      if (AM.Scale & 1) return false;
+      return isPowerOf2_32(AM.Scale);
+    }
+    break;
+  }
+  return true;
+}
+
+
+static bool getIndexedAddressParts(SDNode *Ptr, MVT::ValueType VT,
+                                   bool isSEXTLoad, SDOperand &Base,
+                                   SDOperand &Offset, bool &isInc,
+                                   SelectionDAG &DAG) {
+  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
+    return false;
+
+  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
+    // AddressingMode 3: base register +/- register, or +/- an 8-bit immediate.
+    Base = Ptr->getOperand(0);
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
+      int RHSC = (int)RHS->getValue();
+      if (RHSC < 0 && RHSC > -256) {
+        isInc = false;
+        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
+        return true;
+      }
+    }
+    isInc = (Ptr->getOpcode() == ISD::ADD);
+    Offset = Ptr->getOperand(1);
+    return true;
+  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
+    // AddressingMode 2: base register +/- (shifted) register, or +/- a 12-bit
+    // immediate.
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
+      int RHSC = (int)RHS->getValue();
+      if (RHSC < 0 && RHSC > -0x1000) {
+        isInc = false;
+        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
+        Base = Ptr->getOperand(0);
+        return true;
+      }
+    }
+
+    if (Ptr->getOpcode() == ISD::ADD) {
+      isInc = true;
+      ARM_AM::ShiftOpc ShOpcVal= ARM_AM::getShiftOpcForNode(Ptr->getOperand(0));
+      if (ShOpcVal != ARM_AM::no_shift) {
+        Base = Ptr->getOperand(1);
+        Offset = Ptr->getOperand(0);
+      } else {
+        Base = Ptr->getOperand(0);
+        Offset = Ptr->getOperand(1);
+      }
+      return true;
+    }
+
+    isInc = (Ptr->getOpcode() == ISD::ADD);
+    Base = Ptr->getOperand(0);
+    Offset = Ptr->getOperand(1);
+    return true;
+  }
+
+  // FIXME: Use FLDM / FSTM to emulate indexed FP load / store.
+  return false;
+}
+
+/// getPreIndexedAddressParts - Returns true, and sets the base pointer,
+/// offset, and addressing mode by reference, if the node's address can be
+/// legally represented as a pre-indexed load / store address.
+bool
+ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDOperand &Base,
+                                             SDOperand &Offset,
+                                             ISD::MemIndexedMode &AM,
+                                             SelectionDAG &DAG) {
+  if (Subtarget->isThumb())
+    return false;
+
+  MVT::ValueType VT;
+  SDOperand Ptr;
+  bool isSEXTLoad = false;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    Ptr = LD->getBasePtr();
+    VT  = LD->getLoadedVT();
+    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    Ptr = ST->getBasePtr();
+    VT  = ST->getStoredVT();
+  } else
+    return false;
+
+  bool isInc;
+  bool isLegal = getIndexedAddressParts(Ptr.Val, VT, isSEXTLoad, Base, Offset,
+                                        isInc, DAG);
+  if (isLegal) {
+    AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
+    return true;
+  }
+  return false;
+}
+
+/// getPostIndexedAddressParts - Returns true, and sets the base pointer,
+/// offset, and addressing mode by reference, if this node can be combined
+/// with a load / store to form a post-indexed load / store.
+bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                                   SDOperand &Base,
+                                                   SDOperand &Offset,
+                                                   ISD::MemIndexedMode &AM,
+                                                   SelectionDAG &DAG) {
+  if (Subtarget->isThumb())
+    return false;
+
+  MVT::ValueType VT;
+  SDOperand Ptr;
+  bool isSEXTLoad = false;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT  = LD->getLoadedVT();
+    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT  = ST->getStoredVT();
+  } else
+    return false;
+
+  bool isInc;
+  bool isLegal = getIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+                                        isInc, DAG);
+  if (isLegal) {
+    AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
+    return true;
+  }
+  return false;
+}
+
+void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDOperand Op,
+                                                       uint64_t Mask,
+                                                       uint64_t &KnownZero, 
+                                                       uint64_t &KnownOne,
+                                                       const SelectionDAG &DAG,
+                                                       unsigned Depth) const {
+  KnownZero = 0;
+  KnownOne = 0;
+  switch (Op.getOpcode()) {
+  default: break;
+  case ARMISD::CMOV: {
+    // Bits are known zero/one if known on the LHS and RHS.
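+    // For example, if the top 24 bits of both operands are known zero, then
+    // the top 24 bits of the CMOV result are known zero as well.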
+    DAG.ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
+    if (KnownZero == 0 && KnownOne == 0) return;
+
+    uint64_t KnownZeroRHS, KnownOneRHS;
+    DAG.ComputeMaskedBits(Op.getOperand(1), Mask,
+                          KnownZeroRHS, KnownOneRHS, Depth+1);
+    KnownZero &= KnownZeroRHS;
+    KnownOne  &= KnownOneRHS;
+    return;
+  }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           ARM Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+ARMTargetLowering::ConstraintType
+ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default:  break;
+    case 'l': return C_RegisterClass;
+    case 'w': return C_RegisterClass;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass*> 
+ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                MVT::ValueType VT) const {
+  if (Constraint.size() == 1) {
+    // GCC ARM Constraint Letters
+    switch (Constraint[0]) {
+    case 'l':
+    // FIXME: in thumb mode, 'l' is only low-regs.
+    // FALL THROUGH.
+    case 'r':
+      return std::make_pair(0U, ARM::GPRRegisterClass);
+    case 'w':
+      if (VT == MVT::f32)
+        return std::make_pair(0U, ARM::SPRRegisterClass);
+      if (VT == MVT::f64)
+        return std::make_pair(0U, ARM::DPRRegisterClass);
+      break;
+    }
+  }
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+std::vector<unsigned> ARMTargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                  MVT::ValueType VT) const {
+  if (Constraint.size() != 1)
+    return std::vector<unsigned>();
+
+  switch (Constraint[0]) {      // GCC ARM Constraint Letters
+  default: break;
+  case 'l':
+  case 'r':
+    return make_vector<unsigned>(ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+                                 ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+                                 ARM::R8, ARM::R9, ARM::R10, ARM::R11,
+                                 ARM::R12, ARM::LR, 0);
+  case 'w':
+    if (VT == MVT::f32)
+      return make_vector<unsigned>(ARM::S0, ARM::S1, ARM::S2, ARM::S3,
+                                   ARM::S4, ARM::S5, ARM::S6, ARM::S7,
+                                   ARM::S8, ARM::S9, ARM::S10, ARM::S11,
+                                   ARM::S12,ARM::S13,ARM::S14,ARM::S15,
+                                   ARM::S16,ARM::S17,ARM::S18,ARM::S19,
+                                   ARM::S20,ARM::S21,ARM::S22,ARM::S23,
+                                   ARM::S24,ARM::S25,ARM::S26,ARM::S27,
+                                   ARM::S28,ARM::S29,ARM::S30,ARM::S31, 0);
+    if (VT == MVT::f64)
+      return make_vector<unsigned>(ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+                                   ARM::D4, ARM::D5, ARM::D6, ARM::D7,
+                                   ARM::D8, ARM::D9, ARM::D10,ARM::D11,
+                                   ARM::D12,ARM::D13,ARM::D14,ARM::D15, 0);
+    break;
+  }
+
+  return std::vector<unsigned>();
+}
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
new file mode 100644
index 0000000..2b66f23
--- /dev/null
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -0,0 +1,144 @@
+//===-- ARMISelLowering.h - ARM DAG Lowering Interface ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that ARM uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMISELLOWERING_H
+#define ARMISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include <vector>
+
+namespace llvm {
+  class ARMConstantPoolValue;
+  class ARMSubtarget;
+
+  namespace ARMISD {
+    // ARM Specific DAG Nodes
+    enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END+ARM::INSTRUCTION_LIST_END,
+
+      Wrapper,      // Wrapper - A wrapper node for TargetConstantPool,
+                    // TargetExternalSymbol, and TargetGlobalAddress.
+      WrapperJT,    // WrapperJT - A wrapper node for TargetJumpTable
+      
+      CALL,         // Function call.
+      CALL_PRED,    // Function call that's predicable.
+      CALL_NOLINK,  // Function call with branch, not branch-and-link.
+      tCALL,        // Thumb function call.
+      BRCOND,       // Conditional branch.
+      BR_JT,        // Jumptable branch.
+      RET_FLAG,     // Return with a flag operand.
+
+      PIC_ADD,      // Add with a PC operand and a PIC label.
+
+      CMP,          // ARM compare instructions.
+      CMPNZ,        // ARM compare that uses only N or Z flags.
+      CMPFP,        // ARM VFP compare instruction, sets FPSCR.
+      CMPFPw0,      // ARM VFP compare against zero instruction, sets FPSCR.
+      FMSTAT,       // ARM fmstat instruction.
+      CMOV,         // ARM conditional move instructions.
+      CNEG,         // ARM conditional negate instructions.
+      
+      FTOSI,        // FP to sint within a FP register.
+      FTOUI,        // FP to uint within a FP register.
+      SITOF,        // sint to FP within a FP register.
+      UITOF,        // uint to FP within a FP register.
+
+      MULHILOU,     // Lo,Hi = umul LHS, RHS.
+      MULHILOS,     // Lo,Hi = smul LHS, RHS.
+      
+      SRL_FLAG,     // V,Flag = srl_flag X -> srl X, 1 + save carry out.
+      SRA_FLAG,     // V,Flag = sra_flag X -> sra X, 1 + save carry out.
+      RRX,          // V = RRX X, Flag     -> srl X, 1 + shift in carry flag.
+      
+      FMRRD,        // Double to two gprs.
+      FMDRR,        // Two gprs to double.
+
+      THREAD_POINTER
+    };
+  }
+
+  //===----------------------------------------------------------------------===//
+  //  ARMTargetLowering - ARM Implementation of the TargetLowering interface
+  
+  class ARMTargetLowering : public TargetLowering {
+    int VarArgsFrameIndex;            // FrameIndex for start of varargs area.
+  public:
+    ARMTargetLowering(TargetMachine &TM);
+
+    virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    virtual MachineBasicBlock *InsertAtEndOfBasicBlock(MachineInstr *MI,
+                                                       MachineBasicBlock *MBB);
+
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+    
+    /// getPreIndexedAddressParts - returns true by value, base pointer and
+    /// offset pointer and addressing mode by reference if the node's address
+    /// can be legally represented as a pre-indexed load / store address.
+    virtual bool getPreIndexedAddressParts(SDNode *N, SDOperand &Base,
+                                           SDOperand &Offset,
+                                           ISD::MemIndexedMode &AM,
+                                           SelectionDAG &DAG);
+
+    /// getPostIndexedAddressParts - returns true by value, base pointer and
+    /// offset pointer and addressing mode by reference if this node can be
+    /// combined with a load / store to form a post-indexed load / store.
+    virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                            SDOperand &Base, SDOperand &Offset,
+                                            ISD::MemIndexedMode &AM,
+                                            SelectionDAG &DAG);
+
+    virtual void computeMaskedBitsForTargetNode(const SDOperand Op,
+                                                uint64_t Mask,
+                                                uint64_t &KnownZero, 
+                                                uint64_t &KnownOne,
+                                                const SelectionDAG &DAG,
+                                                unsigned Depth) const;
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+    std::pair<unsigned, const TargetRegisterClass*> 
+      getRegForInlineAsmConstraint(const std::string &Constraint,
+                                   MVT::ValueType VT) const;
+    std::vector<unsigned>
+    getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                      MVT::ValueType VT) const;
+  private:
+    /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const ARMSubtarget *Subtarget;
+
+    /// ARMPCLabelIndex - Keep track the number of ARM PC labels created.
+    ///
+    unsigned ARMPCLabelIndex;
+
+    SDOperand LowerCALL(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerGlobalAddressDarwin(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerGlobalAddressELF(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+                                            SelectionDAG &DAG);
+    SDOperand LowerToTLSExecModels(GlobalAddressSDNode *GA,
+                                       SelectionDAG &DAG);
+    SDOperand LowerGLOBAL_OFFSET_TABLE(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerBR_JT(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerMEMCPY(SDOperand Op, SelectionDAG &DAG);
+  };
+}
+
+#endif  // ARMISELLOWERING_H
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
new file mode 100644
index 0000000..b404ec0
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -0,0 +1,612 @@
+//===- ARMInstrInfo.cpp - ARM Instruction Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the "Instituto Nokia de Tecnologia" and
+// is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMInstrInfo.h"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMGenInstrInfo.inc"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+static cl::opt<bool> EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
+                                  cl::desc("Enable ARM 2-addr to 3-addr conv"));
+
+ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
+  : TargetInstrInfo(ARMInsts, sizeof(ARMInsts)/sizeof(ARMInsts[0])),
+    RI(*this, STI) {
+}
+
+const TargetRegisterClass *ARMInstrInfo::getPointerRegClass() const {
+  return &ARM::GPRRegClass;
+}
+
+/// Return true if the instruction is a register to register move and
+/// leave the source and dest operands in the passed parameters.
+///
+bool ARMInstrInfo::isMoveInstr(const MachineInstr &MI,
+                               unsigned &SrcReg, unsigned &DstReg) const {
+  MachineOpCode oc = MI.getOpcode();
+  switch (oc) {
+  default:
+    return false;
+  case ARM::FCPYS:
+  case ARM::FCPYD:
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    return true;
+  case ARM::MOVr:
+  case ARM::tMOVr:
+    assert(MI.getInstrDescriptor()->numOperands >= 2 &&
+           MI.getOperand(0).isRegister() &&
+           MI.getOperand(1).isRegister() &&
+           "Invalid ARM MOV instruction");
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    return true;
+  }
+}
+
+unsigned ARMInstrInfo::isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::LDR:
+    if (MI->getOperand(1).isFrameIndex() &&
+        MI->getOperand(2).isReg() &&
+        MI->getOperand(3).isImmediate() && 
+        MI->getOperand(2).getReg() == 0 &&
+        MI->getOperand(3).getImmedValue() == 0) {
+      FrameIndex = MI->getOperand(1).getFrameIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::FLDD:
+  case ARM::FLDS:
+    if (MI->getOperand(1).isFrameIndex() &&
+        MI->getOperand(2).isImmediate() && 
+        MI->getOperand(2).getImmedValue() == 0) {
+      FrameIndex = MI->getOperand(1).getFrameIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::tRestore:
+    if (MI->getOperand(1).isFrameIndex() &&
+        MI->getOperand(2).isImmediate() && 
+        MI->getOperand(2).getImmedValue() == 0) {
+      FrameIndex = MI->getOperand(1).getFrameIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+unsigned ARMInstrInfo::isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::STR:
+    if (MI->getOperand(1).isFrameIndex() &&
+        MI->getOperand(2).isReg() &&
+        MI->getOperand(3).isImmediate() && 
+        MI->getOperand(2).getReg() == 0 &&
+        MI->getOperand(3).getImmedValue() == 0) {
+      FrameIndex = MI->getOperand(1).getFrameIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::FSTD:
+  case ARM::FSTS:
+    if (MI->getOperand(1).isFrameIndex() &&
+        MI->getOperand(2).isImmediate() && 
+        MI->getOperand(2).getImmedValue() == 0) {
+      FrameIndex = MI->getOperand(1).getFrameIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::tSpill:
+    if (MI->getOperand(1).isFrameIndex() &&
+        MI->getOperand(2).isImmediate() && 
+        MI->getOperand(2).getImmedValue() == 0) {
+      FrameIndex = MI->getOperand(1).getFrameIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+static unsigned getUnindexedOpcode(unsigned Opc) {
+  switch (Opc) {
+  default: break;
+  case ARM::LDR_PRE:
+  case ARM::LDR_POST:
+    return ARM::LDR;
+  case ARM::LDRH_PRE:
+  case ARM::LDRH_POST:
+    return ARM::LDRH;
+  case ARM::LDRB_PRE:
+  case ARM::LDRB_POST:
+    return ARM::LDRB;
+  case ARM::LDRSH_PRE:
+  case ARM::LDRSH_POST:
+    return ARM::LDRSH;
+  case ARM::LDRSB_PRE:
+  case ARM::LDRSB_POST:
+    return ARM::LDRSB;
+  case ARM::STR_PRE:
+  case ARM::STR_POST:
+    return ARM::STR;
+  case ARM::STRH_PRE:
+  case ARM::STRH_POST:
+    return ARM::STRH;
+  case ARM::STRB_PRE:
+  case ARM::STRB_POST:
+    return ARM::STRB;
+  }
+  return 0;
+}
+
+MachineInstr *
+ARMInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+                                    MachineBasicBlock::iterator &MBBI,
+                                    LiveVariables &LV) const {
+  if (!EnableARM3Addr)
+    return NULL;
+
+  MachineInstr *MI = MBBI;
+  unsigned TSFlags = MI->getInstrDescriptor()->TSFlags;
+  bool isPre = false;
+  switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) {
+  default: return NULL;
+  case ARMII::IndexModePre:
+    isPre = true;
+    break;
+  case ARMII::IndexModePost:
+    break;
+  }
+
+  // Try splitting an indexed load / store into an unindexed one plus an add/sub
+  // operation.
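+  // For example, a pre-indexed "ldr r0, [r1, #4]!" becomes
+  // "add r1, r1, #4" followed by "ldr r0, [r1]".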
+  unsigned MemOpc = getUnindexedOpcode(MI->getOpcode());
+  if (MemOpc == 0)
+    return NULL;
+
+  MachineInstr *UpdateMI = NULL;
+  MachineInstr *MemMI = NULL;
+  unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
+  const TargetInstrDescriptor *TID = MI->getInstrDescriptor();
+  unsigned NumOps = TID->numOperands;
+  bool isLoad = (TID->Flags & M_LOAD_FLAG) != 0;
+  const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0);
+  const MachineOperand &Base = MI->getOperand(2);
+  const MachineOperand &Offset = MI->getOperand(NumOps-3);
+  unsigned WBReg = WB.getReg();
+  unsigned BaseReg = Base.getReg();
+  unsigned OffReg = Offset.getReg();
+  unsigned OffImm = MI->getOperand(NumOps-2).getImm();
+  ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm();
+  switch (AddrMode) {
+  default:
+    assert(false && "Unknown indexed op!");
+    return NULL;
+  case ARMII::AddrMode2: {
+    bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
+    unsigned Amt = ARM_AM::getAM2Offset(OffImm);
+    if (OffReg == 0) {
+      int SOImmVal = ARM_AM::getSOImmVal(Amt);
+      if (SOImmVal == -1)
+        // Can't encode it in a so_imm operand. This transformation will
+        // add more than 1 instruction. Abandon!
+        return NULL;
+      UpdateMI = BuildMI(get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+        .addReg(BaseReg).addImm(SOImmVal)
+        .addImm(Pred).addReg(0).addReg(0);
+    } else if (Amt != 0) {
+      ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
+      unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
+      UpdateMI = BuildMI(get(isSub ? ARM::SUBrs : ARM::ADDrs), WBReg)
+        .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc)
+        .addImm(Pred).addReg(0).addReg(0);
+    } else 
+      UpdateMI = BuildMI(get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+        .addReg(BaseReg).addReg(OffReg)
+        .addImm(Pred).addReg(0).addReg(0);
+    break;
+  }
+  case ARMII::AddrMode3 : {
+    bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub;
+    unsigned Amt = ARM_AM::getAM3Offset(OffImm);
+    if (OffReg == 0)
+      // The immediate is 8 bits; it's guaranteed to fit in a so_imm operand.
+      UpdateMI = BuildMI(get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+        .addReg(BaseReg).addImm(Amt)
+        .addImm(Pred).addReg(0).addReg(0);
+    else
+      UpdateMI = BuildMI(get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+        .addReg(BaseReg).addReg(OffReg)
+        .addImm(Pred).addReg(0).addReg(0);
+    break;
+  }
+  }
+
+  std::vector<MachineInstr*> NewMIs;
+  if (isPre) {
+    if (isLoad)
+      MemMI = BuildMI(get(MemOpc), MI->getOperand(0).getReg())
+        .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
+    else
+      MemMI = BuildMI(get(MemOpc)).addReg(MI->getOperand(1).getReg())
+        .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
+    NewMIs.push_back(MemMI);
+    NewMIs.push_back(UpdateMI);
+  } else {
+    if (isLoad)
+      MemMI = BuildMI(get(MemOpc), MI->getOperand(0).getReg())
+        .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
+    else
+      MemMI = BuildMI(get(MemOpc)).addReg(MI->getOperand(1).getReg())
+        .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
+    if (WB.isDead())
+      UpdateMI->getOperand(0).setIsDead();
+    NewMIs.push_back(UpdateMI);
+    NewMIs.push_back(MemMI);
+  }
+  
+  // Transfer LiveVariables states, kill / dead info.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (MO.isRegister() && MO.getReg() &&
+        MRegisterInfo::isVirtualRegister(MO.getReg())) {
+      unsigned Reg = MO.getReg();
+      LiveVariables::VarInfo &VI = LV.getVarInfo(Reg);
+      if (MO.isDef()) {
+        MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI;
+        if (MO.isDead())
+          LV.addVirtualRegisterDead(Reg, NewMI);
+        // Update the defining instruction.
+        if (VI.DefInst == MI)
+          VI.DefInst = NewMI;
+      }
+      if (MO.isUse() && MO.isKill()) {
+        for (unsigned j = 0; j < 2; ++j) {
+          // Look at the two new MI's in reverse order.
+          MachineInstr *NewMI = NewMIs[j];
+          int NIdx = NewMI->findRegisterUseOperandIdx(Reg);
+          if (NIdx == -1)
+            continue;
+          LV.addVirtualRegisterKilled(Reg, NewMI);
+          if (VI.removeKill(MI))
+            VI.Kills.push_back(NewMI);
+          break;
+        }
+      }
+    }
+  }
+
+  MFI->insert(MBBI, NewMIs[1]);
+  MFI->insert(MBBI, NewMIs[0]);
+  return NewMIs[0];
+}
+
+// Branch analysis.
+bool ARMInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 std::vector<MachineOperand> &Cond) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+    return false;
+  
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+  
+  // If there is only one terminator instruction, process it.
+  unsigned LastOpc = LastInst->getOpcode();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (LastOpc == ARM::B || LastOpc == ARM::tB) {
+      TBB = LastInst->getOperand(0).getMachineBasicBlock();
+      return false;
+    }
+    if (LastOpc == ARM::Bcc || LastOpc == ARM::tBcc) {
+      // Block ends with fall-through condbranch.
+      TBB = LastInst->getOperand(0).getMachineBasicBlock();
+      Cond.push_back(LastInst->getOperand(1));
+      Cond.push_back(LastInst->getOperand(2));
+      return false;
+    }
+    return true;  // Can't handle indirect branch.
+  }
+  
+  // Get the instruction before it if it is a terminator.
+  MachineInstr *SecondLastInst = I;
+  
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+    return true;
+  
+  // If the block ends with ARM::B/ARM::tB and an ARM::Bcc/ARM::tBcc, handle it.
+  unsigned SecondLastOpc = SecondLastInst->getOpcode();
+  if ((SecondLastOpc == ARM::Bcc && LastOpc == ARM::B) ||
+      (SecondLastOpc == ARM::tBcc && LastOpc == ARM::tB)) {
+    TBB =  SecondLastInst->getOperand(0).getMachineBasicBlock();
+    Cond.push_back(SecondLastInst->getOperand(1));
+    Cond.push_back(SecondLastInst->getOperand(2));
+    FBB = LastInst->getOperand(0).getMachineBasicBlock();
+    return false;
+  }
+  
+  // If the block ends with two unconditional branches, handle it.  The second 
+  // one is not executed, so remove it.
+  if ((SecondLastOpc == ARM::B || SecondLastOpc==ARM::tB) &&
+      (LastOpc == ARM::B || LastOpc == ARM::tB)) {
+    TBB = SecondLastInst->getOperand(0).getMachineBasicBlock();
+    I = LastInst;
+    I->eraseFromParent();
+    return false;
+  }
+
+  // Likewise if it ends with a jumptable branch followed by an unconditional branch.
+  // The branch folder can create these, and we must get rid of them for
+  // correctness of Thumb constant islands.
+  if ((SecondLastOpc == ARM::BR_JTr || SecondLastOpc==ARM::BR_JTm ||
+       SecondLastOpc == ARM::BR_JTadd || SecondLastOpc==ARM::tBR_JTr) &&
+      (LastOpc == ARM::B || LastOpc == ARM::tB)) {
+    I = LastInst;
+    I->eraseFromParent();
+    return true;
+  } 
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+
+unsigned ARMInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  int BOpc   = AFI->isThumbFunction() ? ARM::tB : ARM::B;
+  int BccOpc = AFI->isThumbFunction() ? ARM::tBcc : ARM::Bcc;
+
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  if (I->getOpcode() != BOpc && I->getOpcode() != BccOpc)
+    return 0;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  
+  I = MBB.end();
+  
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (I->getOpcode() != BccOpc)
+    return 1;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+unsigned ARMInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                                const std::vector<MachineOperand> &Cond) const {
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  int BOpc   = AFI->isThumbFunction() ? ARM::tB : ARM::B;
+  int BccOpc = AFI->isThumbFunction() ? ARM::tBcc : ARM::Bcc;
+
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "ARM branch conditions have two components!");
+  
+  if (FBB == 0) {
+    if (Cond.empty()) // Unconditional branch?
+      BuildMI(&MBB, get(BOpc)).addMBB(TBB);
+    else
+      BuildMI(&MBB, get(BccOpc)).addMBB(TBB)
+        .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
+    return 1;
+  }
+  
+  // Two-way conditional branch.
+  BuildMI(&MBB, get(BccOpc)).addMBB(TBB)
+    .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
+  BuildMI(&MBB, get(BOpc)).addMBB(FBB);
+  return 2;
+}
+
+bool ARMInstrInfo::BlockHasNoFallThrough(MachineBasicBlock &MBB) const {
+  if (MBB.empty()) return false;
+  
+  switch (MBB.back().getOpcode()) {
+  case ARM::BX_RET:   // Return.
+  case ARM::LDM_RET:
+  case ARM::tBX_RET:
+  case ARM::tBX_RET_vararg:
+  case ARM::tPOP_RET:
+  case ARM::B:
+  case ARM::tB:       // Uncond branch.
+  case ARM::tBR_JTr:
+  case ARM::BR_JTr:   // Jumptable branch.
+  case ARM::BR_JTm:   // Jumptable branch through mem.
+  case ARM::BR_JTadd: // Jumptable branch add to pc.
+    return true;
+  default: return false;
+  }
+}
+
+bool ARMInstrInfo::
+ReverseBranchCondition(std::vector<MachineOperand> &Cond) const {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm();
+  Cond[0].setImm(ARMCC::getOppositeCondition(CC));
+  return false;
+}
+
+bool ARMInstrInfo::isPredicated(const MachineInstr *MI) const {
+  int PIdx = MI->findFirstPredOperandIdx();
+  return PIdx != -1 && MI->getOperand(PIdx).getImmedValue() != ARMCC::AL;
+}
+
+bool ARMInstrInfo::PredicateInstruction(MachineInstr *MI,
+                                const std::vector<MachineOperand> &Pred) const {
+  unsigned Opc = MI->getOpcode();
+  if (Opc == ARM::B || Opc == ARM::tB) {
+    MI->setInstrDescriptor(get(Opc == ARM::B ? ARM::Bcc : ARM::tBcc));
+    MI->addImmOperand(Pred[0].getImmedValue());
+    MI->addRegOperand(Pred[1].getReg(), false);
+    return true;
+  }
+
+  int PIdx = MI->findFirstPredOperandIdx();
+  if (PIdx != -1) {
+    MachineOperand &PMO = MI->getOperand(PIdx);
+    PMO.setImm(Pred[0].getImmedValue());
+    MI->getOperand(PIdx+1).setReg(Pred[1].getReg());
+    return true;
+  }
+  return false;
+}
+
+bool
+ARMInstrInfo::SubsumesPredicate(const std::vector<MachineOperand> &Pred1,
+                                const std::vector<MachineOperand> &Pred2) const{
+  if (Pred1.size() > 2 || Pred2.size() > 2)
+    return false;
+
+  ARMCC::CondCodes CC1 = (ARMCC::CondCodes)Pred1[0].getImmedValue();
+  ARMCC::CondCodes CC2 = (ARMCC::CondCodes)Pred2[0].getImmedValue();
+  if (CC1 == CC2)
+    return true;
+
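+  // Pred1 subsumes Pred2 only if every condition satisfying Pred2 also
+  // satisfies Pred1, e.g. HS (unsigned >=) subsumes HI (unsigned >), and
+  // LS (unsigned <=) subsumes both LO and EQ.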
+  switch (CC1) {
+  default:
+    return false;
+  case ARMCC::AL:
+    return true;
+  case ARMCC::HS:
+    return CC2 == ARMCC::HI;
+  case ARMCC::LS:
+    return CC2 == ARMCC::LO || CC2 == ARMCC::EQ;
+  case ARMCC::GE:
+    return CC2 == ARMCC::GT;
+  case ARMCC::LE:
+    return CC2 == ARMCC::LT;
+  }
+}
+
+bool ARMInstrInfo::DefinesPredicate(MachineInstr *MI,
+                                    std::vector<MachineOperand> &Pred) const {
+  const TargetInstrDescriptor *TID = MI->getInstrDescriptor();
+  if (!TID->ImplicitDefs && (TID->Flags & M_HAS_OPTIONAL_DEF) == 0)
+    return false;
+
+  bool Found = false;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.getReg() == ARM::CPSR) {
+      Pred.push_back(MO);
+      Found = true;
+    }
+  }
+
+  return Found;
+}
+
+
+/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing
+static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
+                                unsigned JTI) DISABLE_INLINE;
+static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
+                                unsigned JTI) {
+  return JT[JTI].MBBs.size();
+}
+
+/// GetInstSize - Return the size of the specified MachineInstr.
+///
+unsigned ARM::GetInstSize(MachineInstr *MI) {
+  MachineBasicBlock &MBB = *MI->getParent();
+  const MachineFunction *MF = MBB.getParent();
+  const TargetAsmInfo *TAI = MF->getTarget().getTargetAsmInfo();
+
+  // Basic size info comes from the TSFlags field.
+  const TargetInstrDescriptor *TID = MI->getInstrDescriptor();
+  unsigned TSFlags = TID->TSFlags;
+  
+  switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) {
+  default:
+    // If this machine instr is an inline asm, measure it.
+    if (MI->getOpcode() == ARM::INLINEASM)
+      return TAI->getInlineAsmLength(MI->getOperand(0).getSymbolName());
+    if (MI->getOpcode() == ARM::LABEL)
+      return 0;
+    assert(0 && "Unknown or unset size field for instr!");
+    break;
+  case ARMII::Size8Bytes: return 8;          // Arm instruction x 2.
+  case ARMII::Size4Bytes: return 4;          // Arm instruction.
+  case ARMII::Size2Bytes: return 2;          // Thumb instruction.
+  case ARMII::SizeSpecial: {
+    switch (MI->getOpcode()) {
+    case ARM::CONSTPOOL_ENTRY:
+      // If this machine instr is a constant pool entry, its size is recorded as
+      // operand #2.
+      return MI->getOperand(2).getImm();
+    case ARM::BR_JTr:
+    case ARM::BR_JTm:
+    case ARM::BR_JTadd:
+    case ARM::tBR_JTr: {
+      // These are jumptable branches, i.e. a branch followed by an inlined
+      // jumptable. The size is the branch (4 bytes, or 2 for tBR_JTr) plus
+      // 4 * number of entries.
+      unsigned NumOps = TID->numOperands;
+      MachineOperand JTOP =
+        MI->getOperand(NumOps - ((TID->Flags & M_PREDICABLE) ? 3 : 2));
+      unsigned JTI = JTOP.getJumpTableIndex();
+      MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+      const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+      assert(JTI < JT.size());
+      // Thumb instructions are 2-byte aligned, but JT entries are 4-byte
+      // aligned. The assembler / linker may add 2 bytes of padding just
+      // before the JT entries.  The size does not include this padding; the
+      // constant islands pass does separate bookkeeping for it.
+      // FIXME: If we know the size of the function is less than (1 << 16) *2
+      // bytes, we can use 16-bit entries instead. Then there won't be an
+      // alignment issue.
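+      // For illustration: a table with 8 entries comes to 4 + 8*4 = 36 bytes
+      // when reached via BR_JTr and 2 + 8*4 = 34 bytes via tBR_JTr (excluding
+      // the possible padding noted above).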
+      return getNumJTEntries(JT, JTI) * 4 + 
+             (MI->getOpcode()==ARM::tBR_JTr ? 2 : 4);
+    }
+    default:
+      // Otherwise, pseudo-instruction sizes are zero.
+      return 0;
+    }
+  }
+  }
+}
+
+/// GetFunctionSize - Returns the size of the specified MachineFunction.
+///
+unsigned ARM::GetFunctionSize(MachineFunction &MF) {
+  unsigned FnSize = 0;
+  for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock &MBB = *MBBI;
+    for (MachineBasicBlock::iterator I = MBB.begin(),E = MBB.end(); I != E; ++I)
+      FnSize += ARM::GetInstSize(I);
+  }
+  return FnSize;
+}
diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h
new file mode 100644
index 0000000..2c158b8
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrInfo.h
@@ -0,0 +1,133 @@
+//===- ARMInstrInfo.h - ARM Instruction Information -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the "Instituto Nokia de Tecnologia" and
+// is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMINSTRUCTIONINFO_H
+#define ARMINSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "ARMRegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+/// ARMII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace ARMII {
+  enum {
+    //===------------------------------------------------------------------===//
+    // Instruction Flags.
+
+    //===------------------------------------------------------------------===//
+    // This four-bit field describes the addressing mode used.  Zero is unused
+    // so that we can tell if we forgot to set a value.
+
+    AddrModeMask  = 0xf,
+    AddrMode1     = 1,
+    AddrMode2     = 2,
+    AddrMode3     = 3,
+    AddrMode4     = 4,
+    AddrMode5     = 5,
+    AddrModeT1    = 6,
+    AddrModeT2    = 7,
+    AddrModeT4    = 8,
+    AddrModeTs    = 9,   // i8 * 4 for pc and sp relative data
+
+    // Size* - Flags to keep track of the size of an instruction.
+    SizeShift     = 4,
+    SizeMask      = 7 << SizeShift,
+    SizeSpecial   = 1,   // 0 byte pseudo or special case.
+    Size8Bytes    = 2,
+    Size4Bytes    = 3,
+    Size2Bytes    = 4,
+    
+    // IndexMode - Unindexed, pre-indexed, or post-indexed. Only valid for
+    // load and store ops.
+    IndexModeShift = 7,
+    IndexModeMask  = 3 << IndexModeShift,
+    IndexModePre   = 1,
+    IndexModePost  = 2,
+    
+    // Opcode
+    OpcodeShift   = 9,
+    OpcodeMask    = 0xf << OpcodeShift
+  };
+}
+
+class ARMInstrInfo : public TargetInstrInfo {
+  const ARMRegisterInfo RI;
+public:
+  ARMInstrInfo(const ARMSubtarget &STI);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const MRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// getPointerRegClass - Return the register class to use to hold pointers.
+  /// This is used for addressing modes.
+  virtual const TargetRegisterClass *getPointerRegClass() const;
+
+  /// Return true if the instruction is a register to register move and
+  /// leave the source and dest operands in the passed parameters.
+  ///
+  virtual bool isMoveInstr(const MachineInstr &MI,
+                           unsigned &SrcReg, unsigned &DstReg) const;
+  virtual unsigned isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const;
+  virtual unsigned isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const;
+  
+  virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+                                              MachineBasicBlock::iterator &MBBI,
+                                              LiveVariables &LV) const;
+
+  // Branch analysis.
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             std::vector<MachineOperand> &Cond) const;
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                                const std::vector<MachineOperand> &Cond) const;
+  virtual bool BlockHasNoFallThrough(MachineBasicBlock &MBB) const;
+  virtual bool ReverseBranchCondition(std::vector<MachineOperand> &Cond) const;
+
+  // Predication support.
+  virtual bool isPredicated(const MachineInstr *MI) const;
+
+  virtual
+  bool PredicateInstruction(MachineInstr *MI,
+                            const std::vector<MachineOperand> &Pred) const;
+
+  virtual
+  bool SubsumesPredicate(const std::vector<MachineOperand> &Pred1,
+                         const std::vector<MachineOperand> &Pred2) const;
+
+  virtual bool DefinesPredicate(MachineInstr *MI,
+                                std::vector<MachineOperand> &Pred) const;
+};
+
+  // Utility routines
+  namespace ARM {
+    /// GetInstSize - Returns the size of the specified MachineInstr.
+    ///
+    unsigned GetInstSize(MachineInstr *MI);
+
+    /// GetFunctionSize - Returns the size of the specified MachineFunction.
+    ///
+    unsigned GetFunctionSize(MachineFunction &MF);
+  }
+}
+
+#endif
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
new file mode 100644
index 0000000..adc203b
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -0,0 +1,1320 @@
+//===- ARMInstrInfo.td - Target Description for ARM Target -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the "Instituto Nokia de Tecnologia" and
+// is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM specific DAG Nodes.
+//
+
+// Type profiles.
+def SDT_ARMCallSeq : SDTypeProfile<0, 1, [ SDTCisVT<0, i32> ]>;
+
+def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>;
+
+def SDT_ARMcall    : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
+
+def SDT_ARMCMov    : SDTypeProfile<1, 3,
+                                   [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                    SDTCisVT<3, i32>]>;
+
+def SDT_ARMBrcond  : SDTypeProfile<0, 2,
+                                   [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
+
+def SDT_ARMBrJT    : SDTypeProfile<0, 3,
+                                  [SDTCisPtrTy<0>, SDTCisVT<1, i32>,
+                                   SDTCisVT<2, i32>]>;
+
+def SDT_ARMCmp     : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+
+def SDT_ARMPICAdd  : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
+                                          SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
+
+def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
+
+// Node definitions.
+def ARMWrapper       : SDNode<"ARMISD::Wrapper",     SDTIntUnaryOp>;
+def ARMWrapperJT     : SDNode<"ARMISD::WrapperJT",   SDTIntBinOp>;
+
+def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeq,
+                              [SDNPHasChain, SDNPOutFlag]>;
+def ARMcallseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_ARMCallSeq,
+                              [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+
+def ARMcall          : SDNode<"ARMISD::CALL", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def ARMcall_pred     : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def ARMcall_nolink   : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def ARMretflag       : SDNode<"ARMISD::RET_FLAG", SDTRet,
+                              [SDNPHasChain, SDNPOptInFlag]>;
+
+def ARMcmov          : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
+                              [SDNPInFlag]>;
+def ARMcneg          : SDNode<"ARMISD::CNEG", SDT_ARMCMov,
+                              [SDNPInFlag]>;
+
+def ARMbrcond        : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
+                              [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+
+def ARMbrjt          : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT,
+                              [SDNPHasChain]>;
+
+def ARMcmp           : SDNode<"ARMISD::CMP", SDT_ARMCmp,
+                              [SDNPOutFlag]>;
+
+def ARMcmpNZ         : SDNode<"ARMISD::CMPNZ", SDT_ARMCmp,
+                              [SDNPOutFlag]>;
+
+def ARMpic_add       : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>;
+
+def ARMsrl_flag      : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutFlag]>;
+def ARMsra_flag      : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutFlag]>;
+def ARMrrx           : SDNode<"ARMISD::RRX"     , SDTIntUnaryOp, [SDNPInFlag ]>;
+
+def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>;
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction Predicate Definitions.
+//
+def HasV5T  : Predicate<"Subtarget->hasV5TOps()">;
+def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">;
+def HasV6   : Predicate<"Subtarget->hasV6Ops()">;
+def IsThumb : Predicate<"Subtarget->isThumb()">;
+def IsARM   : Predicate<"!Subtarget->isThumb()">;
+
+//===----------------------------------------------------------------------===//
+// ARM Flag Definitions.
+
+class RegConstraint<string C> {
+  string Constraints = C;
+}
+
+//===----------------------------------------------------------------------===//
+//  ARM specific transformation functions and pattern fragments.
+//
+
+// so_imm_XFORM - Return a so_imm value packed into the format described for
+// so_imm def below.
+def so_imm_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(N->getValue()),
+                                   MVT::i32);
+}]>;
+
+// so_imm_neg_XFORM - Return a so_imm value packed into the format described for
+// so_imm_neg def below.
+def so_imm_neg_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(-(int)N->getValue()),
+                                   MVT::i32);
+}]>;
+
+// so_imm_not_XFORM - Return a so_imm value packed into the format described for
+// so_imm_not def below.
+def so_imm_not_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(~(int)N->getValue()),
+                                   MVT::i32);
+}]>;
+
+// rot_imm predicate - True if the 32-bit immediate is equal to 8, 16, or 24.
+def rot_imm : PatLeaf<(i32 imm), [{
+  int32_t v = (int32_t)N->getValue();
+  return v == 8 || v == 16 || v == 24;
+}]>;
+
+/// imm1_15 predicate - True if the 32-bit immediate is in the range [1,15].
+def imm1_15 : PatLeaf<(i32 imm), [{
+  return (int32_t)N->getValue() >= 1 && (int32_t)N->getValue() < 16;
+}]>;
+
+/// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31].
+def imm16_31 : PatLeaf<(i32 imm), [{
+  return (int32_t)N->getValue() >= 16 && (int32_t)N->getValue() < 32;
+}]>;
+
+def so_imm_neg : 
+  PatLeaf<(imm), [{ return ARM_AM::getSOImmVal(-(int)N->getValue()) != -1; }],
+          so_imm_neg_XFORM>;
+
+def so_imm_not :
+  PatLeaf<(imm), [{ return ARM_AM::getSOImmVal(~(int)N->getValue()) != -1; }],
+          so_imm_not_XFORM>;
+
+// sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits.
+def sext_16_node : PatLeaf<(i32 GPR:$a), [{
+  return CurDAG->ComputeNumSignBits(SDOperand(N,0)) >= 17;
+}]>;
+
+
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// Branch target.
+def brtarget : Operand<OtherVT>;
+
+// A list of registers separated by comma. Used by load/store multiple.
+def reglist : Operand<i32> {
+  let PrintMethod = "printRegisterList";
+}
+
+// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
+def cpinst_operand : Operand<i32> {
+  let PrintMethod = "printCPInstOperand";
+}
+
+def jtblock_operand : Operand<i32> {
+  let PrintMethod = "printJTBlockOperand";
+}
+
+// Local PC labels.
+def pclabel : Operand<i32> {
+  let PrintMethod = "printPCLabel";
+}
+
+// shifter_operand operands: so_reg and so_imm.
+def so_reg : Operand<i32>,    // reg reg imm
+            ComplexPattern<i32, 3, "SelectShifterOperandReg",
+                            [shl,srl,sra,rotr]> {
+  let PrintMethod = "printSORegOperand";
+  let MIOperandInfo = (ops GPR, GPR, i32imm);
+}
+
+// so_imm - Match a 32-bit shifter_operand immediate operand, which is an
+// 8-bit immediate rotated by an arbitrary number of bits.  so_imm values are
+// represented in the imm field in the same 12-bit form that they are encoded
+// into so_imm instructions: the 8-bit immediate is the least significant bits
+// [bits 0-7], the 4-bit shift amount is the next 4 bits [bits 8-11].
+def so_imm : Operand<i32>,
+             PatLeaf<(imm),
+                     [{ return ARM_AM::getSOImmVal(N->getValue()) != -1; }],
+                     so_imm_XFORM> {
+  let PrintMethod = "printSOImmOperand";
+}
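+
+// For example, 0x0000AB00 is a valid so_imm: the 8-bit value 0xAB rotated
+// into position, with the rotate information carried in bits 8-11 of the
+// imm field.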
+
+// Break so_imm's up into two pieces.  This handles immediates with up to 16
+// bits set in them.  This uses so_imm2part to match and so_imm2part_[12] to
+// get the first/second pieces.
+def so_imm2part : Operand<i32>,
+                  PatLeaf<(imm),
+             [{ return ARM_AM::isSOImmTwoPartVal((unsigned)N->getValue()); }]> {
+  let PrintMethod = "printSOImm2PartOperand";
+}
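+
+// For example, 0x00FF00FF (16 bits set) can be split into the two so_imm
+// pieces 0x00FF0000 and 0x000000FF.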
+
+def so_imm2part_1 : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getSOImmTwoPartFirst((unsigned)N->getValue());
+  return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(V), MVT::i32);
+}]>;
+
+def so_imm2part_2 : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getSOImmTwoPartSecond((unsigned)N->getValue());
+  return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(V), MVT::i32);
+}]>;
+
+
+// Define ARM specific addressing modes.
+
+// addrmode2 := reg +/- reg shop imm
+// addrmode2 := reg +/- imm12
+//
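+// e.g. "ldr r0, [r1, #-8]" (immediate form) or "ldr r0, [r1, r2, lsl #2]"
+// (register form with shift).
+//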
+def addrmode2 : Operand<i32>,
+                ComplexPattern<i32, 3, "SelectAddrMode2", []> {
+  let PrintMethod = "printAddrMode2Operand";
+  let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+}
+
+def am2offset : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode2Offset", []> {
+  let PrintMethod = "printAddrMode2OffsetOperand";
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// addrmode3 := reg +/- reg
+// addrmode3 := reg +/- imm8
+//
+def addrmode3 : Operand<i32>,
+                ComplexPattern<i32, 3, "SelectAddrMode3", []> {
+  let PrintMethod = "printAddrMode3Operand";
+  let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+}
+
+def am3offset : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode3Offset", []> {
+  let PrintMethod = "printAddrMode3OffsetOperand";
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// addrmode4 := reg, <mode|W>
+//
+def addrmode4 : Operand<i32>,
+                ComplexPattern<i32, 2, "", []> {
+  let PrintMethod = "printAddrMode4Operand";
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// addrmode5 := reg +/- imm8*4
+//
+def addrmode5 : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode5", []> {
+  let PrintMethod = "printAddrMode5Operand";
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// addrmodepc := pc + reg
+//
+def addrmodepc : Operand<i32>,
+                 ComplexPattern<i32, 2, "SelectAddrModePC", []> {
+  let PrintMethod = "printAddrModePCOperand";
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// ARM Predicate operand. Defaults to 14 = always (AL). The second part is the
+// CC register, whose default is 0 (no register).
+def pred : PredicateOperand<OtherVT, (ops i32imm, CCR),
+                                     (ops (i32 14), (i32 zero_reg))> {
+  let PrintMethod = "printPredicateOperand";
+}
+
+// Conditional code result for instructions whose 's' bit is set, e.g. subs.
+//
+def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
+  let PrintMethod = "printSBitModifierOperand";
+}
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction flags.  These need to match ARMInstrInfo.h.
+//
+
+// Addressing mode.
+class AddrMode<bits<4> val> {
+  bits<4> Value = val;
+}
+def AddrModeNone : AddrMode<0>;
+def AddrMode1    : AddrMode<1>;
+def AddrMode2    : AddrMode<2>;
+def AddrMode3    : AddrMode<3>;
+def AddrMode4    : AddrMode<4>;
+def AddrMode5    : AddrMode<5>;
+def AddrModeT1   : AddrMode<6>;
+def AddrModeT2   : AddrMode<7>;
+def AddrModeT4   : AddrMode<8>;
+def AddrModeTs   : AddrMode<9>;
+
+// Instruction size.
+class SizeFlagVal<bits<3> val> {
+  bits<3> Value = val;
+}
+def SizeInvalid  : SizeFlagVal<0>;  // Unset.
+def SizeSpecial  : SizeFlagVal<1>;  // Pseudo or special.
+def Size8Bytes   : SizeFlagVal<2>;
+def Size4Bytes   : SizeFlagVal<3>;
+def Size2Bytes   : SizeFlagVal<4>;
+
+// Load / store index mode.
+class IndexMode<bits<2> val> {
+  bits<2> Value = val;
+}
+def IndexModeNone : IndexMode<0>;
+def IndexModePre  : IndexMode<1>;
+def IndexModePost : IndexMode<2>;
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction templates.
+//
+
+// ARMPat - Same as Pat<>, but requires that the compiler be in ARM mode.
+class ARMPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM];
+}
+class ARMV5TEPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV5TE];
+}
+class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV6];
+}
+
+class InstARM<bits<4> opcod, AddrMode am, SizeFlagVal sz, IndexMode im,
+              string cstr>
+  : Instruction {
+  let Namespace = "ARM";
+
+  bits<4> Opcode = opcod;
+  AddrMode AM = am;
+  bits<4> AddrModeBits = AM.Value;
+  
+  SizeFlagVal SZ = sz;
+  bits<3> SizeFlag = SZ.Value;
+
+  IndexMode IM = im;
+  bits<2> IndexModeBits = IM.Value;
+  
+  let Constraints = cstr;
+}
+
+class PseudoInst<dag ops, string asm, list<dag> pattern>
+  : InstARM<0, AddrModeNone, SizeSpecial, IndexModeNone, ""> {
+  let OperandList = ops;
+  let AsmString   = asm;
+  let Pattern = pattern;
+}
+
+// Almost all ARM instructions are predicable.
+class I<dag oprnds, AddrMode am, SizeFlagVal sz, IndexMode im,
+        string opc, string asm, string cstr, list<dag> pattern>
+  // FIXME: Set all opcodes to 0 for now.
+  : InstARM<0, am, sz, im, cstr> {
+  let OperandList = !con(oprnds, (ops pred:$p));
+  let AsmString   = !strconcat(opc, !strconcat("${p}", asm));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// Same as I except it can optionally modify CPSR.
+class sI<dag oprnds, AddrMode am, SizeFlagVal sz, IndexMode im,
+        string opc, string asm, string cstr, list<dag> pattern>
+  // FIXME: Set all opcodes to 0 for now.
+  : InstARM<0, am, sz, im, cstr> {
+  let OperandList = !con(oprnds, (ops pred:$p, cc_out:$s));
+  let AsmString   = !strconcat(opc, !strconcat("${p}${s}", asm));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsARM];
+}
+
+class AI<dag ops, string opc, string asm, list<dag> pattern>
+  : I<ops, AddrModeNone, Size4Bytes, IndexModeNone, opc, asm, "", pattern>;
+class AsI<dag ops, string opc, string asm, list<dag> pattern>
+  : sI<ops, AddrModeNone, Size4Bytes, IndexModeNone, opc, asm, "", pattern>;
+class AI1<dag ops, string opc, string asm, list<dag> pattern>
+  : I<ops, AddrMode1, Size4Bytes, IndexModeNone, opc, asm, "", pattern>;
+class AsI1<dag ops, string opc, string asm, list<dag> pattern>
+  : sI<ops, AddrMode1, Size4Bytes, IndexModeNone, opc, asm, "", pattern>;
+class AI2<dag ops, string opc, string asm, list<dag> pattern>
+  : I<ops, AddrMode2, Size4Bytes, IndexModeNone, opc, asm, "", pattern>;
+class AI3<dag ops, string opc, string asm, list<dag> pattern>
+  : I<ops, AddrMode3, Size4Bytes, IndexModeNone, opc, asm, "", pattern>;
+class AI4<dag ops, string opc, string asm, list<dag> pattern>
+  : I<ops, AddrMode4, Size4Bytes, IndexModeNone, opc, asm, "", pattern>;
+class AI1x2<dag ops, string opc, string asm, list<dag> pattern>
+  : I<ops, AddrMode1, Size8Bytes, IndexModeNone, opc, asm, "", pattern>;
+
+// Pre-indexed ops
+class AI2pr<dag ops, string opc, string asm, string cstr, list<dag> pattern>
+  : I<ops, AddrMode2, Size4Bytes, IndexModePre, opc, asm, cstr, pattern>;
+class AI3pr<dag ops, string opc, string asm, string cstr, list<dag> pattern>
+  : I<ops, AddrMode3, Size4Bytes, IndexModePre, opc, asm, cstr, pattern>;
+
+// Post-indexed ops
+class AI2po<dag ops, string opc, string asm, string cstr, list<dag> pattern>
+  : I<ops, AddrMode2, Size4Bytes, IndexModePost, opc, asm, cstr, pattern>;
+class AI3po<dag ops, string opc, string asm, string cstr, list<dag> pattern>
+  : I<ops, AddrMode3, Size4Bytes, IndexModePost, opc, asm, cstr, pattern>;
+
+
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>;
+
+
+/// AsI1_bin_irs - Defines a set of (op r, {so_imm|r|so_reg}) patterns for a
+/// binop that produces a value.
+multiclass AsI1_bin_irs<string opc, PatFrag opnode> {
+  def ri : AsI1<(ops GPR:$dst, GPR:$a, so_imm:$b),
+               opc, " $dst, $a, $b",
+               [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>;
+  def rr : AsI1<(ops GPR:$dst, GPR:$a, GPR:$b),
+               opc, " $dst, $a, $b",
+               [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>;
+  def rs : AsI1<(ops GPR:$dst, GPR:$a, so_reg:$b),
+               opc, " $dst, $a, $b",
+               [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>;
+}
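+// A later defm is expected to instantiate these, e.g. something along the
+// lines of
+//   defm AND : AsI1_bin_irs<"and", BinOpFrag<(and node:$LHS, node:$RHS)>>;
+// which would expand into ANDri, ANDrr and ANDrs definitions.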
+
+/// ASI1_bin_s_irs - Similar to AsI1_bin_irs except it sets the 's' bit so the
+/// instruction modifies the CPSR register.
+multiclass ASI1_bin_s_irs<string opc, PatFrag opnode> {
+  def ri : AI1<(ops GPR:$dst, GPR:$a, so_imm:$b),
+               opc, "s $dst, $a, $b",
+               [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>, Imp<[], [CPSR]>;
+  def rr : AI1<(ops GPR:$dst, GPR:$a, GPR:$b),
+               opc, "s $dst, $a, $b",
+               [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>, Imp<[], [CPSR]>;
+  def rs : AI1<(ops GPR:$dst, GPR:$a, so_reg:$b),
+               opc, "s $dst, $a, $b",
+               [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>, Imp<[], [CPSR]>;
+}
+
+/// AI1_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
+/// patterns. Similar to AsI1_bin_irs except the instruction does not produce
+/// an explicit result; it only implicitly sets CPSR.
+multiclass AI1_cmp_irs<string opc, PatFrag opnode> {
+  def ri : AI1<(ops GPR:$a, so_imm:$b),
+               opc, " $a, $b",
+               [(opnode GPR:$a, so_imm:$b)]>, Imp<[], [CPSR]>;
+  def rr : AI1<(ops GPR:$a, GPR:$b),
+               opc, " $a, $b",
+               [(opnode GPR:$a, GPR:$b)]>, Imp<[], [CPSR]>;
+  def rs : AI1<(ops GPR:$a, so_reg:$b),
+               opc, " $a, $b",
+               [(opnode GPR:$a, so_reg:$b)]>, Imp<[], [CPSR]>;
+}
+
+/// AI_unary_rrot - A unary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+multiclass AI_unary_rrot<string opc, PatFrag opnode> {
+  def r     : AI<(ops GPR:$dst, GPR:$Src),
+                 opc, " $dst, $Src",
+                 [(set GPR:$dst, (opnode GPR:$Src))]>, Requires<[IsARM, HasV6]>;
+  def r_rot : AI<(ops GPR:$dst, GPR:$Src, i32imm:$rot),
+                 opc, " $dst, $Src, ror $rot",
+                 [(set GPR:$dst, (opnode (rotr GPR:$Src, rot_imm:$rot)))]>,
+              Requires<[IsARM, HasV6]>;
+}
+
+/// AI_bin_rrot - A binary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+multiclass AI_bin_rrot<string opc, PatFrag opnode> {
+  def rr     : AI<(ops GPR:$dst, GPR:$LHS, GPR:$RHS),
+                  opc, " $dst, $LHS, $RHS",
+                  [(set GPR:$dst, (opnode GPR:$LHS, GPR:$RHS))]>,
+                  Requires<[IsARM, HasV6]>;
+  def rr_rot : AI<(ops GPR:$dst, GPR:$LHS, GPR:$RHS, i32imm:$rot),
+                  opc, " $dst, $LHS, $RHS, ror $rot",
+                  [(set GPR:$dst, (opnode GPR:$LHS,
+                                          (rotr GPR:$RHS, rot_imm:$rot)))]>,
+                  Requires<[IsARM, HasV6]>;
+}
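+
+// For example, the Extend Instructions section below expands this into SXTABrr
+// and SXTABrr_rot:
+//   defm SXTAB : AI_bin_rrot<"sxtab",
+//                  BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;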
+
+// Special cases.
+class XI<dag oprnds, AddrMode am, SizeFlagVal sz, IndexMode im,
+         string asm, string cstr, list<dag> pattern>
+  // FIXME: Set all opcodes to 0 for now.
+  : InstARM<0, am, sz, im, cstr> {
+  let OperandList = oprnds;
+  let AsmString   = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsARM];
+}
+
+class AXI<dag ops, string asm, list<dag> pattern>
+  : XI<ops, AddrModeNone, Size4Bytes, IndexModeNone, asm, "", pattern>;
+class AXI1<dag ops, string asm, list<dag> pattern>
+  : XI<ops, AddrMode1, Size4Bytes, IndexModeNone, asm, "", pattern>;
+class AXI2<dag ops, string asm, list<dag> pattern>
+  : XI<ops, AddrMode2, Size4Bytes, IndexModeNone, asm, "", pattern>;
+class AXI3<dag ops, string asm, list<dag> pattern>
+  : XI<ops, AddrMode3, Size4Bytes, IndexModeNone, asm, "", pattern>;
+class AXI4<dag ops, string asm, list<dag> pattern>
+  : XI<ops, AddrMode4, Size4Bytes, IndexModeNone, asm, "", pattern>;
+
+class AXIx2<dag ops, string asm, list<dag> pattern>
+  : XI<ops, AddrModeNone, Size8Bytes, IndexModeNone, asm, "", pattern>;
+
+// BR_JT instructions
+class JTI<dag ops, string asm, list<dag> pattern>
+  : XI<ops, AddrModeNone, SizeSpecial, IndexModeNone, asm, "", pattern>;
+class JTI1<dag ops, string asm, list<dag> pattern>
+  : XI<ops, AddrMode1, SizeSpecial, IndexModeNone, asm, "", pattern>;
+class JTI2<dag ops, string asm, list<dag> pattern>
+  : XI<ops, AddrMode2, SizeSpecial, IndexModeNone, asm, "", pattern>;
+
+/// AsXI1_bin_c_irs - Same as AsI1_bin_irs, but without the predicate operand;
+/// these instructions read the carry bit and can optionally set CPSR.
+multiclass AsXI1_bin_c_irs<string opc, PatFrag opnode> {
+  def ri : AXI1<(ops GPR:$dst, GPR:$a, so_imm:$b, cc_out:$s),
+               !strconcat(opc, "${s} $dst, $a, $b"),
+               [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>, Imp<[CPSR], []>;
+  def rr : AXI1<(ops GPR:$dst, GPR:$a, GPR:$b, cc_out:$s),
+               !strconcat(opc, "${s} $dst, $a, $b"),
+               [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>, Imp<[CPSR], []>;
+  def rs : AXI1<(ops GPR:$dst, GPR:$a, so_reg:$b, cc_out:$s),
+               !strconcat(opc, "${s} $dst, $a, $b"),
+               [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>, Imp<[CPSR], []>;
+}
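+
+// For example, the Arithmetic Instructions section below uses this for the
+// carry-consuming adc / sbc operations:
+//   defm ADC : AsXI1_bin_c_irs<"adc", BinOpFrag<(adde node:$LHS, node:$RHS)>>;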
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+def IMPLICIT_DEF_GPR : 
+PseudoInst<(ops GPR:$rD, pred:$p),
+           "@ IMPLICIT_DEF_GPR $rD",
+           [(set GPR:$rD, (undef))]>;
+
+
+/// CONSTPOOL_ENTRY - This instruction represents a constant pool entry that
+/// floats within the function.  The first operand is the ID# for this
+/// instruction, the second is the index into the MachineConstantPool it
+/// corresponds to, and the third is the size in bytes of this constant pool
+/// entry.
+let isNotDuplicable = 1 in
+def CONSTPOOL_ENTRY :
+PseudoInst<(ops cpinst_operand:$instid, cpinst_operand:$cpidx, i32imm:$size),
+           "${instid:label} ${cpidx:cpentry}", []>;
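+
+// Illustrative (hypothetical) instance: "CONSTPOOL_ENTRY 3, 1, 4" would be the
+// entry with instruction ID 3, referring to MachineConstantPool index 1 and
+// occupying 4 bytes.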
+
+def ADJCALLSTACKUP :
+PseudoInst<(ops i32imm:$amt, pred:$p),
+           "@ ADJCALLSTACKUP $amt",
+           [(ARMcallseq_end imm:$amt)]>, Imp<[SP],[SP]>;
+
+def ADJCALLSTACKDOWN : 
+PseudoInst<(ops i32imm:$amt, pred:$p),
+           "@ ADJCALLSTACKDOWN $amt",
+           [(ARMcallseq_start imm:$amt)]>, Imp<[SP],[SP]>;
+
+def DWARF_LOC :
+PseudoInst<(ops i32imm:$line, i32imm:$col, i32imm:$file),
+           ".loc $file, $line, $col",
+           [(dwarf_loc (i32 imm:$line), (i32 imm:$col), (i32 imm:$file))]>;
+
+let isNotDuplicable = 1 in {
+def PICADD : AXI1<(ops GPR:$dst, GPR:$a, pclabel:$cp, pred:$p),
+                   "$cp:\n\tadd$p $dst, pc, $a",
+                   [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>;
+
+let isLoad = 1, AddedComplexity = 10 in {
+def PICLD   : AXI2<(ops GPR:$dst, addrmodepc:$addr, pred:$p),
+                  "${addr:label}:\n\tldr$p $dst, $addr",
+                  [(set GPR:$dst, (load addrmodepc:$addr))]>;
+
+def PICLDZH : AXI3<(ops GPR:$dst, addrmodepc:$addr, pred:$p),
+                  "${addr:label}:\n\tldr${p}h $dst, $addr",
+                  [(set GPR:$dst, (zextloadi16 addrmodepc:$addr))]>;
+
+def PICLDZB : AXI2<(ops GPR:$dst, addrmodepc:$addr, pred:$p),
+                  "${addr:label}:\n\tldr${p}b $dst, $addr",
+                  [(set GPR:$dst, (zextloadi8 addrmodepc:$addr))]>;
+
+def PICLDH  : AXI3<(ops GPR:$dst, addrmodepc:$addr, pred:$p),
+                  "${addr:label}:\n\tldr${p}h $dst, $addr",
+                  [(set GPR:$dst, (extloadi16 addrmodepc:$addr))]>;
+
+def PICLDB  : AXI2<(ops GPR:$dst, addrmodepc:$addr, pred:$p),
+                  "${addr:label}:\n\tldr${p}b $dst, $addr",
+                  [(set GPR:$dst, (extloadi8 addrmodepc:$addr))]>;
+
+def PICLDSH : AXI3<(ops GPR:$dst, addrmodepc:$addr, pred:$p),
+                  "${addr:label}:\n\tldr${p}sh $dst, $addr",
+                  [(set GPR:$dst, (sextloadi16 addrmodepc:$addr))]>;
+
+def PICLDSB : AXI3<(ops GPR:$dst, addrmodepc:$addr, pred:$p),
+                  "${addr:label}:\n\tldr${p}sb $dst, $addr",
+                  [(set GPR:$dst, (sextloadi8 addrmodepc:$addr))]>;
+}
+let isStore = 1, AddedComplexity = 10 in {
+def PICSTR  : AXI2<(ops GPR:$src, addrmodepc:$addr, pred:$p),
+               "${addr:label}:\n\tstr$p $src, $addr",
+               [(store GPR:$src, addrmodepc:$addr)]>;
+
+def PICSTRH : AXI3<(ops GPR:$src, addrmodepc:$addr, pred:$p),
+               "${addr:label}:\n\tstr${p}h $src, $addr",
+               [(truncstorei16 GPR:$src, addrmodepc:$addr)]>;
+
+def PICSTRB : AXI2<(ops GPR:$src, addrmodepc:$addr, pred:$p),
+               "${addr:label}:\n\tstr${p}b $src, $addr",
+               [(truncstorei8 GPR:$src, addrmodepc:$addr)]>;
+}
+}
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions.
+//
+
+let isReturn = 1, isTerminator = 1 in
+  def BX_RET : AI<(ops), "bx", " lr", [(ARMretflag)]>;
+
+// FIXME: remove when we have a way to mark an MI with these properties.
+let isLoad = 1, isReturn = 1, isTerminator = 1 in
+  def LDM_RET : AXI4<(ops addrmode4:$addr, pred:$p, reglist:$dst1, variable_ops),
+                    "ldm${p}${addr:submode} $addr, $dst1",
+                    []>;
+
+let isCall = 1, noResults = 1,
+  Defs = [R0, R1, R2, R3, R12, LR,
+          D0, D1, D2, D3, D4, D5, D6, D7, CPSR] in {
+  def BL  : AXI<(ops i32imm:$func, variable_ops),
+                "bl ${func:call}",
+                [(ARMcall tglobaladdr:$func)]>;
+
+  def BL_pred : AI<(ops i32imm:$func, variable_ops),
+                    "bl", " ${func:call}",
+                    [(ARMcall_pred tglobaladdr:$func)]>;
+
+  // ARMv5T and above
+  def BLX : AXI<(ops GPR:$dst, variable_ops),
+                "blx $dst",
+                [(ARMcall GPR:$dst)]>, Requires<[IsARM, HasV5T]>;
+  let Uses = [LR] in {
+    // ARMv4T
+    def BX : AXIx2<(ops GPR:$dst, variable_ops),
+                  "mov lr, pc\n\tbx $dst",
+                  [(ARMcall_nolink GPR:$dst)]>;
+  }
+}
+
+let isBranch = 1, isTerminator = 1, noResults = 1 in {
+  // B is "predicable" since it can be xformed into a Bcc.
+  let isBarrier = 1 in {
+    let isPredicable = 1 in
+    def B : AXI<(ops brtarget:$dst), "b $dst",
+                [(br bb:$dst)]>;
+
+  let isNotDuplicable = 1 in {
+  def BR_JTr : JTI<(ops GPR:$dst, jtblock_operand:$jt, i32imm:$id),
+                    "mov pc, $dst \n$jt",
+                    [(ARMbrjt GPR:$dst, tjumptable:$jt, imm:$id)]>;
+  def BR_JTm : JTI2<(ops addrmode2:$dst, jtblock_operand:$jt, i32imm:$id),
+                     "ldr pc, $dst \n$jt",
+                     [(ARMbrjt (i32 (load addrmode2:$dst)), tjumptable:$jt,
+                       imm:$id)]>;
+  def BR_JTadd : JTI1<(ops GPR:$dst, GPR:$idx, jtblock_operand:$jt, i32imm:$id),
+                       "add pc, $dst, $idx \n$jt",
+                       [(ARMbrjt (add GPR:$dst, GPR:$idx), tjumptable:$jt,
+                         imm:$id)]>;
+  }
+  }
+
+  // FIXME: should be able to write a pattern for ARMBrcond, but can't use
+  // a two-value operand where a dag node expects two operands. :( 
+  def Bcc : AI<(ops brtarget:$dst), "b", " $dst",
+                [/*(ARMbrcond bb:$dst, imm:$cc, CCR:$ccr)*/]>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load / store Instructions.
+//
+
+// Load
+let isLoad = 1 in {
+def LDR  : AI2<(ops GPR:$dst, addrmode2:$addr),
+               "ldr", " $dst, $addr",
+               [(set GPR:$dst, (load addrmode2:$addr))]>;
+
+// Special LDR for loads from non-pc-relative constpools.
+let isReMaterializable = 1 in
+def LDRcp : AI2<(ops GPR:$dst, addrmode2:$addr),
+                 "ldr", " $dst, $addr", []>;
+
+// Loads with zero extension
+def LDRH  : AI3<(ops GPR:$dst, addrmode3:$addr),
+                 "ldr", "h $dst, $addr",
+                [(set GPR:$dst, (zextloadi16 addrmode3:$addr))]>;
+
+def LDRB  : AI2<(ops GPR:$dst, addrmode2:$addr),
+                 "ldr", "b $dst, $addr",
+                [(set GPR:$dst, (zextloadi8 addrmode2:$addr))]>;
+
+// Loads with sign extension
+def LDRSH : AI3<(ops GPR:$dst, addrmode3:$addr),
+                 "ldr", "sh $dst, $addr",
+                [(set GPR:$dst, (sextloadi16 addrmode3:$addr))]>;
+
+def LDRSB : AI3<(ops GPR:$dst, addrmode3:$addr),
+                 "ldr", "sb $dst, $addr",
+                [(set GPR:$dst, (sextloadi8 addrmode3:$addr))]>;
+
+// Load doubleword
+def LDRD  : AI3<(ops GPR:$dst, addrmode3:$addr),
+                 "ldr", "d $dst, $addr",
+                []>, Requires<[IsARM, HasV5T]>;
+
+// Indexed loads
+def LDR_PRE  : AI2pr<(ops GPR:$dst, GPR:$base_wb, addrmode2:$addr),
+                    "ldr", " $dst, $addr!", "$addr.base = $base_wb", []>;
+
+def LDR_POST : AI2po<(ops GPR:$dst, GPR:$base_wb, GPR:$base, am2offset:$offset),
+                    "ldr", " $dst, [$base], $offset", "$base = $base_wb", []>;
+
+def LDRH_PRE  : AI3pr<(ops GPR:$dst, GPR:$base_wb, addrmode3:$addr),
+                     "ldr", "h $dst, $addr!", "$addr.base = $base_wb", []>;
+
+def LDRH_POST : AI3po<(ops GPR:$dst, GPR:$base_wb, GPR:$base,am3offset:$offset),
+                     "ldr", "h $dst, [$base], $offset", "$base = $base_wb", []>;
+
+def LDRB_PRE  : AI2pr<(ops GPR:$dst, GPR:$base_wb, addrmode2:$addr),
+                     "ldr", "b $dst, $addr!", "$addr.base = $base_wb", []>;
+
+def LDRB_POST : AI2po<(ops GPR:$dst, GPR:$base_wb, GPR:$base,am2offset:$offset),
+                     "ldr", "b $dst, [$base], $offset", "$base = $base_wb", []>;
+
+def LDRSH_PRE : AI3pr<(ops GPR:$dst, GPR:$base_wb, addrmode3:$addr),
+                      "ldr", "sh $dst, $addr!", "$addr.base = $base_wb", []>;
+
+def LDRSH_POST: AI3po<(ops GPR:$dst, GPR:$base_wb, GPR:$base,am3offset:$offset),
+                      "ldr", "sh $dst, [$base], $offset", "$base = $base_wb", []>;
+
+def LDRSB_PRE : AI3pr<(ops GPR:$dst, GPR:$base_wb, addrmode3:$addr),
+                      "ldr", "sb $dst, $addr!", "$addr.base = $base_wb", []>;
+
+def LDRSB_POST: AI3po<(ops GPR:$dst, GPR:$base_wb, GPR:$base,am3offset:$offset),
+                      "ldr", "sb $dst, [$base], $offset", "$base = $base_wb", []>;
+} // isLoad
+
+// Store
+let isStore = 1 in {
+def STR  : AI2<(ops GPR:$src, addrmode2:$addr),
+               "str", " $src, $addr",
+               [(store GPR:$src, addrmode2:$addr)]>;
+
+// Stores with truncate
+def STRH : AI3<(ops GPR:$src, addrmode3:$addr),
+               "str", "h $src, $addr",
+               [(truncstorei16 GPR:$src, addrmode3:$addr)]>;
+
+def STRB : AI2<(ops GPR:$src, addrmode2:$addr),
+               "str", "b $src, $addr",
+               [(truncstorei8 GPR:$src, addrmode2:$addr)]>;
+
+// Store doubleword
+def STRD : AI3<(ops GPR:$src, addrmode3:$addr),
+               "str", "d $src, $addr",
+               []>, Requires<[IsARM, HasV5T]>;
+
+// Indexed stores
+def STR_PRE  : AI2pr<(ops GPR:$base_wb, GPR:$src, GPR:$base, am2offset:$offset),
+                    "str", " $src, [$base, $offset]!", "$base = $base_wb",
+                    [(set GPR:$base_wb,
+                      (pre_store GPR:$src, GPR:$base, am2offset:$offset))]>;
+
+def STR_POST : AI2po<(ops  GPR:$base_wb, GPR:$src, GPR:$base,am2offset:$offset),
+                    "str", " $src, [$base], $offset", "$base = $base_wb",
+                    [(set GPR:$base_wb,
+                      (post_store GPR:$src, GPR:$base, am2offset:$offset))]>;
+
+def STRH_PRE : AI3pr<(ops  GPR:$base_wb, GPR:$src, GPR:$base,am3offset:$offset),
+                     "str", "h $src, [$base, $offset]!", "$base = $base_wb",
+                    [(set GPR:$base_wb,
+                      (pre_truncsti16 GPR:$src, GPR:$base,am3offset:$offset))]>;
+
+def STRH_POST: AI3po<(ops  GPR:$base_wb, GPR:$src, GPR:$base,am3offset:$offset),
+                     "str", "h $src, [$base], $offset", "$base = $base_wb",
+                    [(set GPR:$base_wb, (post_truncsti16 GPR:$src,
+                                         GPR:$base, am3offset:$offset))]>;
+
+def STRB_PRE : AI2pr<(ops  GPR:$base_wb, GPR:$src, GPR:$base,am2offset:$offset),
+                     "str", "b $src, [$base, $offset]!", "$base = $base_wb",
+                    [(set GPR:$base_wb, (pre_truncsti8 GPR:$src,
+                                         GPR:$base, am2offset:$offset))]>;
+
+def STRB_POST: AI2po<(ops  GPR:$base_wb, GPR:$src, GPR:$base,am2offset:$offset),
+                     "str", "b $src, [$base], $offset", "$base = $base_wb",
+                    [(set GPR:$base_wb, (post_truncsti8 GPR:$src,
+                                         GPR:$base, am2offset:$offset))]>;
+} // isStore
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+let isLoad = 1 in
+def LDM : AXI4<(ops addrmode4:$addr, pred:$p, reglist:$dst1, variable_ops),
+               "ldm${p}${addr:submode} $addr, $dst1",
+               []>;
+
+let isStore = 1 in
+def STM : AXI4<(ops addrmode4:$addr, pred:$p, reglist:$src1, variable_ops),
+               "stm${p}${addr:submode} $addr, $src1",
+               []>;
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions.
+//
+
+def MOVr : AsI1<(ops GPR:$dst, GPR:$src),
+                 "mov", " $dst, $src", []>;
+def MOVs : AsI1<(ops GPR:$dst, so_reg:$src),
+                 "mov", " $dst, $src", [(set GPR:$dst, so_reg:$src)]>;
+
+let isReMaterializable = 1 in
+def MOVi : AsI1<(ops GPR:$dst, so_imm:$src),
+                 "mov", " $dst, $src", [(set GPR:$dst, so_imm:$src)]>;
+
+def MOVrx       : AsI1<(ops GPR:$dst, GPR:$src),
+                      "mov", " $dst, $src, rrx",
+                      [(set GPR:$dst, (ARMrrx GPR:$src))]>;
+
+// These aren't really mov instructions, but we have to define them this way
+// due to flag operands.
+
+def MOVsrl_flag : AI1<(ops GPR:$dst, GPR:$src),
+                      "mov", "s $dst, $src, lsr #1",
+                      [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, Imp<[], [CPSR]>;
+def MOVsra_flag : AI1<(ops GPR:$dst, GPR:$src),
+                      "mov", "s $dst, $src, asr #1",
+                      [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, Imp<[], [CPSR]>;
+
+//===----------------------------------------------------------------------===//
+//  Extend Instructions.
+//
+
+// Sign extenders
+
+defm SXTB  : AI_unary_rrot<"sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>;
+defm SXTH  : AI_unary_rrot<"sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>;
+
+defm SXTAB : AI_bin_rrot<"sxtab",
+                        BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
+defm SXTAH : AI_bin_rrot<"sxtah",
+                        BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
+
+// TODO: SXT(A){B|H}16
+
+// Zero extenders
+
+let AddedComplexity = 16 in {
+defm UXTB   : AI_unary_rrot<"uxtb"  , UnOpFrag<(and node:$Src, 0x000000FF)>>;
+defm UXTH   : AI_unary_rrot<"uxth"  , UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
+defm UXTB16 : AI_unary_rrot<"uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+
+def : ARMV6Pat<(and (shl GPR:$Src, 8), 0xFF00FF),
+               (UXTB16r_rot GPR:$Src, 24)>;
+def : ARMV6Pat<(and (srl GPR:$Src, 8), 0xFF00FF),
+               (UXTB16r_rot GPR:$Src, 8)>;
+
+defm UXTAB : AI_bin_rrot<"uxtab",
+                        BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
+defm UXTAH : AI_bin_rrot<"uxtah",
+                        BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
+}
+
+// This isn't safe in general: the add is two separate 16-bit adds, not one
+// 32-bit add.
+//defm UXTAB16 : xxx<"uxtab16", 0xff00ff>;
+
+// TODO: UXT(A){B|H}16
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions.
+//
+
+defm ADD  : AsI1_bin_irs<"add", BinOpFrag<(add  node:$LHS, node:$RHS)>>;
+defm SUB  : AsI1_bin_irs<"sub", BinOpFrag<(sub  node:$LHS, node:$RHS)>>;
+
+// ADD and SUB with 's' bit set.
+defm ADDS : ASI1_bin_s_irs<"add", BinOpFrag<(addc node:$LHS, node:$RHS)>>;
+defm SUBS : ASI1_bin_s_irs<"sub", BinOpFrag<(subc node:$LHS, node:$RHS)>>;
+
+// FIXME: Do not allow ADC / SBC to be predicated for now.
+defm ADC  : AsXI1_bin_c_irs<"adc", BinOpFrag<(adde node:$LHS, node:$RHS)>>;
+defm SBC  : AsXI1_bin_c_irs<"sbc", BinOpFrag<(sube node:$LHS, node:$RHS)>>;
+
+// These don't define reg/reg forms, because they are handled above.
+def RSBri : AsI1<(ops GPR:$dst, GPR:$a, so_imm:$b),
+                  "rsb", " $dst, $a, $b",
+                  [(set GPR:$dst, (sub so_imm:$b, GPR:$a))]>;
+
+def RSBrs : AsI1<(ops GPR:$dst, GPR:$a, so_reg:$b),
+                  "rsb", " $dst, $a, $b",
+                  [(set GPR:$dst, (sub so_reg:$b, GPR:$a))]>;
+
+// RSB with 's' bit set.
+def RSBSri : AI1<(ops GPR:$dst, GPR:$a, so_imm:$b),
+                 "rsb", "s $dst, $a, $b",
+                 [(set GPR:$dst, (subc so_imm:$b, GPR:$a))]>, Imp<[], [CPSR]>;
+def RSBSrs : AI1<(ops GPR:$dst, GPR:$a, so_reg:$b),
+                 "rsb", "s $dst, $a, $b",
+                 [(set GPR:$dst, (subc so_reg:$b, GPR:$a))]>, Imp<[], [CPSR]>;
+
+// FIXME: Do not allow RSC to be predicated for now. But they can set CPSR.
+def RSCri : AXI1<(ops GPR:$dst, GPR:$a, so_imm:$b, cc_out:$s),
+                 "rsc${s} $dst, $a, $b",
+                 [(set GPR:$dst, (sube so_imm:$b, GPR:$a))]>, Imp<[CPSR], []>;
+def RSCrs : AXI1<(ops GPR:$dst, GPR:$a, so_reg:$b, cc_out:$s),
+                 "rsc${s} $dst, $a, $b",
+                 [(set GPR:$dst, (sube so_reg:$b, GPR:$a))]>, Imp<[CPSR], []>;
+
+// (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
+def : ARMPat<(add    GPR:$src, so_imm_neg:$imm),
+             (SUBri  GPR:$src, so_imm_neg:$imm)>;
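+
+// For example, (sub GPR:$src, 3) is canonicalized to (add GPR:$src, -3);
+// so_imm_neg matches the -3 and hands the negated value (3) back to SUBri.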
+
+//def : ARMPat<(addc   GPR:$src, so_imm_neg:$imm),
+//             (SUBSri GPR:$src, so_imm_neg:$imm)>;
+//def : ARMPat<(adde   GPR:$src, so_imm_neg:$imm),
+//             (SBCri  GPR:$src, so_imm_neg:$imm)>;
+
+// Note: These are implemented in C++ code, because they have to generate
+// ADD/SUBrs instructions, which use a complex pattern that an xform function
+// cannot produce.
+// (mul X, 2^n+1) -> (add (X << n), X)
+// (mul X, 2^n-1) -> (rsb X, (X << n))
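+// For example, (mul X, 9) -> (add (X << 3), X) and
+//              (mul X, 7) -> (rsb X, (X << 3)).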
+
+
+//===----------------------------------------------------------------------===//
+//  Bitwise Instructions.
+//
+
+defm AND   : AsI1_bin_irs<"and", BinOpFrag<(and node:$LHS, node:$RHS)>>;
+defm ORR   : AsI1_bin_irs<"orr", BinOpFrag<(or  node:$LHS, node:$RHS)>>;
+defm EOR   : AsI1_bin_irs<"eor", BinOpFrag<(xor node:$LHS, node:$RHS)>>;
+defm BIC   : AsI1_bin_irs<"bic", BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+
+def  MVNr  : AsI<(ops GPR:$dst, GPR:$src),
+                 "mvn", " $dst, $src", [(set GPR:$dst, (not GPR:$src))]>;
+def  MVNs  : AsI<(ops GPR:$dst, so_reg:$src),
+                 "mvn", " $dst, $src", [(set GPR:$dst, (not so_reg:$src))]>;
+let isReMaterializable = 1 in
+def  MVNi  : AsI<(ops GPR:$dst, so_imm:$imm),
+                 "mvn", " $dst, $imm", [(set GPR:$dst, so_imm_not:$imm)]>;
+
+def : ARMPat<(and   GPR:$src, so_imm_not:$imm),
+             (BICri GPR:$src, so_imm_not:$imm)>;
+
+//===----------------------------------------------------------------------===//
+//  Multiply Instructions.
+//
+
+def MUL  : AsI<(ops GPR:$dst, GPR:$a, GPR:$b),
+                "mul", " $dst, $a, $b",
+                [(set GPR:$dst, (mul GPR:$a, GPR:$b))]>;
+
+def MLA  : AsI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$c),
+                "mla", " $dst, $a, $b, $c",
+                [(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]>;
+
+// Extra precision multiplies with low / high results
+def SMULL : AsI<(ops GPR:$ldst, GPR:$hdst, GPR:$a, GPR:$b),
+                "smull", " $ldst, $hdst, $a, $b", []>;
+
+def UMULL : AsI<(ops GPR:$ldst, GPR:$hdst, GPR:$a, GPR:$b),
+                "umull", " $ldst, $hdst, $a, $b", []>;
+
+// Multiply + accumulate
+def SMLAL : AsI<(ops GPR:$ldst, GPR:$hdst, GPR:$a, GPR:$b),
+                "smlal", " $ldst, $hdst, $a, $b", []>;
+
+def UMLAL : AsI<(ops GPR:$ldst, GPR:$hdst, GPR:$a, GPR:$b),
+                "umlal", " $ldst, $hdst, $a, $b", []>;
+
+def UMAAL : AI<(ops GPR:$ldst, GPR:$hdst, GPR:$a, GPR:$b),
+               "umaal", " $ldst, $hdst, $a, $b", []>,
+            Requires<[IsARM, HasV6]>;
+
+// Most significant word multiply
+def SMMUL : AI<(ops GPR:$dst, GPR:$a, GPR:$b),
+               "smmul", " $dst, $a, $b",
+               [(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]>,
+            Requires<[IsARM, HasV6]>;
+
+def SMMLA : AI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$c),
+               "smmla", " $dst, $a, $b, $c",
+               [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]>,
+            Requires<[IsARM, HasV6]>;
+
+
+def SMMLS : AI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$c),
+               "smmls", " $dst, $a, $b, $c",
+               [(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]>,
+               Requires<[IsARM, HasV6]>;
+
+multiclass AI_smul<string opc, PatFrag opnode> {
+  def BB : AI<(ops GPR:$dst, GPR:$a, GPR:$b),
+              !strconcat(opc, "bb"), " $dst, $a, $b",
+              [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
+                                      (sext_inreg GPR:$b, i16)))]>,
+           Requires<[IsARM, HasV5TE]>;
+  def BT : AI<(ops GPR:$dst, GPR:$a, GPR:$b),
+              !strconcat(opc, "bt"), " $dst, $a, $b",
+              [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
+                                      (sra GPR:$b, 16)))]>,
+           Requires<[IsARM, HasV5TE]>;
+  def TB : AI<(ops GPR:$dst, GPR:$a, GPR:$b),
+              !strconcat(opc, "tb"), " $dst, $a, $b",
+              [(set GPR:$dst, (opnode (sra GPR:$a, 16),
+                                      (sext_inreg GPR:$b, i16)))]>,
+           Requires<[IsARM, HasV5TE]>;
+  def TT : AI<(ops GPR:$dst, GPR:$a, GPR:$b),
+              !strconcat(opc, "tt"), " $dst, $a, $b",
+              [(set GPR:$dst, (opnode (sra GPR:$a, 16),
+                                      (sra GPR:$b, 16)))]>,
+            Requires<[IsARM, HasV5TE]>;
+  def WB : AI<(ops GPR:$dst, GPR:$a, GPR:$b),
+              !strconcat(opc, "wb"), " $dst, $a, $b",
+              [(set GPR:$dst, (sra (opnode GPR:$a,
+                                    (sext_inreg GPR:$b, i16)), 16))]>,
+           Requires<[IsARM, HasV5TE]>;
+  def WT : AI<(ops GPR:$dst, GPR:$a, GPR:$b),
+              !strconcat(opc, "wt"), " $dst, $a, $b",
+              [(set GPR:$dst, (sra (opnode GPR:$a,
+                                    (sra GPR:$b, 16)), 16))]>,
+            Requires<[IsARM, HasV5TE]>;
+}
+
+multiclass AI_smla<string opc, PatFrag opnode> {
+  def BB : AI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$acc),
+              !strconcat(opc, "bb"), " $dst, $a, $b, $acc",
+              [(set GPR:$dst, (add GPR:$acc,
+                               (opnode (sext_inreg GPR:$a, i16),
+                                       (sext_inreg GPR:$b, i16))))]>,
+           Requires<[IsARM, HasV5TE]>;
+  def BT : AI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$acc),
+              !strconcat(opc, "bt"), " $dst, $a, $b, $acc",
+              [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16),
+                                                     (sra GPR:$b, 16))))]>,
+           Requires<[IsARM, HasV5TE]>;
+  def TB : AI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$acc),
+              !strconcat(opc, "tb"), " $dst, $a, $b, $acc",
+              [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, 16),
+                                                 (sext_inreg GPR:$b, i16))))]>,
+           Requires<[IsARM, HasV5TE]>;
+  def TT : AI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$acc),
+              !strconcat(opc, "tt"), " $dst, $a, $b, $acc",
+              [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, 16),
+                                                     (sra GPR:$b, 16))))]>,
+            Requires<[IsARM, HasV5TE]>;
+
+  def WB : AI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$acc),
+              !strconcat(opc, "wb"), " $dst, $a, $b, $acc",
+              [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
+                                            (sext_inreg GPR:$b, i16)), 16)))]>,
+           Requires<[IsARM, HasV5TE]>;
+  def WT : AI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$acc),
+              !strconcat(opc, "wt"), " $dst, $a, $b, $acc",
+              [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
+                                                   (sra GPR:$b, 16)), 16)))]>,
+            Requires<[IsARM, HasV5TE]>;
+}
+
+defm SMUL : AI_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+
+// TODO: Halfword multiply accumulate long: SMLAL<x><y>
+// TODO: Dual halfword multiply: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
+
+//===----------------------------------------------------------------------===//
+//  Misc. Arithmetic Instructions.
+//
+
+def CLZ  : AI<(ops GPR:$dst, GPR:$src),
+              "clz", " $dst, $src",
+              [(set GPR:$dst, (ctlz GPR:$src))]>, Requires<[IsARM, HasV5T]>;
+
+def REV  : AI<(ops GPR:$dst, GPR:$src),
+              "rev", " $dst, $src",
+              [(set GPR:$dst, (bswap GPR:$src))]>, Requires<[IsARM, HasV6]>;
+
+def REV16 : AI<(ops GPR:$dst, GPR:$src),
+               "rev16", " $dst, $src",
+               [(set GPR:$dst,
+                   (or (and (srl GPR:$src, 8), 0xFF),
+                       (or (and (shl GPR:$src, 8), 0xFF00),
+                           (or (and (srl GPR:$src, 8), 0xFF0000),
+                               (and (shl GPR:$src, 8), 0xFF000000)))))]>,
+               Requires<[IsARM, HasV6]>;
+
+def REVSH : AI<(ops GPR:$dst, GPR:$src),
+               "revsh", " $dst, $src",
+               [(set GPR:$dst,
+                  (sext_inreg
+                    (or (srl (and GPR:$src, 0xFF00), 8),
+                        (shl GPR:$src, 8)), i16))]>,
+               Requires<[IsARM, HasV6]>;
+
+def PKHBT : AI<(ops GPR:$dst, GPR:$src1, GPR:$src2, i32imm:$shamt),
+               "pkhbt", " $dst, $src1, $src2, LSL $shamt",
+               [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF),
+                                   (and (shl GPR:$src2, (i32 imm:$shamt)),
+                                        0xFFFF0000)))]>,
+               Requires<[IsARM, HasV6]>;
+
+// Alternate cases for PKHBT where identities eliminate some nodes.
+def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (and GPR:$src2, 0xFFFF0000)),
+               (PKHBT GPR:$src1, GPR:$src2, 0)>;
+def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)),
+               (PKHBT GPR:$src1, GPR:$src2, imm16_31:$shamt)>;
+
+
+def PKHTB : AI<(ops  GPR:$dst, GPR:$src1, GPR:$src2, i32imm:$shamt),
+               "pkhtb", " $dst, $src1, $src2, ASR $shamt",
+               [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000),
+                                   (and (sra GPR:$src2, imm16_31:$shamt),
+                                        0xFFFF)))]>, Requires<[IsARM, HasV6]>;
+
+// Alternate cases for PKHTB where identities eliminate some nodes.  Note that
+// a shift amount of 0 is *not legal* here; that encoding is PKHBT instead.
+def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, 16)),
+               (PKHTB GPR:$src1, GPR:$src2, 16)>;
+def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000),
+                   (and (srl GPR:$src2, imm1_15:$shamt), 0xFFFF)),
+               (PKHTB GPR:$src1, GPR:$src2, imm1_15:$shamt)>;
+
+
+//===----------------------------------------------------------------------===//
+//  Comparison Instructions...
+//
+
+defm CMP  : AI1_cmp_irs<"cmp", BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
+defm CMN  : AI1_cmp_irs<"cmn", BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>;
+
+// Note that TST/TEQ don't set all the same flags that CMP does!
+defm TST  : AI1_cmp_irs<"tst", BinOpFrag<(ARMcmpNZ (and node:$LHS, node:$RHS), 0)>>;
+defm TEQ  : AI1_cmp_irs<"teq", BinOpFrag<(ARMcmpNZ (xor node:$LHS, node:$RHS), 0)>>;
+
+defm CMPnz : AI1_cmp_irs<"cmp", BinOpFrag<(ARMcmpNZ node:$LHS, node:$RHS)>>;
+defm CMNnz : AI1_cmp_irs<"cmn", BinOpFrag<(ARMcmpNZ node:$LHS,(ineg node:$RHS))>>;
+
+def : ARMPat<(ARMcmp GPR:$src, so_imm_neg:$imm),
+             (CMNri  GPR:$src, so_imm_neg:$imm)>;
+
+def : ARMPat<(ARMcmpNZ GPR:$src, so_imm_neg:$imm),
+             (CMNri  GPR:$src, so_imm_neg:$imm)>;
+
+
+// Conditional moves
+// FIXME: should be able to write a pattern for ARMcmov, but can't use
+// a two-value operand where a dag node expects two operands. :( 
+def MOVCCr : AI<(ops GPR:$dst, GPR:$false, GPR:$true),
+                "mov", " $dst, $true",
+      [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>,
+                RegConstraint<"$false = $dst">;
+
+def MOVCCs : AI<(ops GPR:$dst, GPR:$false, so_reg:$true),
+                "mov", " $dst, $true",
+   [/*(set GPR:$dst, (ARMcmov GPR:$false, so_reg:$true, imm:$cc, CCR:$ccr))*/]>,
+                RegConstraint<"$false = $dst">;
+
+def MOVCCi : AI<(ops GPR:$dst, GPR:$false, so_imm:$true),
+                "mov", " $dst, $true",
+   [/*(set GPR:$dst, (ARMcmov GPR:$false, so_imm:$true, imm:$cc, CCR:$ccr))*/]>,
+                RegConstraint<"$false = $dst">;
+
+
+// LEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+def LEApcrel : AXI1<(ops GPR:$dst, i32imm:$label, pred:$p),
+                   !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(",
+                                         "${:private}PCRELL${:uid}+8))\n"),
+                              !strconcat("${:private}PCRELL${:uid}:\n\t",
+                                         "add$p $dst, pc, #PCRELV${:uid}")),
+                   []>;
+
+def LEApcrelJT : AXI1<(ops GPR:$dst, i32imm:$label, i32imm:$id, pred:$p),
+          !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(",
+                                         "${:private}PCRELL${:uid}+8))\n"),
+                              !strconcat("${:private}PCRELL${:uid}:\n\t",
+                                         "add$p $dst, pc, #PCRELV${:uid}")),
+                   []>;
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+let isCall = 1,
+  Defs = [R0, R12, LR, CPSR] in {
+  def TPsoft : AXI<(ops),
+               "bl __aeabi_read_tp",
+               [(set R0, ARMthread_pointer)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// ConstantPool, GlobalAddress, and JumpTable
+def : ARMPat<(ARMWrapper  tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>;
+def : ARMPat<(ARMWrapper  tconstpool  :$dst), (LEApcrel tconstpool  :$dst)>;
+def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id),
+             (LEApcrelJT tjumptable:$dst, imm:$id)>;
+
+// Large immediate handling.
+
+// Two-piece so_imms.
+let isReMaterializable = 1 in
+def MOVi2pieces : AI1x2<(ops GPR:$dst, so_imm2part:$src),
+                         "mov", " $dst, $src",
+                         [(set GPR:$dst, so_imm2part:$src)]>;
+
+def : ARMPat<(or GPR:$LHS, so_imm2part:$RHS),
+              (ORRri (ORRri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
+                     (so_imm2part_2 imm:$RHS))>;
+def : ARMPat<(xor GPR:$LHS, so_imm2part:$RHS),
+              (EORri (EORri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
+                     (so_imm2part_2 imm:$RHS))>;
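+
+// For example, 0x00FF00FF is not itself a valid so_imm, but it splits into the
+// two rotated-immediate pieces 0x000000FF and 0x00FF0000, so the patterns
+// above can build it with two instructions (assuming so_imm2part accepts that
+// split).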
+
+// TODO: add,sub,and, 3-instr forms?
+
+
+// Direct calls
+def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>;
+
+// zextload i1 -> zextload i8
+def : ARMPat<(zextloadi1 addrmode2:$addr),  (LDRB addrmode2:$addr)>;
+
+// extload -> zextload
+def : ARMPat<(extloadi1  addrmode2:$addr),  (LDRB addrmode2:$addr)>;
+def : ARMPat<(extloadi8  addrmode2:$addr),  (LDRB addrmode2:$addr)>;
+def : ARMPat<(extloadi16 addrmode3:$addr),  (LDRH addrmode3:$addr)>;
+
+// truncstore i1 -> truncstore i8
+def : ARMPat<(truncstorei1 GPR:$src, addrmode2:$dst), 
+             (STRB GPR:$src, addrmode2:$dst)>;
+def : ARMPat<(pre_truncsti1 GPR:$src, GPR:$base, am2offset:$offset), 
+             (STRB_PRE GPR:$src, GPR:$base, am2offset:$offset)>;
+def : ARMPat<(post_truncsti1 GPR:$src, GPR:$base, am2offset:$offset), 
+             (STRB_POST GPR:$src, GPR:$base, am2offset:$offset)>;
+
+// smul* and smla*
+def : ARMV5TEPat<(mul (sra (shl GPR:$a, 16), 16), (sra (shl GPR:$b, 16), 16)),
+                 (SMULBB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b),
+                 (SMULBB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra (shl GPR:$a, 16), 16), (sra GPR:$b, 16)),
+                 (SMULBT GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, 16)),
+                 (SMULBT GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra GPR:$a, 16), (sra (shl GPR:$b, 16), 16)),
+                 (SMULTB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra GPR:$a, 16), sext_16_node:$b),
+                (SMULTB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, 16), 16)), 16),
+                 (SMULWB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), 16),
+                 (SMULWB GPR:$a, GPR:$b)>;
+
+def : ARMV5TEPat<(add GPR:$acc,
+                      (mul (sra (shl GPR:$a, 16), 16),
+                           (sra (shl GPR:$b, 16), 16))),
+                 (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+                      (mul sext_16_node:$a, sext_16_node:$b)),
+                 (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+                      (mul (sra (shl GPR:$a, 16), 16), (sra GPR:$b, 16))),
+                 (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+                      (mul sext_16_node:$a, (sra GPR:$b, 16))),
+                 (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+                      (mul (sra GPR:$a, 16), (sra (shl GPR:$b, 16), 16))),
+                 (SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+                      (mul (sra GPR:$a, 16), sext_16_node:$b)),
+                 (SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+                      (sra (mul GPR:$a, (sra (shl GPR:$b, 16), 16)), 16)),
+                 (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+                      (sra (mul GPR:$a, sext_16_node:$b), 16)),
+                 (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
+
+//===----------------------------------------------------------------------===//
+// Thumb Support
+//
+
+include "ARMInstrThumb.td"
+
+//===----------------------------------------------------------------------===//
+// Floating Point Support
+//
+
+include "ARMInstrVFP.td"
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
new file mode 100644
index 0000000..27231da
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -0,0 +1,596 @@
+//===- ARMInstrThumb.td - Thumb support for ARM ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Thumb instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Thumb specific DAG Nodes.
+//
+
+def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall,
+                      [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+// TI - Thumb instruction.
+
+// ThumbPat - Same as Pat<>, but requires that the compiler be in Thumb mode.
+class ThumbPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb];
+}
+
+class ThumbV5Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb, HasV5T];
+}
+
+class ThumbI<dag ops, AddrMode am, SizeFlagVal sz,
+             string asm, string cstr, list<dag> pattern>
+  // FIXME: Set all opcodes to 0 for now.
+  : InstARM<0, am, sz, IndexModeNone, cstr> {
+  let OperandList = ops;
+  let AsmString   = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb];
+}
+
+class TI<dag ops, string asm, list<dag> pattern>
+  : ThumbI<ops, AddrModeNone, Size2Bytes, asm, "", pattern>;
+class TI1<dag ops, string asm, list<dag> pattern>
+  : ThumbI<ops, AddrModeT1, Size2Bytes, asm, "", pattern>;
+class TI2<dag ops, string asm, list<dag> pattern>
+  : ThumbI<ops, AddrModeT2, Size2Bytes, asm, "", pattern>;
+class TI4<dag ops, string asm, list<dag> pattern>
+  : ThumbI<ops, AddrModeT4, Size2Bytes, asm, "", pattern>;
+class TIs<dag ops, string asm, list<dag> pattern>
+  : ThumbI<ops, AddrModeTs, Size2Bytes, asm, "", pattern>;
+
+// Two-address instructions
+class TIt<dag ops, string asm, list<dag> pattern>
+  : ThumbI<ops, AddrModeNone, Size2Bytes, asm, "$lhs = $dst", pattern>;
+
+// BL, BLX(1) are translated by the assembler into two instructions.
+class TIx2<dag ops, string asm, list<dag> pattern>
+  : ThumbI<ops, AddrModeNone, Size4Bytes, asm, "", pattern>;
+
+// BR_JT instructions
+class TJTI<dag ops, string asm, list<dag> pattern>
+  : ThumbI<ops, AddrModeNone, SizeSpecial, asm, "", pattern>;
+
+def imm_neg_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(-(int)N->getValue(), MVT::i32);
+}]>;
+def imm_comp_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~((uint32_t)N->getValue()), MVT::i32);
+}]>;
+
+
+/// imm0_7 predicate - True if the 32-bit immediate is in the range [0,7].
+def imm0_7 : PatLeaf<(i32 imm), [{
+  return (uint32_t)N->getValue() < 8;
+}]>;
+def imm0_7_neg : PatLeaf<(i32 imm), [{
+  return (uint32_t)-N->getValue() < 8;
+}], imm_neg_XFORM>;
+
+def imm0_255 : PatLeaf<(i32 imm), [{
+  return (uint32_t)N->getValue() < 256;
+}]>;
+def imm0_255_comp : PatLeaf<(i32 imm), [{
+  return ~((uint32_t)N->getValue()) < 256;
+}]>;
+
+def imm8_255 : PatLeaf<(i32 imm), [{
+  return (uint32_t)N->getValue() >= 8 && (uint32_t)N->getValue() < 256;
+}]>;
+def imm8_255_neg : PatLeaf<(i32 imm), [{
+  unsigned Val = -N->getValue();
+  return Val >= 8 && Val < 256;
+}], imm_neg_XFORM>;
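+
+// For example, (add GPR:$lhs, -5) matches imm0_7_neg (5 is in [0,7]), and
+// imm_neg_XFORM turns the -5 back into 5 so that the tSUBi3 definition below
+// can select a sub with immediate 5.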
+
+// Break immediates up into two pieces: an immediate + a left shift. This uses
+// thumb_immshifted to match, with thumb_immshifted_val and
+// thumb_immshifted_shamt extracting the value / shift pieces.
+def thumb_immshifted : PatLeaf<(imm), [{
+  return ARM_AM::isThumbImmShiftedVal((unsigned)N->getValue());
+}]>;
+
+def thumb_immshifted_val : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getThumbImmNonShiftedVal((unsigned)N->getValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+def thumb_immshifted_shamt : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getThumbImmValShift((unsigned)N->getValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
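+
+// For example, 0x00AB0000 is the 8-bit value 0xAB shifted left by 16, so it
+// can be materialized as a tMOVi8 of 0xAB followed by a tLSLri by 16 (see the
+// thumb_immshifted pattern near the end of this file).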
+
+// Define Thumb specific addressing modes.
+
+// t_addrmode_rr := reg + reg
+//
+def t_addrmode_rr : Operand<i32>,
+                    ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> {
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let MIOperandInfo = (ops GPR:$base, GPR:$offsreg);
+}
+
+// t_addrmode_s4 := reg + reg
+//                  reg + imm5 * 4
+//
+def t_addrmode_s4 : Operand<i32>,
+                    ComplexPattern<i32, 3, "SelectThumbAddrModeS4", []> {
+  let PrintMethod = "printThumbAddrModeS4Operand";
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm, GPR:$offsreg);
+}
+
+// t_addrmode_s2 := reg + reg
+//                  reg + imm5 * 2
+//
+def t_addrmode_s2 : Operand<i32>,
+                    ComplexPattern<i32, 3, "SelectThumbAddrModeS2", []> {
+  let PrintMethod = "printThumbAddrModeS2Operand";
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm, GPR:$offsreg);
+}
+
+// t_addrmode_s1 := reg + reg
+//                  reg + imm5
+//
+def t_addrmode_s1 : Operand<i32>,
+                    ComplexPattern<i32, 3, "SelectThumbAddrModeS1", []> {
+  let PrintMethod = "printThumbAddrModeS1Operand";
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm, GPR:$offsreg);
+}
+
+// t_addrmode_sp := sp + imm8 * 4
+//
+def t_addrmode_sp : Operand<i32>,
+                    ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> {
+  let PrintMethod = "printThumbAddrModeSPOperand";
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
+def tADJCALLSTACKUP :
+PseudoInst<(ops i32imm:$amt),
+           "@ tADJCALLSTACKUP $amt",
+           [(ARMcallseq_end imm:$amt)]>, Imp<[SP],[SP]>, Requires<[IsThumb]>;
+
+def tADJCALLSTACKDOWN : 
+PseudoInst<(ops i32imm:$amt),
+           "@ tADJCALLSTACKDOWN $amt",
+           [(ARMcallseq_start imm:$amt)]>, Imp<[SP],[SP]>, Requires<[IsThumb]>;
+
+let isNotDuplicable = 1 in
+def tPICADD : TIt<(ops GPR:$dst, GPR:$lhs, pclabel:$cp),
+                  "$cp:\n\tadd $dst, pc",
+                  [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>;
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions.
+//
+
+let isReturn = 1, isTerminator = 1 in {
+  def tBX_RET : TI<(ops), "bx lr", [(ARMretflag)]>;
+  // Alternative return instruction used by vararg functions.
+  def tBX_RET_vararg : TI<(ops GPR:$dst), "bx $dst", []>;
+}
+
+// FIXME: remove when we have a way to mark an MI with these properties.
+let isLoad = 1, isReturn = 1, isTerminator = 1 in
+def tPOP_RET : TI<(ops reglist:$dst1, variable_ops),
+                   "pop $dst1", []>;
+
+let isCall = 1, noResults = 1, 
+  Defs = [R0, R1, R2, R3, LR,
+          D0, D1, D2, D3, D4, D5, D6, D7] in {
+  def tBL  : TIx2<(ops i32imm:$func, variable_ops),
+                   "bl ${func:call}",
+                   [(ARMtcall tglobaladdr:$func)]>;
+  // ARMv5T and above
+  def tBLXi : TIx2<(ops i32imm:$func, variable_ops),
+                    "blx ${func:call}",
+                    [(ARMcall tglobaladdr:$func)]>, Requires<[HasV5T]>;
+  def tBLXr : TI<(ops GPR:$dst, variable_ops),
+                  "blx $dst",
+                  [(ARMtcall GPR:$dst)]>, Requires<[HasV5T]>;
+  // ARMv4T
+  def tBX : TIx2<(ops GPR:$dst, variable_ops),
+                  "cpy lr, pc\n\tbx $dst",
+                  [(ARMcall_nolink GPR:$dst)]>;
+}
+
+let isBranch = 1, isTerminator = 1, noResults = 1 in {
+  let isBarrier = 1 in {
+    let isPredicable = 1 in
+    def tB   : TI<(ops brtarget:$dst), "b $dst", [(br bb:$dst)]>;
+
+  // Far jump
+  def tBfar  : TIx2<(ops brtarget:$dst), "bl $dst\t@ far jump", []>;
+
+  def tBR_JTr : TJTI<(ops GPR:$dst, jtblock_operand:$jt, i32imm:$id),
+                     "cpy pc, $dst \n\t.align\t2\n$jt",
+                     [(ARMbrjt GPR:$dst, tjumptable:$jt, imm:$id)]>;
+  }
+}
+
+// FIXME: should be able to write a pattern for ARMBrcond, but can't use
+// a two-value operand where a dag node expects two operands. :( 
+let isBranch = 1, isTerminator = 1, noResults = 1 in
+  def tBcc : TI<(ops brtarget:$dst, pred:$cc), "b$cc $dst",
+                 [/*(ARMbrcond bb:$dst, imm:$cc)*/]>;
+
+//===----------------------------------------------------------------------===//
+//  Load Store Instructions.
+//
+
+let isLoad = 1 in {
+def tLDR : TI4<(ops GPR:$dst, t_addrmode_s4:$addr),
+               "ldr $dst, $addr",
+               [(set GPR:$dst, (load t_addrmode_s4:$addr))]>;
+
+def tLDRB : TI1<(ops GPR:$dst, t_addrmode_s1:$addr),
+                "ldrb $dst, $addr",
+                [(set GPR:$dst, (zextloadi8 t_addrmode_s1:$addr))]>;
+
+def tLDRH : TI2<(ops GPR:$dst, t_addrmode_s2:$addr),
+                "ldrh $dst, $addr",
+                [(set GPR:$dst, (zextloadi16 t_addrmode_s2:$addr))]>;
+
+def tLDRSB : TI1<(ops GPR:$dst, t_addrmode_rr:$addr),
+                 "ldrsb $dst, $addr",
+                 [(set GPR:$dst, (sextloadi8 t_addrmode_rr:$addr))]>;
+
+def tLDRSH : TI2<(ops GPR:$dst, t_addrmode_rr:$addr),
+                 "ldrsh $dst, $addr",
+                 [(set GPR:$dst, (sextloadi16 t_addrmode_rr:$addr))]>;
+
+def tLDRspi : TIs<(ops GPR:$dst, t_addrmode_sp:$addr),
+                  "ldr $dst, $addr",
+                  [(set GPR:$dst, (load t_addrmode_sp:$addr))]>;
+
+// Special instruction for restore. It cannot clobber the condition register
+// when it's expanded by eliminateCallFramePseudoInstr().
+def tRestore : TIs<(ops GPR:$dst, t_addrmode_sp:$addr),
+                    "ldr $dst, $addr", []>;
+
+// Load tconstpool
+def tLDRpci : TIs<(ops GPR:$dst, i32imm:$addr),
+                  "ldr $dst, $addr",
+                  [(set GPR:$dst, (load (ARMWrapper tconstpool:$addr)))]>;
+
+// Special LDR for loads from non-pc-relative constpools.
+let isReMaterializable = 1 in
+def tLDRcp  : TIs<(ops GPR:$dst, i32imm:$addr),
+                  "ldr $dst, $addr", []>;
+} // isLoad
+
+let isStore = 1 in {
+def tSTR : TI4<(ops GPR:$src, t_addrmode_s4:$addr),
+               "str $src, $addr",
+               [(store GPR:$src, t_addrmode_s4:$addr)]>;
+
+def tSTRB : TI1<(ops GPR:$src, t_addrmode_s1:$addr),
+                 "strb $src, $addr",
+                 [(truncstorei8 GPR:$src, t_addrmode_s1:$addr)]>;
+
+def tSTRH : TI2<(ops GPR:$src, t_addrmode_s2:$addr),
+                 "strh $src, $addr",
+                 [(truncstorei16 GPR:$src, t_addrmode_s2:$addr)]>;
+
+def tSTRspi : TIs<(ops GPR:$src, t_addrmode_sp:$addr),
+                   "str $src, $addr",
+                   [(store GPR:$src, t_addrmode_sp:$addr)]>;
+
+// Special instruction for spill. It cannot clobber the condition register
+// when it's expanded by eliminateCallFramePseudoInstr().
+def tSpill : TIs<(ops GPR:$src, t_addrmode_sp:$addr),
+                  "str $src, $addr", []>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+// TODO: A7-44: LDMIA - load multiple
+
+let isLoad = 1 in
+def tPOP : TI<(ops reglist:$dst1, variable_ops),
+               "pop $dst1", []>;
+
+let isStore = 1 in
+def tPUSH : TI<(ops reglist:$src1, variable_ops),
+                "push $src1", []>;
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions.
+//
+
+// Add with carry
+def tADC : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
+               "adc $dst, $rhs",
+               [(set GPR:$dst, (adde GPR:$lhs, GPR:$rhs))]>;
+
+def tADDS : TI<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
+               "add $dst, $lhs, $rhs",
+               [(set GPR:$dst, (addc GPR:$lhs, GPR:$rhs))]>;
+
+
+def tADDi3 : TI<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs),
+                "add $dst, $lhs, $rhs",
+                [(set GPR:$dst, (add GPR:$lhs, imm0_7:$rhs))]>;
+
+def tADDi8 : TIt<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs),
+                 "add $dst, $rhs",
+                 [(set GPR:$dst, (add GPR:$lhs, imm8_255:$rhs))]>;
+
+def tADDrr : TI<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
+                "add $dst, $lhs, $rhs",
+                [(set GPR:$dst, (add GPR:$lhs, GPR:$rhs))]>;
+
+def tADDhirr : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
+                   "add $dst, $rhs", []>;
+
+def tADDrPCi : TI<(ops GPR:$dst, i32imm:$rhs),
+                  "add $dst, pc, $rhs * 4", []>;
+def tADDrSPi : TI<(ops GPR:$dst, GPR:$sp, i32imm:$rhs),
+                  "add $dst, $sp, $rhs * 4", []>;
+def tADDspi : TIt<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs),
+                  "add $dst, $rhs * 4", []>;
+
+def tAND : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
+                "and $dst, $rhs",
+                [(set GPR:$dst, (and GPR:$lhs, GPR:$rhs))]>;
+
+def tASRri : TI<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs),
+                "asr $dst, $lhs, $rhs",
+                [(set GPR:$dst, (sra GPR:$lhs, imm:$rhs))]>;
+
+def tASRrr : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
+                 "asr $dst, $rhs",
+                 [(set GPR:$dst, (sra GPR:$lhs, GPR:$rhs))]>;
+
+def tBIC : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
+               "bic $dst, $rhs",
+               [(set GPR:$dst, (and GPR:$lhs, (not GPR:$rhs)))]>;
+
+
+def tCMN : TI<(ops GPR:$lhs, GPR:$rhs),
+              "cmn $lhs, $rhs",
+              [(ARMcmp GPR:$lhs, (ineg GPR:$rhs))]>;
+
+def tCMPi8 : TI<(ops GPR:$lhs, i32imm:$rhs),
+               "cmp $lhs, $rhs",
+               [(ARMcmp GPR:$lhs, imm0_255:$rhs)]>;
+
+def tCMPr : TI<(ops GPR:$lhs, GPR:$rhs),
+               "cmp $lhs, $rhs",
+               [(ARMcmp GPR:$lhs, GPR:$rhs)]>;
+
+def tTST  : TI<(ops GPR:$lhs, GPR:$rhs),
+               "tst $lhs, $rhs",
+               [(ARMcmpNZ (and GPR:$lhs, GPR:$rhs), 0)]>;
+
+def tCMNNZ : TI<(ops GPR:$lhs, GPR:$rhs),
+                "cmn $lhs, $rhs",
+                [(ARMcmpNZ GPR:$lhs, (ineg GPR:$rhs))]>;
+
+def tCMPNZi8 : TI<(ops GPR:$lhs, i32imm:$rhs),
+                 "cmp $lhs, $rhs",
+                 [(ARMcmpNZ GPR:$lhs, imm0_255:$rhs)]>;
+
+def tCMPNZr : TI<(ops GPR:$lhs, GPR:$rhs),
+                 "cmp $lhs, $rhs",
+                 [(ARMcmpNZ GPR:$lhs, GPR:$rhs)]>;
+
+// TODO: A7-37: CMP(3) - cmp hi regs
+
+def tEOR : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
+               "eor $dst, $rhs",
+               [(set GPR:$dst, (xor GPR:$lhs, GPR:$rhs))]>;
+
+def tLSLri : TI<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs),
+                "lsl $dst, $lhs, $rhs",
+                [(set GPR:$dst, (shl GPR:$lhs, imm:$rhs))]>;
+
+def tLSLrr : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
+                 "lsl $dst, $rhs",
+                 [(set GPR:$dst, (shl GPR:$lhs, GPR:$rhs))]>;
+
+def tLSRri : TI<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs),
+                "lsr $dst, $lhs, $rhs",
+                [(set GPR:$dst, (srl GPR:$lhs, imm:$rhs))]>;
+
+def tLSRrr : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
+                 "lsr $dst, $rhs",
+                 [(set GPR:$dst, (srl GPR:$lhs, GPR:$rhs))]>;
+
+// FIXME: This is not rematerializable because mov changes the condition code.
+def tMOVi8 : TI<(ops GPR:$dst, i32imm:$src),
+                 "mov $dst, $src",
+                 [(set GPR:$dst, imm0_255:$src)]>;
+
+// TODO: A7-73: MOV(2) - mov setting flag.
+
+
+// Note: MOV(2) of two low regs updates the flags, so we emit this as 'cpy',
+// which is MOV(3).  This also supports high registers.
+def tMOVr  : TI<(ops GPR:$dst, GPR:$src),
+                 "cpy $dst, $src", []>;
+
+def tMUL : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
+               "mul $dst, $rhs",
+               [(set GPR:$dst, (mul GPR:$lhs, GPR:$rhs))]>;
+
+def tMVN : TI<(ops GPR:$dst, GPR:$src),
+              "mvn $dst, $src",
+              [(set GPR:$dst, (not GPR:$src))]>;
+
+def tNEG : TI<(ops GPR:$dst, GPR:$src),
+              "neg $dst, $src",
+              [(set GPR:$dst, (ineg GPR:$src))]>;
+
+def tORR : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
+               "orr $dst, $rhs",
+               [(set GPR:$dst, (or GPR:$lhs, GPR:$rhs))]>;
+
+
+def tREV : TI<(ops GPR:$dst, GPR:$src),
+              "rev $dst, $src",
+              [(set GPR:$dst, (bswap GPR:$src))]>, 
+              Requires<[IsThumb, HasV6]>;
+
+def tREV16 : TI<(ops GPR:$dst, GPR:$src),
+                "rev16 $dst, $src",
+                [(set GPR:$dst,
+                    (or (and (srl GPR:$src, 8), 0xFF),
+                        (or (and (shl GPR:$src, 8), 0xFF00),
+                            (or (and (srl GPR:$src, 8), 0xFF0000),
+                                (and (shl GPR:$src, 8), 0xFF000000)))))]>,
+                Requires<[IsThumb, HasV6]>;
+
+def tREVSH : TI<(ops GPR:$dst, GPR:$src),
+                "revsh $dst, $src",
+                [(set GPR:$dst,
+                   (sext_inreg
+                     (or (srl (and GPR:$src, 0xFFFF), 8),
+                         (shl GPR:$src, 8)), i16))]>,
+                Requires<[IsThumb, HasV6]>;
+
+def tROR : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
+                "ror $dst, $rhs",
+                [(set GPR:$dst, (rotr GPR:$lhs, GPR:$rhs))]>;
+
+
+// Subtract with carry
+def tSBC : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
+                "sbc $dst, $rhs",
+                [(set GPR:$dst, (sube GPR:$lhs, GPR:$rhs))]>;
+
+def tSUBS : TI<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
+                "sub $dst, $lhs, $rhs",
+               [(set GPR:$dst, (subc GPR:$lhs, GPR:$rhs))]>;
+
+
+// TODO: A7-96: STMIA - store multiple.
+
+def tSUBi3 : TI<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs),
+                "sub $dst, $lhs, $rhs",
+                [(set GPR:$dst, (add GPR:$lhs, imm0_7_neg:$rhs))]>;
+                
+def tSUBi8 : TIt<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs),
+                  "sub $dst, $rhs",
+                  [(set GPR:$dst, (add GPR:$lhs, imm8_255_neg:$rhs))]>;
+                
+def tSUBrr : TI<(ops GPR:$dst, GPR:$lhs, GPR:$rhs),
+                "sub $dst, $lhs, $rhs",
+                [(set GPR:$dst, (sub GPR:$lhs, GPR:$rhs))]>;
+
+def tSUBspi : TIt<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs),
+                  "sub $dst, $rhs * 4", []>;
+
+def tSXTB  : TI<(ops GPR:$dst, GPR:$src),
+                "sxtb $dst, $src",
+                [(set GPR:$dst, (sext_inreg GPR:$src, i8))]>,
+                Requires<[IsThumb, HasV6]>;
+def tSXTH  : TI<(ops GPR:$dst, GPR:$src),
+                "sxth $dst, $src",
+                [(set GPR:$dst, (sext_inreg GPR:$src, i16))]>,
+                Requires<[IsThumb, HasV6]>;
+
+
+def tUXTB  : TI<(ops GPR:$dst, GPR:$src),
+                "uxtb $dst, $src",
+                [(set GPR:$dst, (and GPR:$src, 0xFF))]>,
+                Requires<[IsThumb, HasV6]>;
+def tUXTH  : TI<(ops GPR:$dst, GPR:$src),
+                "uxth $dst, $src",
+                [(set GPR:$dst, (and GPR:$src, 0xFFFF))]>, 
+                Requires<[IsThumb, HasV6]>;
+
+
+// tMOVCCr - Conditional move, used to implement the Thumb SELECT_CC DAG
+// operation.
+// Expanded by the scheduler into a branch sequence.
+let usesCustomDAGSchedInserter = 1 in  // Expanded by the scheduler.
+  def tMOVCCr :
+  PseudoInst<(ops GPR:$dst, GPR:$false, GPR:$true, pred:$cc),
+              "@ tMOVCCr $cc",
+              [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc))*/]>;
+
+// tLEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+def tLEApcrel : TIx2<(ops GPR:$dst, i32imm:$label),
+                    !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(",
+                                          "${:private}PCRELL${:uid}+4))\n"),
+                               !strconcat("\tmov $dst, #PCRELV${:uid}\n",
+                                  "${:private}PCRELL${:uid}:\n\tadd $dst, pc")),
+                    []>;
+
+def tLEApcrelJT : TIx2<(ops GPR:$dst, i32imm:$label, i32imm:$id),
+          !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(",
+                                         "${:private}PCRELL${:uid}+4))\n"),
+                     !strconcat("\tmov $dst, #PCRELV${:uid}\n",
+                                "${:private}PCRELL${:uid}:\n\tadd $dst, pc")),
+                    []>;
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+let isCall = 1,
+  Defs = [R0, LR] in {
+  def tTPsoft  : TIx2<(ops),
+               "bl __aeabi_read_tp",
+               [(set R0, ARMthread_pointer)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// ConstantPool, GlobalAddress
+def : ThumbPat<(ARMWrapper  tglobaladdr :$dst), (tLEApcrel tglobaladdr :$dst)>;
+def : ThumbPat<(ARMWrapper  tconstpool  :$dst), (tLEApcrel tconstpool  :$dst)>;
+
+// JumpTable
+def : ThumbPat<(ARMWrapperJT tjumptable:$dst, imm:$id),
+               (tLEApcrelJT tjumptable:$dst, imm:$id)>;
+
+// Direct calls
+def : ThumbPat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>;
+def : ThumbV5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>;
+
+// Indirect calls to ARM routines
+def : ThumbV5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>;
+
+// zextload i1 -> zextload i8
+def : ThumbPat<(zextloadi1 t_addrmode_s1:$addr),
+               (tLDRB t_addrmode_s1:$addr)>;
+                  
+// extload -> zextload
+def : ThumbPat<(extloadi1  t_addrmode_s1:$addr),  (tLDRB t_addrmode_s1:$addr)>;
+def : ThumbPat<(extloadi8  t_addrmode_s1:$addr),  (tLDRB t_addrmode_s1:$addr)>;
+def : ThumbPat<(extloadi16 t_addrmode_s2:$addr),  (tLDRH t_addrmode_s2:$addr)>;
+
+// truncstore i1 -> truncstore i8
+def : ThumbPat<(truncstorei1 GPR:$src, t_addrmode_s1:$dst), 
+               (tSTRB GPR:$src, t_addrmode_s1:$dst)>;
+
+// Large immediate handling.
+
+// Two piece imms.
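+// For example (illustrative only), 0x1FE0 == 0xFF << 5 is materialized as
+//   mov rN, #255
+//   lsl rN, rN, #5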
+def : ThumbPat<(i32 thumb_immshifted:$src),
+               (tLSLri (tMOVi8 (thumb_immshifted_val imm:$src)),
+                       (thumb_immshifted_shamt imm:$src))>;
+
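+// Complemented imms: e.g. (illustrative) 0xFFFFFF00 is the complement of 0xFF,
+// so it is built as "mov rN, #255" followed by "mvn rN, rN".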
+def : ThumbPat<(i32 imm0_255_comp:$src),
+               (tMVN (tMOVi8 (imm_comp_XFORM imm:$src)))>;
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
new file mode 100644
index 0000000..4bb9f04
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -0,0 +1,386 @@
+//===- ARMInstrVFP.td - VFP support for ARM -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM VFP instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM VFP Instruction templates.
+//
+
+// ARM Float Instruction
+class ASI<dag ops, string opc, string asm, list<dag> pattern>
+  : AI<ops, opc, asm, pattern> {
+  // TODO: Mark the instructions with the appropriate subtarget info.
+}
+
+class ASI5<dag ops, string opc, string asm, list<dag> pattern>
+  : I<ops, AddrMode5, Size4Bytes, IndexModeNone, opc, asm, "", pattern> {
+  // TODO: Mark the instructions with the appropriate subtarget info.
+}
+
+// ARM Double Instruction
+class ADI<dag ops, string opc, string asm, list<dag> pattern>
+  : AI<ops, opc, asm, pattern> {
+  // TODO: Mark the instructions with the appropriate subtarget info.
+}
+
+class ADI5<dag ops, string opc, string asm, list<dag> pattern>
+  : I<ops, AddrMode5, Size4Bytes, IndexModeNone, opc, asm, "", pattern> {
+  // TODO: Mark the instructions with the appropriate subtarget info.
+}
+
+// Special cases.
+class AXSI<dag ops, string asm, list<dag> pattern>
+  : XI<ops, AddrModeNone, Size4Bytes, IndexModeNone, asm, "", pattern> {
+  // TODO: Mark the instructions with the appropriate subtarget info.
+}
+
+class AXSI5<dag ops, string asm, list<dag> pattern>
+  : XI<ops, AddrMode5, Size4Bytes, IndexModeNone, asm, "", pattern> {
+  // TODO: Mark the instructions with the appropriate subtarget info.
+}
+
+class AXDI<dag ops, string asm, list<dag> pattern>
+  : XI<ops, AddrModeNone, Size4Bytes, IndexModeNone, asm, "", pattern> {
+  // TODO: Mark the instructions with the appropriate subtarget info.
+}
+
+class AXDI5<dag ops, string asm, list<dag> pattern>
+  : XI<ops, AddrMode5, Size4Bytes, IndexModeNone, asm, "", pattern> {
+  // TODO: Mark the instructions with the appropriate subtarget info.
+}
+
+
+def SDT_FTOI :
+SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
+def SDT_ITOF :
+SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
+def SDT_CMPFP0 :
+SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+def SDT_FMDRR :
+SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
+                     SDTCisSameAs<1, 2>]>;
+
+def arm_ftoui  : SDNode<"ARMISD::FTOUI", SDT_FTOI>;
+def arm_ftosi  : SDNode<"ARMISD::FTOSI", SDT_FTOI>;
+def arm_sitof  : SDNode<"ARMISD::SITOF", SDT_ITOF>;
+def arm_uitof  : SDNode<"ARMISD::UITOF", SDT_ITOF>;
+def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTRet, [SDNPInFlag,SDNPOutFlag]>;
+def arm_cmpfp  : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutFlag]>;
+def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutFlag]>;
+def arm_fmdrr  : SDNode<"ARMISD::FMDRR", SDT_FMDRR>;
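+
+// Note: the FTOUI/FTOSI (and SITOF/UITOF) nodes keep the integer value in an
+// f32-typed SPR; a separate FMRS/FMSR copy moves the bits to/from a GPR.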
+
+//===----------------------------------------------------------------------===//
+//  Load / store Instructions.
+//
+
+let isLoad = 1 in {
+def FLDD  : ADI5<(ops DPR:$dst, addrmode5:$addr),
+                 "fldd", " $dst, $addr",
+                 [(set DPR:$dst, (load addrmode5:$addr))]>;
+
+def FLDS  : ASI5<(ops SPR:$dst, addrmode5:$addr),
+                 "flds", " $dst, $addr",
+                 [(set SPR:$dst, (load addrmode5:$addr))]>;
+} // isLoad
+
+let isStore = 1 in {
+def FSTD  : ADI5<(ops DPR:$src, addrmode5:$addr),
+                 "fstd", " $src, $addr",
+                 [(store DPR:$src, addrmode5:$addr)]>;
+
+def FSTS  : ASI5<(ops SPR:$src, addrmode5:$addr),
+                 "fsts", " $src, $addr",
+                 [(store SPR:$src, addrmode5:$addr)]>;
+} // isStore
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+let isLoad = 1 in {
+def FLDMD : AXDI5<(ops addrmode5:$addr, pred:$p, reglist:$dst1, variable_ops),
+                  "fldm${addr:submode}d${p} ${addr:base}, $dst1",
+                  []>;
+
+def FLDMS : AXSI5<(ops addrmode5:$addr, pred:$p, reglist:$dst1, variable_ops),
+                  "fldm${addr:submode}s${p} ${addr:base}, $dst1",
+                  []>;
+} // isLoad
+
+let isStore = 1 in {
+def FSTMD : AXDI5<(ops addrmode5:$addr, pred:$p, reglist:$src1, variable_ops),
+                 "fstm${addr:submode}d${p} ${addr:base}, $src1",
+                 []>;
+
+def FSTMS : AXSI5<(ops addrmode5:$addr, pred:$p, reglist:$src1, variable_ops),
+                 "fstm${addr:submode}s${p} ${addr:base}, $src1",
+                 []>;
+} // isStore
+
+// FLDMX, FSTMX - mixing S/D registers for pre-armv6 cores
+
+//===----------------------------------------------------------------------===//
+// FP Binary Operations.
+//
+
+def FADDD  : ADI<(ops DPR:$dst, DPR:$a, DPR:$b),
+                 "faddd", " $dst, $a, $b",
+                 [(set DPR:$dst, (fadd DPR:$a, DPR:$b))]>;
+
+def FADDS  : ASI<(ops SPR:$dst, SPR:$a, SPR:$b),
+                 "fadds", " $dst, $a, $b",
+                 [(set SPR:$dst, (fadd SPR:$a, SPR:$b))]>;
+
+def FCMPED : ADI<(ops DPR:$a, DPR:$b),
+                 "fcmped", " $a, $b",
+                 [(arm_cmpfp DPR:$a, DPR:$b)]>;
+
+def FCMPES : ASI<(ops SPR:$a, SPR:$b),
+                 "fcmpes", " $a, $b",
+                 [(arm_cmpfp SPR:$a, SPR:$b)]>;
+
+def FDIVD  : ADI<(ops DPR:$dst, DPR:$a, DPR:$b),
+                 "fdivd", " $dst, $a, $b",
+                 [(set DPR:$dst, (fdiv DPR:$a, DPR:$b))]>;
+
+def FDIVS  : ASI<(ops SPR:$dst, SPR:$a, SPR:$b),
+                 "fdivs", " $dst, $a, $b",
+                 [(set SPR:$dst, (fdiv SPR:$a, SPR:$b))]>;
+
+def FMULD  : ADI<(ops DPR:$dst, DPR:$a, DPR:$b),
+                 "fmuld", " $dst, $a, $b",
+                 [(set DPR:$dst, (fmul DPR:$a, DPR:$b))]>;
+
+def FMULS  : ASI<(ops SPR:$dst, SPR:$a, SPR:$b),
+                 "fmuls", " $dst, $a, $b",
+                 [(set SPR:$dst, (fmul SPR:$a, SPR:$b))]>;
+                 
+def FNMULD  : ADI<(ops DPR:$dst, DPR:$a, DPR:$b),
+                  "fnmuld", " $dst, $a, $b",
+                  [(set DPR:$dst, (fneg (fmul DPR:$a, DPR:$b)))]>;
+
+def FNMULS  : ASI<(ops SPR:$dst, SPR:$a, SPR:$b),
+                  "fnmuls", " $dst, $a, $b",
+                  [(set SPR:$dst, (fneg (fmul SPR:$a, SPR:$b)))]>;
+
+// Match reassociated forms only when sign-dependent rounding need not be
+// honored.
+def : Pat<(fmul (fneg DPR:$a), DPR:$b),
+          (FNMULD DPR:$a, DPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
+def : Pat<(fmul (fneg SPR:$a), SPR:$b),
+          (FNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
+
+
+def FSUBD  : ADI<(ops DPR:$dst, DPR:$a, DPR:$b),
+                 "fsubd", " $dst, $a, $b",
+                 [(set DPR:$dst, (fsub DPR:$a, DPR:$b))]>;
+
+def FSUBS  : ASI<(ops SPR:$dst, SPR:$a, SPR:$b),
+                 "fsubs", " $dst, $a, $b",
+                 [(set SPR:$dst, (fsub SPR:$a, SPR:$b))]>;
+
+//===----------------------------------------------------------------------===//
+// FP Unary Operations.
+//
+
+def FABSD  : ADI<(ops DPR:$dst, DPR:$a),
+                 "fabsd", " $dst, $a",
+                 [(set DPR:$dst, (fabs DPR:$a))]>;
+
+def FABSS  : ASI<(ops SPR:$dst, SPR:$a),
+                 "fabss", " $dst, $a",
+                 [(set SPR:$dst, (fabs SPR:$a))]>;
+
+def FCMPEZD : ADI<(ops DPR:$a),
+                  "fcmpezd", " $a",
+                  [(arm_cmpfp0 DPR:$a)]>;
+
+def FCMPEZS : ASI<(ops SPR:$a),
+                  "fcmpezs", " $a",
+                  [(arm_cmpfp0 SPR:$a)]>;
+
+def FCVTDS : ADI<(ops DPR:$dst, SPR:$a),
+                 "fcvtds", " $dst, $a",
+                 [(set DPR:$dst, (fextend SPR:$a))]>;
+
+def FCVTSD : ADI<(ops SPR:$dst, DPR:$a),
+                 "fcvtsd", " $dst, $a",
+                 [(set SPR:$dst, (fround DPR:$a))]>;
+
+def FCPYD  : ADI<(ops DPR:$dst, DPR:$a),
+                 "fcpyd", " $dst, $a", []>;
+
+def FCPYS  : ASI<(ops SPR:$dst, SPR:$a),
+                 "fcpys", " $dst, $a", []>;
+
+def FNEGD  : ADI<(ops DPR:$dst, DPR:$a),
+                 "fnegd", " $dst, $a",
+                 [(set DPR:$dst, (fneg DPR:$a))]>;
+
+def FNEGS  : ASI<(ops SPR:$dst, SPR:$a),
+                 "fnegs", " $dst, $a",
+                 [(set SPR:$dst, (fneg SPR:$a))]>;
+
+def FSQRTD  : ADI<(ops DPR:$dst, DPR:$a),
+                 "fsqrtd", " $dst, $a",
+                 [(set DPR:$dst, (fsqrt DPR:$a))]>;
+
+def FSQRTS  : ASI<(ops SPR:$dst, SPR:$a),
+                 "fsqrts", " $dst, $a",
+                 [(set SPR:$dst, (fsqrt SPR:$a))]>;
+
+//===----------------------------------------------------------------------===//
+// FP <-> GPR Copies.  Int <-> FP Conversions.
+//
+
+def IMPLICIT_DEF_SPR : PseudoInst<(ops SPR:$rD, pred:$p),
+                                  "@ IMPLICIT_DEF_SPR $rD",
+                                  [(set SPR:$rD, (undef))]>;
+def IMPLICIT_DEF_DPR : PseudoInst<(ops DPR:$rD, pred:$p),
+                                  "@ IMPLICIT_DEF_DPR $rD",
+                                  [(set DPR:$rD, (undef))]>;
+
+def FMRS   : ASI<(ops GPR:$dst, SPR:$src),
+                 "fmrs", " $dst, $src",
+                 [(set GPR:$dst, (bitconvert SPR:$src))]>;
+
+def FMSR   : ASI<(ops SPR:$dst, GPR:$src),
+                 "fmsr", " $dst, $src",
+                 [(set SPR:$dst, (bitconvert GPR:$src))]>;
+
+
+def FMRRD  : ADI<(ops GPR:$dst1, GPR:$dst2, DPR:$src),
+                 "fmrrd", " $dst1, $dst2, $src",
+                 [/* FIXME: Can't write pattern for multiple result instr*/]>;
+
+// FMDHR: GPR -> SPR
+// FMDLR: GPR -> SPR
+
+def FMDRR : ADI<(ops DPR:$dst, GPR:$src1, GPR:$src2),
+                "fmdrr", " $dst, $src1, $src2",
+                [(set DPR:$dst, (arm_fmdrr GPR:$src1, GPR:$src2))]>;
+
+// FMRDH: SPR -> GPR
+// FMRDL: SPR -> GPR
+// FMRRS: SPR -> GPR
+// FMRX : SPR system reg -> GPR
+
+// FMSRR: GPR -> SPR
+
+def FMSTAT : ASI<(ops), "fmstat", "", [(arm_fmstat)]>, Imp<[], [CPSR]>;
+
+// FMXR: GPR -> VFP system reg
+
+
+// Int to FP:
+
+def FSITOD : ADI<(ops DPR:$dst, SPR:$a),
+                 "fsitod", " $dst, $a",
+                 [(set DPR:$dst, (arm_sitof SPR:$a))]>;
+
+def FSITOS : ASI<(ops SPR:$dst, SPR:$a),
+                 "fsitos", " $dst, $a",
+                 [(set SPR:$dst, (arm_sitof SPR:$a))]>;
+
+def FUITOD : ADI<(ops DPR:$dst, SPR:$a),
+                 "fuitod", " $dst, $a",
+                 [(set DPR:$dst, (arm_uitof SPR:$a))]>;
+
+def FUITOS : ASI<(ops SPR:$dst, SPR:$a),
+                 "fuitos", " $dst, $a",
+                 [(set SPR:$dst, (arm_uitof SPR:$a))]>;
+
+// FP to Int:
+// Always set Z bit in the instruction, i.e. "round towards zero" variants.
+
+def FTOSIZD : ADI<(ops SPR:$dst, DPR:$a),
+                 "ftosizd", " $dst, $a",
+                 [(set SPR:$dst, (arm_ftosi DPR:$a))]>;
+
+def FTOSIZS : ASI<(ops SPR:$dst, SPR:$a),
+                 "ftosizs", " $dst, $a",
+                 [(set SPR:$dst, (arm_ftosi SPR:$a))]>;
+
+def FTOUIZD : ADI<(ops SPR:$dst, DPR:$a),
+                 "ftouizd", " $dst, $a",
+                 [(set SPR:$dst, (arm_ftoui DPR:$a))]>;
+
+def FTOUIZS : ASI<(ops SPR:$dst, SPR:$a),
+                 "ftouizs", " $dst, $a",
+                 [(set SPR:$dst, (arm_ftoui SPR:$a))]>;
+
+//===----------------------------------------------------------------------===//
+// FP FMA Operations.
+//
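+// These multiply-accumulate forms update the accumulator in place: the
+// RegConstraint ties $dstin to $dst, so e.g. (illustrative)
+// "fmacd d0, d1, d2" computes d0 = d0 + d1 * d2.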
+
+def FMACD : ADI<(ops DPR:$dst, DPR:$dstin, DPR:$a, DPR:$b),
+                "fmacd", " $dst, $a, $b",
+                [(set DPR:$dst, (fadd (fmul DPR:$a, DPR:$b), DPR:$dstin))]>,
+                RegConstraint<"$dstin = $dst">;
+
+def FMACS : ASI<(ops SPR:$dst, SPR:$dstin, SPR:$a, SPR:$b),
+                "fmacs", " $dst, $a, $b",
+                [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>,
+                RegConstraint<"$dstin = $dst">;
+
+def FMSCD : ADI<(ops DPR:$dst, DPR:$dstin, DPR:$a, DPR:$b),
+                "fmscd", " $dst, $a, $b",
+                [(set DPR:$dst, (fsub (fmul DPR:$a, DPR:$b), DPR:$dstin))]>,
+                RegConstraint<"$dstin = $dst">;
+
+def FMSCS : ASI<(ops SPR:$dst, SPR:$dstin, SPR:$a, SPR:$b),
+                "fmscs", " $dst, $a, $b",
+                [(set SPR:$dst, (fsub (fmul SPR:$a, SPR:$b), SPR:$dstin))]>,
+                RegConstraint<"$dstin = $dst">;
+
+def FNMACD : ADI<(ops DPR:$dst, DPR:$dstin, DPR:$a, DPR:$b),
+                 "fnmacd", " $dst, $a, $b",
+             [(set DPR:$dst, (fadd (fneg (fmul DPR:$a, DPR:$b)), DPR:$dstin))]>,
+                RegConstraint<"$dstin = $dst">;
+
+def FNMACS : ASI<(ops SPR:$dst, SPR:$dstin, SPR:$a, SPR:$b),
+                "fnmacs", " $dst, $a, $b",
+             [(set SPR:$dst, (fadd (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>,
+                RegConstraint<"$dstin = $dst">;
+
+def FNMSCD : ADI<(ops DPR:$dst, DPR:$dstin, DPR:$a, DPR:$b),
+                 "fnmscd", " $dst, $a, $b",
+             [(set DPR:$dst, (fsub (fneg (fmul DPR:$a, DPR:$b)), DPR:$dstin))]>,
+                RegConstraint<"$dstin = $dst">;
+
+def FNMSCS : ASI<(ops SPR:$dst, SPR:$dstin, SPR:$a, SPR:$b),
+                "fnmscs", " $dst, $a, $b",
+             [(set SPR:$dst, (fsub (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>,
+                RegConstraint<"$dstin = $dst">;
+
+//===----------------------------------------------------------------------===//
+// FP Conditional moves.
+//
+
+def FCPYDcc  : ADI<(ops DPR:$dst, DPR:$false, DPR:$true),
+                    "fcpyd", " $dst, $true",
+                [/*(set DPR:$dst, (ARMcmov DPR:$false, DPR:$true, imm:$cc))*/]>,
+                    RegConstraint<"$false = $dst">;
+
+def FCPYScc  : ASI<(ops SPR:$dst, SPR:$false, SPR:$true),
+                    "fcpys", " $dst, $true",
+                [/*(set SPR:$dst, (ARMcmov SPR:$false, SPR:$true, imm:$cc))*/]>,
+                    RegConstraint<"$false = $dst">;
+
+def FNEGDcc  : ADI<(ops DPR:$dst, DPR:$false, DPR:$true),
+                    "fnegd", " $dst, $true",
+                [/*(set DPR:$dst, (ARMcneg DPR:$false, DPR:$true, imm:$cc))*/]>,
+                    RegConstraint<"$false = $dst">;
+
+def FNEGScc  : ASI<(ops SPR:$dst, SPR:$false, SPR:$true),
+                    "fnegs", " $dst, $true",
+                [/*(set SPR:$dst, (ARMcneg SPR:$false, SPR:$true, imm:$cc))*/]>,
+                    RegConstraint<"$false = $dst">;
diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp
new file mode 100644
index 0000000..294a12b
--- /dev/null
+++ b/lib/Target/ARM/ARMJITInfo.cpp
@@ -0,0 +1,131 @@
+//===-- ARMJITInfo.cpp - Implement the JIT interfaces for the ARM target --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Raul Herbster and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the ARM target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "ARMJITInfo.h"
+#include "ARMRelocations.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/Config/alloca.h"
+#include <cstdlib>
+using namespace llvm;
+
+void ARMJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  unsigned char *OldByte = (unsigned char *)Old;
+  *OldByte++ = 0xEA;                // Emit B opcode.
+  unsigned *OldWord = (unsigned *)OldByte;
+  unsigned NewAddr = (intptr_t)New;
+  unsigned OldAddr = (intptr_t)OldWord;
+  *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code.
+}
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us.  Instead, we
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring.
+extern "C" {
+#if defined(__arm__)
+  void ARMCompilationCallback(void);
+  asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl ARMCompilationCallback\n"
+    "ARMCompilationCallback:\n"
+    // save main registers
+    "mov    ip, sp\n"
+    "stmfd  sp!, {fp, ip, lr, pc}\n"
+    "sub    fp, ip, #4\n"
+    // arguments to Compilation Callback
+    // r0 - our lr (address of the call instruction in stub plus 4)
+    // r1 - stub's lr (address of instruction that called the stub plus 4)
+    "mov    r0, fp\n"  // stub's frame
+    "mov    r1, lr\n"  // stub's lr
+    "bl     ARMCompilationCallbackC\n"
+    // restore main registers
+    "ldmfd  sp, {fp, sp, pc}\n");
+#else // Not an ARM host
+  void ARMCompilationCallback() {
+    assert(0 && "Cannot call ARMCompilationCallback() on a non-ARM arch!\n");
+    abort();
+  }
+#endif
+}
+
+/// ARMCompilationCallbackC - This is the target-specific function invoked by the
+/// function stub when we did not know the real target of a call.  This function
+/// must locate the start of the stub or call site and pass it into the JIT
+/// compiler function.
+extern "C" void ARMCompilationCallbackC(intptr_t *StackPtr, intptr_t RetAddr) {
+  intptr_t *RetAddrLoc = &StackPtr[-1];
+
+  assert(*RetAddrLoc == RetAddr &&
+         "Could not find return address on the stack!");
+#if 0
+  DOUT << "In callback! Addr=" << (void*)RetAddr
+       << " FP=" << (void*)StackPtr
+       << ": Resolving call to function: "
+       << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n";
+#endif
+
+  // Sanity check to make sure this really is a branch and link instruction.
+  assert(((unsigned char*)RetAddr-1)[3] == 0xEB && "Not a branch and link instr!");
+
+  intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr);
+
+  // Rewrite the call target... so that we don't end up here every time we
+  // execute the call.
+  *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4);
+
+  // Change the return address to reexecute the branch and link instruction...
+  *RetAddrLoc -= 1;
+}
+
+TargetJITInfo::LazyResolverFn
+ARMJITInfo::getLazyResolverFunction(JITCompilerFn F) {
+  JITCompilerFunction = F;
+  return ARMCompilationCallback;
+}
+
+void *ARMJITInfo::emitFunctionStub(void *Fn, MachineCodeEmitter &MCE) {
+  unsigned addr = (intptr_t)Fn-MCE.getCurrentPCValue()-4;
+  // If this is just a call to an external function, emit a branch instead of a
+  // call.  The code is the same except for one bit of the last instruction.
+  if (Fn != (void*)(intptr_t)ARMCompilationCallback) {
+    MCE.startFunctionStub(4, 2);
+    MCE.emitByte(0xEA);  // branch to the corresponding function addr 
+    MCE.emitByte((unsigned char)(addr >>  0));
+    MCE.emitByte((unsigned char)(addr >>  8));
+    MCE.emitByte((unsigned char)(addr >>  16));
+    return MCE.finishFunctionStub(0);
+  } else {
+    MCE.startFunctionStub(5, 2);
+    MCE.emitByte(0xEB);  // branch and link to the corresponding function addr
+  }
+  MCE.emitByte((unsigned char)(addr >>  0));
+  MCE.emitByte((unsigned char)(addr >>  8));
+  MCE.emitByte((unsigned char)(addr >>  16));
+
+  return MCE.finishFunctionStub(0);
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void ARMJITInfo::relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase) {
+
+}
diff --git a/lib/Target/ARM/ARMJITInfo.h b/lib/Target/ARM/ARMJITInfo.h
new file mode 100644
index 0000000..bd0ea84
--- /dev/null
+++ b/lib/Target/ARM/ARMJITInfo.h
@@ -0,0 +1,50 @@
+//===- ARMJITInfo.h - ARM implementation of the JIT interface  --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Raul Herbster and is distributed under the
+// University of Illinois Open Source License.  See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the ARMJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMJITINFO_H
+#define ARMJITINFO_H
+
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+  class ARMTargetMachine;
+
+  class ARMJITInfo : public TargetJITInfo {
+    ARMTargetMachine &TM;
+  public:
+    ARMJITInfo(ARMTargetMachine &tm) : TM(tm) {useGOT = 0;}
+
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+    /// code.
+    ///
+    virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+    /// emitFunctionStub - Use the specified MachineCodeEmitter object to emit a
+    /// small native function that simply calls the function at the specified
+    /// address.
+    virtual void *emitFunctionStub(void *Fn, MachineCodeEmitter &MCE);
+
+    /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+    /// relocate - Before the JIT can run a block of code that has been emitted,
+    /// it must rewrite the code to contain the actual addresses of any
+    /// referenced global symbols.
+    virtual void relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase);
+  };
+}
+
+#endif
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
new file mode 100644
index 0000000..7562c5b
--- /dev/null
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -0,0 +1,750 @@
+//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations. This pass should be run after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-ldst-opt"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMRegisterInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/MRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+STATISTIC(NumLDMGened , "Number of ldm instructions generated");
+STATISTIC(NumSTMGened , "Number of stm instructions generated");
+STATISTIC(NumFLDMGened, "Number of fldm instructions generated");
+STATISTIC(NumFSTMGened, "Number of fstm instructions generated");
+
+namespace {
+  struct VISIBILITY_HIDDEN ARMLoadStoreOpt : public MachineFunctionPass {
+    static char ID;
+    ARMLoadStoreOpt() : MachineFunctionPass((intptr_t)&ID) {}
+
+    const TargetInstrInfo *TII;
+    const MRegisterInfo *MRI;
+    ARMFunctionInfo *AFI;
+    RegScavenger *RS;
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "ARM load / store optimization pass";
+    }
+
+  private:
+    struct MemOpQueueEntry {
+      int Offset;
+      unsigned Position;
+      MachineBasicBlock::iterator MBBI;
+      bool Merged;
+      MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i)
+        : Offset(o), Position(p), MBBI(i), Merged(false) {};
+    };
+    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
+    typedef MemOpQueue::iterator MemOpQueueIter;
+
+    SmallVector<MachineBasicBlock::iterator, 4>
+    MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
+                 int Opcode, unsigned Size,
+                 ARMCC::CondCodes Pred, unsigned PredReg,
+                 unsigned Scratch, MemOpQueue &MemOps);
+
+    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
+    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
+    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
+  };
+  char ARMLoadStoreOpt::ID = 0;
+}
+
+/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
+/// optimization pass.
+FunctionPass *llvm::createARMLoadStoreOptimizationPass() {
+  return new ARMLoadStoreOpt();
+}
+
+static int getLoadStoreMultipleOpcode(int Opcode) {
+  switch (Opcode) {
+  case ARM::LDR:
+    NumLDMGened++;
+    return ARM::LDM;
+  case ARM::STR:
+    NumSTMGened++;
+    return ARM::STM;
+  case ARM::FLDS:
+    NumFLDMGened++;
+    return ARM::FLDMS;
+  case ARM::FSTS:
+    NumFSTMGened++;
+    return ARM::FSTMS;
+  case ARM::FLDD:
+    NumFLDMGened++;
+    return ARM::FLDMD;
+  case ARM::FSTD:
+    NumFSTMGened++;
+    return ARM::FSTMD;
+  default: abort();
+  }
+  return 0;
+}
+
+/// mergeOps - Create and insert an LDM or STM with Base as the base register and
+/// registers in Regs as the register operands that would be loaded / stored.
+/// It returns true if the transformation is done. 
+static bool mergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                     int Offset, unsigned Base, bool BaseKill, int Opcode,
+                     ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
+                     SmallVector<std::pair<unsigned, bool>, 8> &Regs,
+                     const TargetInstrInfo *TII) {
+  // Only a single register to load / store. Don't bother.
+  unsigned NumRegs = Regs.size();
+  if (NumRegs <= 1)
+    return false;
+
+  ARM_AM::AMSubMode Mode = ARM_AM::ia;
+  bool isAM4 = Opcode == ARM::LDR || Opcode == ARM::STR;
+  if (isAM4 && Offset == 4)
+    Mode = ARM_AM::ib;
+  else if (isAM4 && Offset == -4 * (int)NumRegs + 4)
+    Mode = ARM_AM::da;
+  else if (isAM4 && Offset == -4 * (int)NumRegs)
+    Mode = ARM_AM::db;
+  else if (Offset != 0) {
+    // If starting offset isn't zero, insert an MI to materialize a new base.
+    // But only do so if it is cost effective, i.e. merging more than two
+    // loads / stores.
+    if (NumRegs <= 2)
+      return false;
+
+    unsigned NewBase;
+    if (Opcode == ARM::LDR)
+      // If it is a load, then just use one of the destination registers as
+      // the new base.
+      NewBase = Regs[NumRegs-1].first;
+    else {
+      // Use the scratch register as the new base.
+      NewBase = Scratch;
+      if (NewBase == 0)
+        return false;
+    }
+    int BaseOpc = ARM::ADDri;
+    if (Offset < 0) {
+      BaseOpc = ARM::SUBri;
+      Offset = - Offset;
+    }
+    int ImmedOffset = ARM_AM::getSOImmVal(Offset);
+    if (ImmedOffset == -1)
+      return false;  // Probably not worth it then.
+
+    BuildMI(MBB, MBBI, TII->get(BaseOpc), NewBase)
+      .addReg(Base, false, false, BaseKill).addImm(ImmedOffset)
+      .addImm(Pred).addReg(PredReg).addReg(0);
+    Base = NewBase;
+    BaseKill = true;  // New base is always killed right after its use.
+  }
+
+  bool isDPR = Opcode == ARM::FLDD || Opcode == ARM::FSTD;
+  bool isDef = Opcode == ARM::LDR || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
+  Opcode = getLoadStoreMultipleOpcode(Opcode);
+  MachineInstrBuilder MIB = (isAM4)
+    ? BuildMI(MBB, MBBI, TII->get(Opcode)).addReg(Base, false, false, BaseKill)
+        .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg)
+    : BuildMI(MBB, MBBI, TII->get(Opcode)).addReg(Base, false, false, BaseKill)
+        .addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs))
+        .addImm(Pred).addReg(PredReg);
+  for (unsigned i = 0; i != NumRegs; ++i)
+    MIB = MIB.addReg(Regs[i].first, isDef, false, Regs[i].second);
+
+  return true;
+}
+
+/// MergeLDR_STR - Merge a number of load / store instructions into one or more
+/// load / store multiple instructions.
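+///
+/// For example (an illustrative sketch):
+///
+///   ldr r4, [r0]
+///   ldr r5, [r0, #4]
+///   ldr r6, [r0, #8]
+/// =>
+///   ldmia r0, {r4, r5, r6}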
+SmallVector<MachineBasicBlock::iterator, 4>
+ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
+                              unsigned Base, int Opcode, unsigned Size,
+                              ARMCC::CondCodes Pred, unsigned PredReg,
+                              unsigned Scratch, MemOpQueue &MemOps) {
+  SmallVector<MachineBasicBlock::iterator, 4> Merges;
+  bool isAM4 = Opcode == ARM::LDR || Opcode == ARM::STR;
+  int Offset = MemOps[SIndex].Offset;
+  int SOffset = Offset;
+  unsigned Pos = MemOps[SIndex].Position;
+  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
+  unsigned PReg = MemOps[SIndex].MBBI->getOperand(0).getReg();
+  unsigned PRegNum = ARMRegisterInfo::getRegisterNumbering(PReg);
+  bool isKill = MemOps[SIndex].MBBI->getOperand(0).isKill();
+
+  SmallVector<std::pair<unsigned,bool>, 8> Regs;
+  Regs.push_back(std::make_pair(PReg, isKill));
+  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
+    int NewOffset = MemOps[i].Offset;
+    unsigned Reg = MemOps[i].MBBI->getOperand(0).getReg();
+    unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
+    isKill = MemOps[i].MBBI->getOperand(0).isKill();
+    // AM4 - register numbers in ascending order.
+    // AM5 - consecutive register numbers in ascending order.
+    if (NewOffset == Offset + (int)Size &&
+        ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) {
+      Offset += Size;
+      Regs.push_back(std::make_pair(Reg, isKill));
+      PRegNum = RegNum;
+    } else {
+      // Can't merge this in. Try merging the earlier ones first.
+      if (mergeOps(MBB, ++Loc, SOffset, Base, false, Opcode, Pred, PredReg,
+                   Scratch, Regs, TII)) {
+        Merges.push_back(prior(Loc));
+        for (unsigned j = SIndex; j < i; ++j) {
+          MBB.erase(MemOps[j].MBBI);
+          MemOps[j].Merged = true;
+        }
+      }
+      SmallVector<MachineBasicBlock::iterator, 4> Merges2 =
+        MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,MemOps);
+      Merges.append(Merges2.begin(), Merges2.end());
+      return Merges;
+    }
+
+    if (MemOps[i].Position > Pos) {
+      Pos = MemOps[i].Position;
+      Loc = MemOps[i].MBBI;
+    }
+  }
+
+  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
+  if (mergeOps(MBB, ++Loc, SOffset, Base, BaseKill, Opcode, Pred, PredReg,
+               Scratch, Regs, TII)) {
+    Merges.push_back(prior(Loc));
+    for (unsigned i = SIndex, e = MemOps.size(); i != e; ++i) {
+      MBB.erase(MemOps[i].MBBI);
+      MemOps[i].Merged = true;
+    }
+  }
+
+  return Merges;
+}
+
+/// getInstrPredicate - If instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+static ARMCC::CondCodes getInstrPredicate(MachineInstr *MI, unsigned &PredReg) {
+  int PIdx = MI->findFirstPredOperandIdx();
+  if (PIdx == -1) {
+    PredReg = 0;
+    return ARMCC::AL;
+  }
+
+  PredReg = MI->getOperand(PIdx+1).getReg();
+  return (ARMCC::CondCodes)MI->getOperand(PIdx).getImmedValue();
+}
+
+static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
+                                       unsigned Bytes, ARMCC::CondCodes Pred,
+                                       unsigned PredReg) {
+  unsigned MyPredReg = 0;
+  return (MI && MI->getOpcode() == ARM::SUBri &&
+          MI->getOperand(0).getReg() == Base &&
+          MI->getOperand(1).getReg() == Base &&
+          ARM_AM::getAM2Offset(MI->getOperand(2).getImm()) == Bytes &&
+          getInstrPredicate(MI, MyPredReg) == Pred &&
+          MyPredReg == PredReg);
+}
+
+static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
+                                       unsigned Bytes, ARMCC::CondCodes Pred,
+                                       unsigned PredReg) {
+  unsigned MyPredReg = 0;
+  return (MI && MI->getOpcode() == ARM::ADDri &&
+          MI->getOperand(0).getReg() == Base &&
+          MI->getOperand(1).getReg() == Base &&
+          ARM_AM::getAM2Offset(MI->getOperand(2).getImm()) == Bytes &&
+          getInstrPredicate(MI, MyPredReg) == Pred &&
+          MyPredReg == PredReg);
+}
+
+static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: return 0;
+  case ARM::LDR:
+  case ARM::STR:
+  case ARM::FLDS:
+  case ARM::FSTS:
+    return 4;
+  case ARM::FLDD:
+  case ARM::FSTD:
+    return 8;
+  case ARM::LDM:
+  case ARM::STM:
+    return (MI->getNumOperands() - 4) * 4;
+  case ARM::FLDMS:
+  case ARM::FSTMS:
+  case ARM::FLDMD:
+  case ARM::FSTMD:
+    return ARM_AM::getAM5Offset(MI->getOperand(1).getImm()) * 4;
+  }
+}
+
+/// mergeBaseUpdateLSMultiple - Fold preceding/trailing inc/dec of base
+/// register into the LDM/STM/FLDM{D|S}/FSTM{D|S} op when possible:
+///
+/// stmia rn, <ra, rb, rc>
+/// rn := rn + 4 * 3;
+/// =>
+/// stmia rn!, <ra, rb, rc>
+///
+/// rn := rn - 4 * 3;
+/// ldmia rn, <ra, rb, rc>
+/// =>
+/// ldmdb rn!, <ra, rb, rc>
+static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MBBI) {
+  MachineInstr *MI = MBBI;
+  unsigned Base = MI->getOperand(0).getReg();
+  unsigned Bytes = getLSMultipleTransferSize(MI);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  int Opcode = MI->getOpcode();
+  bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::STM;
+
+  if (isAM4) {
+    if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm()))
+      return false;
+
+    // Can't use the updating AM4 sub-mode if the base register is also a dest
+    // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
+    for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) {
+      if (MI->getOperand(i).getReg() == Base)
+        return false;
+    }
+
+    ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm());
+    if (MBBI != MBB.begin()) {
+      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
+      if (Mode == ARM_AM::ia &&
+          isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true));
+        MBB.erase(PrevMBBI);
+        return true;
+      } else if (Mode == ARM_AM::ib &&
+                 isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true));
+        MBB.erase(PrevMBBI);
+        return true;
+      }
+    }
+
+    if (MBBI != MBB.end()) {
+      MachineBasicBlock::iterator NextMBBI = next(MBBI);
+      if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
+          isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
+        MBB.erase(NextMBBI);
+        return true;
+      } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
+                 isMatchingDecrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
+        MBB.erase(NextMBBI);
+        return true;
+      }
+    }
+  } else {
+    // FLDM{D|S}, FSTM{D|S} addressing mode 5 ops.
+    if (ARM_AM::getAM5WBFlag(MI->getOperand(1).getImm()))
+      return false;
+
+    ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MI->getOperand(1).getImm());
+    unsigned Offset = ARM_AM::getAM5Offset(MI->getOperand(1).getImm());
+    if (MBBI != MBB.begin()) {
+      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
+      if (Mode == ARM_AM::ia &&
+          isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset));
+        MBB.erase(PrevMBBI);
+        return true;
+      }
+    }
+
+    if (MBBI != MBB.end()) {
+      MachineBasicBlock::iterator NextMBBI = next(MBBI);
+      if (Mode == ARM_AM::ia &&
+          isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset));
+        MBB.erase(NextMBBI);
+      }
+      return true;
+    }
+  }
+
+  return false;
+}
+
+static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
+  switch (Opc) {
+  case ARM::LDR: return ARM::LDR_PRE;
+  case ARM::STR: return ARM::STR_PRE;
+  case ARM::FLDS: return ARM::FLDMS;
+  case ARM::FLDD: return ARM::FLDMD;
+  case ARM::FSTS: return ARM::FSTMS;
+  case ARM::FSTD: return ARM::FSTMD;
+  default: abort();
+  }
+  return 0;
+}
+
+static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
+  switch (Opc) {
+  case ARM::LDR: return ARM::LDR_POST;
+  case ARM::STR: return ARM::STR_POST;
+  case ARM::FLDS: return ARM::FLDMS;
+  case ARM::FLDD: return ARM::FLDMD;
+  case ARM::FSTS: return ARM::FSTMS;
+  case ARM::FSTD: return ARM::FSTMD;
+  default: abort();
+  }
+  return 0;
+}
+
+/// mergeBaseUpdateLoadStore - Fold preceding/trailing inc/dec of base
+/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
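+///
+/// For example (an illustrative sketch):
+///
+/// ldr rd, [rn]
+/// rn := rn + 4;
+/// =>
+/// ldr rd, [rn], #4
+///
+/// rn := rn - 4;
+/// ldr rd, [rn]
+/// =>
+/// ldr rd, [rn, #-4]!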
+static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     const TargetInstrInfo *TII) {
+  MachineInstr *MI = MBBI;
+  unsigned Base = MI->getOperand(1).getReg();
+  bool BaseKill = MI->getOperand(1).isKill();
+  unsigned Bytes = getLSMultipleTransferSize(MI);
+  int Opcode = MI->getOpcode();
+  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
+  if ((isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0) ||
+      (!isAM2 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0))
+    return false;
+
+  bool isLd = Opcode == ARM::LDR || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
+  // Can't do the merge if the destination register is the same as the would-be
+  // writeback register.
+  if (isLd && MI->getOperand(0).getReg() == Base)
+    return false;
+
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  bool DoMerge = false;
+  ARM_AM::AddrOpc AddSub = ARM_AM::add;
+  unsigned NewOpc = 0;
+  if (MBBI != MBB.begin()) {
+    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
+    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+      DoMerge = true;
+      AddSub = ARM_AM::sub;
+      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
+    } else if (isAM2 && isMatchingIncrement(PrevMBBI, Base, Bytes,
+                                            Pred, PredReg)) {
+      DoMerge = true;
+      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
+    }
+    if (DoMerge)
+      MBB.erase(PrevMBBI);
+  }
+
+  if (!DoMerge && MBBI != MBB.end()) {
+    MachineBasicBlock::iterator NextMBBI = next(MBBI);
+    if (isAM2 && isMatchingDecrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+      DoMerge = true;
+      AddSub = ARM_AM::sub;
+      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
+    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+      DoMerge = true;
+      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
+    }
+    if (DoMerge)
+      MBB.erase(NextMBBI);
+  }
+
+  if (!DoMerge)
+    return false;
+
+  bool isDPR = NewOpc == ARM::FLDMD || NewOpc == ARM::FSTMD;
+  unsigned Offset = isAM2 ? ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift)
+    : ARM_AM::getAM5Opc((AddSub == ARM_AM::sub) ? ARM_AM::db : ARM_AM::ia,
+                        true, isDPR ? 2 : 1);
+  if (isLd) {
+    if (isAM2)
+      // LDR_PRE, LDR_POST;
+      BuildMI(MBB, MBBI, TII->get(NewOpc), MI->getOperand(0).getReg())
+        .addReg(Base, true)
+        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+    else
+      // FLDMS, FLDMD
+      BuildMI(MBB, MBBI, TII->get(NewOpc)).addReg(Base, false, false, BaseKill)
+        .addImm(Offset).addImm(Pred).addReg(PredReg)
+        .addReg(MI->getOperand(0).getReg(), true);
+  } else {
+    MachineOperand &MO = MI->getOperand(0);
+    if (isAM2)
+      // STR_PRE, STR_POST;
+      BuildMI(MBB, MBBI, TII->get(NewOpc), Base)
+        .addReg(MO.getReg(), false, false, MO.isKill())
+        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+    else
+      // FSTMS, FSTMD
+      BuildMI(MBB, MBBI, TII->get(NewOpc)).addReg(Base).addImm(Offset)
+        .addImm(Pred).addReg(PredReg)
+        .addReg(MO.getReg(), false, false, MO.isKill());
+  }
+  MBB.erase(MBBI);
+
+  return true;
+}
+
+/// isMemoryOp - Returns true if the instruction is a memory operation (that
+/// this pass is capable of operating on).
+static bool isMemoryOp(MachineInstr *MI) {
+  int Opcode = MI->getOpcode();
+  switch (Opcode) {
+  default: break;
+  case ARM::LDR:
+  case ARM::STR:
+    return MI->getOperand(1).isRegister() && MI->getOperand(2).getReg() == 0;
+  case ARM::FLDS:
+  case ARM::FSTS:
+    return MI->getOperand(1).isRegister();
+  case ARM::FLDD:
+  case ARM::FSTD:
+    return MI->getOperand(1).isRegister();
+  }
+  return false;
+}
+
+/// AdvanceRS - Advance register scavenger to just before the earliest memory
+/// op that is being merged.
+void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
+  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
+  unsigned Position = MemOps[0].Position;
+  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
+    if (MemOps[i].Position < Position) {
+      Position = MemOps[i].Position;
+      Loc = MemOps[i].MBBI;
+    }
+  }
+
+  if (Loc != MBB.begin())
+    RS->forward(prior(Loc));
+}
+
+/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
+/// ops of the same base and incrementing offset into LDM / STM ops.
+bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
+  unsigned NumMerges = 0;
+  unsigned NumMemOps = 0;
+  MemOpQueue MemOps;
+  unsigned CurrBase = 0;
+  int CurrOpc = -1;
+  unsigned CurrSize = 0;
+  ARMCC::CondCodes CurrPred = ARMCC::AL;
+  unsigned CurrPredReg = 0;
+  unsigned Position = 0;
+
+  RS->enterBasicBlock(&MBB);
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    bool Advance  = false;
+    bool TryMerge = false;
+    bool Clobber  = false;
+
+    bool isMemOp = isMemoryOp(MBBI);
+    if (isMemOp) {
+      int Opcode = MBBI->getOpcode();
+      bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
+      unsigned Size = getLSMultipleTransferSize(MBBI);
+      unsigned Base = MBBI->getOperand(1).getReg();
+      unsigned PredReg = 0;
+      ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
+      const TargetInstrDescriptor *TID = MBBI->getInstrDescriptor();
+      unsigned OffField = MBBI->getOperand(TID->numOperands-3).getImm();
+      int Offset = isAM2
+        ? ARM_AM::getAM2Offset(OffField) : ARM_AM::getAM5Offset(OffField) * 4;
+      if (isAM2) {
+        if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub)
+          Offset = -Offset;
+      } else {
+        if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
+          Offset = -Offset;
+      }
+      // Watch out for:
+      // r4 := ldr [r5]
+      // r5 := ldr [r5, #4]
+      // r6 := ldr [r5, #8]
+      //
+      // The second ldr has effectively broken the chain even though it
+      // looks like the later ldr(s) use the same base register. Try to
+      // merge the ldr's so far, including this one. But don't try to
+      // combine the following ldr(s).
+      Clobber = (Opcode == ARM::LDR && Base == MBBI->getOperand(0).getReg());
+      if (CurrBase == 0 && !Clobber) {
+        // Start of a new chain.
+        CurrBase = Base;
+        CurrOpc  = Opcode;
+        CurrSize = Size;
+        CurrPred = Pred;
+        CurrPredReg = PredReg;
+        MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
+        NumMemOps++;
+        Advance = true;
+      } else {
+        if (Clobber) {
+          TryMerge = true;
+          Advance = true;
+        }
+
+        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
+          // No need to match PredReg.
+          // Continue adding to the queue.
+          if (Offset > MemOps.back().Offset) {
+            MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
+            NumMemOps++;
+            Advance = true;
+          } else {
+            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
+                 I != E; ++I) {
+              if (Offset < I->Offset) {
+                MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI));
+                NumMemOps++;
+                Advance = true;
+                break;
+              } else if (Offset == I->Offset) {
+                // Collision! This can't be merged!
+                break;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    if (Advance) {
+      ++Position;
+      ++MBBI;
+    } else
+      TryMerge = true;
+
+    if (TryMerge) {
+      if (NumMemOps > 1) {
+        // Try to find a free register to use as a new base in case it's needed.
+        // First advance to the instruction just before the start of the chain.
+        AdvanceRS(MBB, MemOps);
+        // Find a scratch register. Make sure it's a call clobbered register or
+        // a spilled callee-saved register.
+        unsigned Scratch = RS->FindUnusedReg(&ARM::GPRRegClass, true);
+        if (!Scratch)
+          Scratch = RS->FindUnusedReg(&ARM::GPRRegClass,
+                                      AFI->getSpilledCSRegisters());
+        // Process the load / store instructions.
+        RS->forward(prior(MBBI));
+
+        // Merge ops.
+        SmallVector<MachineBasicBlock::iterator,4> MBBII =
+          MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
+                       CurrPred, CurrPredReg, Scratch, MemOps);
+
+        // Try folding preceding/trailing base inc/dec into the generated
+        // LDM/STM ops.
+        for (unsigned i = 0, e = MBBII.size(); i < e; ++i)
+          if (mergeBaseUpdateLSMultiple(MBB, MBBII[i]))
+            NumMerges++;
+        NumMerges += MBBII.size();
+
+        // Try folding preceding/trailing base inc/dec into those loads/stores
+        // that were not merged to form LDM/STM ops.
+        for (unsigned i = 0; i != NumMemOps; ++i)
+          if (!MemOps[i].Merged)
+            if (mergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII))
+              NumMerges++;
+
+        // RS may be pointing to an instruction that's deleted. 
+        RS->skipTo(prior(MBBI));
+      }
+
+      CurrBase = 0;
+      CurrOpc = -1;
+      CurrSize = 0;
+      CurrPred = ARMCC::AL;
+      CurrPredReg = 0;
+      if (NumMemOps) {
+        MemOps.clear();
+        NumMemOps = 0;
+      }
+
+      // If iterator hasn't been advanced and this is not a memory op, skip it.
+      // It can't start a new chain anyway.
+      if (!Advance && !isMemOp && MBBI != E) {
+        ++Position;
+        ++MBBI;
+      }
+    }
+  }
+  return NumMerges > 0;
+}
+
+/// MergeReturnIntoLDM - If this is an exit BB, try merging the return op
+/// (bx lr) into the preceding stack restore so it directly restores the value
+/// of LR into PC.
+///   ldmfd sp!, {r7, lr}
+///   bx lr
+/// =>
+///   ldmfd sp!, {r7, pc}
+bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
+  if (MBB.empty()) return false;
+
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  if (MBBI->getOpcode() == ARM::BX_RET && MBBI != MBB.begin()) {
+    MachineInstr *PrevMI = prior(MBBI);
+    if (PrevMI->getOpcode() == ARM::LDM) {
+      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
+      if (MO.getReg() == ARM::LR) {
+        PrevMI->setInstrDescriptor(TII->get(ARM::LDM_RET));
+        MO.setReg(ARM::PC);
+        MBB.erase(MBBI);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetMachine &TM = Fn.getTarget();
+  AFI = Fn.getInfo<ARMFunctionInfo>();
+  TII = TM.getInstrInfo();
+  MRI = TM.getRegisterInfo();
+  RS = new RegScavenger();
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    Modified |= LoadStoreMultipleOpti(MBB);
+    Modified |= MergeReturnIntoLDM(MBB);
+  }
+
+  delete RS;
+  return Modified;
+}
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
new file mode 100644
index 0000000..665c5e3
--- /dev/null
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -0,0 +1,220 @@
+//====- ARMMachineFunctionInfo.h - ARM machine function info ----*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file declares ARM-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMMACHINEFUNCTIONINFO_H
+#define ARMMACHINEFUNCTIONINFO_H
+
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/MRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+
+namespace llvm {
+
+/// ARMFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private ARM target-specific information for each MachineFunction.
+class ARMFunctionInfo : public MachineFunctionInfo {
+
+  /// isThumb - True if this function is compiled under Thumb mode.
+  /// Used to initialize Align, so it must precede it.
+  bool isThumb;
+
+  /// Align - required alignment.  ARM functions and Thumb functions with
+  /// constant pools require 4-byte alignment; other Thumb functions
+  /// require only 2-byte alignment.
+  unsigned Align;
+
+  /// VarArgsRegSaveSize - Size of the register save area for vararg functions.
+  ///
+  unsigned VarArgsRegSaveSize;
+
+  /// HasStackFrame - True if this function has a stack frame. Set by
+  /// processFunctionBeforeCalleeSavedScan().
+  bool HasStackFrame;
+
+  /// LRSpilledForFarJump - True if the LR register has been spilled to
+  /// enable far jumps.
+  bool LRSpilledForFarJump;
+
+  /// R3IsLiveIn - True if R3 is live in to this function.
+  /// FIXME: Remove when register scavenger for Thumb is done.
+  bool R3IsLiveIn;
+
+  /// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer
+  /// spill stack offset.
+  unsigned FramePtrSpillOffset;
+
+  /// GPRCS1Offset, GPRCS2Offset, DPRCSOffset - Starting offset of callee saved
+  /// register spill areas. For Mac OS X:
+  ///
+  /// GPR callee-saved (1) : r4, r5, r6, r7, lr
+  /// --------------------------------------------
+  /// GPR callee-saved (2) : r8, r10, r11
+  /// --------------------------------------------
+  /// DPR callee-saved : d8 - d15
+  unsigned GPRCS1Offset;
+  unsigned GPRCS2Offset;
+  unsigned DPRCSOffset;
+
+  /// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spill
+  /// areas.
+  unsigned GPRCS1Size;
+  unsigned GPRCS2Size;
+  unsigned DPRCSSize;
+
+  /// GPRCS1Frames, GPRCS2Frames, DPRCSFrames - Keeps track of frame indices
+  /// which belong to these spill areas.
+  BitVector GPRCS1Frames;
+  BitVector GPRCS2Frames;
+  BitVector DPRCSFrames;
+
+  /// SpilledCSRegs - A BitVector mask of all spilled callee-saved registers.
+  ///
+  BitVector SpilledCSRegs;
+
+  /// JumpTableUId - Unique id for jumptables.
+  ///
+  unsigned JumpTableUId;
+
+public:
+  ARMFunctionInfo() :
+    isThumb(false), 
+    Align(2U),
+    VarArgsRegSaveSize(0), HasStackFrame(false),
+    LRSpilledForFarJump(false), R3IsLiveIn(false),
+    FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+    GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
+    GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0),
+    JumpTableUId(0) {}
+
+  ARMFunctionInfo(MachineFunction &MF) :
+    isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()),
+    Align(isThumb ? 1U : 2U),
+    VarArgsRegSaveSize(0), HasStackFrame(false),
+    LRSpilledForFarJump(false), R3IsLiveIn(false),
+    FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+    GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
+    GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32),
+    SpilledCSRegs(MF.getTarget().getRegisterInfo()->getNumRegs()),
+    JumpTableUId(0) {}
+
+  bool isThumbFunction() const { return isThumb; }
+
+  unsigned getAlign() const { return Align; }
+  void setAlign(unsigned a) { Align = a; }
+
+  unsigned getVarArgsRegSaveSize() const { return VarArgsRegSaveSize; }
+  void setVarArgsRegSaveSize(unsigned s) { VarArgsRegSaveSize = s; }
+
+  bool hasStackFrame() const { return HasStackFrame; }
+  void setHasStackFrame(bool s) { HasStackFrame = s; }
+
+  bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; }
+  void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; }
+
+  // FIXME: Remove when register scavenger for Thumb is done.
+  bool isR3LiveIn() const { return R3IsLiveIn; }
+  void setR3IsLiveIn(bool l) { R3IsLiveIn = l; }
+
+  unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; }
+  void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; }
+  
+  unsigned getGPRCalleeSavedArea1Offset() const { return GPRCS1Offset; }
+  unsigned getGPRCalleeSavedArea2Offset() const { return GPRCS2Offset; }
+  unsigned getDPRCalleeSavedAreaOffset()  const { return DPRCSOffset; }
+
+  void setGPRCalleeSavedArea1Offset(unsigned o) { GPRCS1Offset = o; }
+  void setGPRCalleeSavedArea2Offset(unsigned o) { GPRCS2Offset = o; }
+  void setDPRCalleeSavedAreaOffset(unsigned o)  { DPRCSOffset = o; }
+
+  unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; }
+  unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; }
+  unsigned getDPRCalleeSavedAreaSize()  const { return DPRCSSize; }
+
+  void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; }
+  void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
+  void setDPRCalleeSavedAreaSize(unsigned s)  { DPRCSSize = s; }
+
+  bool isGPRCalleeSavedArea1Frame(int fi) const {
+    if (fi < 0 || fi >= (int)GPRCS1Frames.size())
+      return false;
+    return GPRCS1Frames[fi];
+  }
+  bool isGPRCalleeSavedArea2Frame(int fi) const {
+    if (fi < 0 || fi >= (int)GPRCS2Frames.size())
+      return false;
+    return GPRCS2Frames[fi];
+  }
+  bool isDPRCalleeSavedAreaFrame(int fi) const {
+    if (fi < 0 || fi >= (int)DPRCSFrames.size())
+      return false;
+    return DPRCSFrames[fi];
+  }
+
+  void addGPRCalleeSavedArea1Frame(int fi) {
+    if (fi >= 0) {
+      int Size = GPRCS1Frames.size();
+      if (fi >= Size) {
+        Size *= 2;
+        if (fi >= Size)
+          Size = fi+1;
+        GPRCS1Frames.resize(Size);
+      }
+      GPRCS1Frames[fi] = true;
+    }
+  }
+  void addGPRCalleeSavedArea2Frame(int fi) {
+    if (fi >= 0) {
+      int Size = GPRCS2Frames.size();
+      if (fi >= Size) {
+        Size *= 2;
+        if (fi >= Size)
+          Size = fi+1;
+        GPRCS2Frames.resize(Size);
+      }
+      GPRCS2Frames[fi] = true;
+    }
+  }
+  void addDPRCalleeSavedAreaFrame(int fi) {
+    if (fi >= 0) {
+      int Size = DPRCSFrames.size();
+      if (fi >= Size) {
+        Size *= 2;
+        if (fi >= Size)
+          Size = fi+1;
+        DPRCSFrames.resize(Size);
+      }
+      DPRCSFrames[fi] = true;
+    }
+  }
+
+  void setCSRegisterIsSpilled(unsigned Reg) {
+    SpilledCSRegs.set(Reg);
+  }
+
+  bool isCSRegisterSpilled(unsigned Reg) {
+    return SpilledCSRegs[Reg];
+  }
+
+  const BitVector &getSpilledCSRegisters() const {
+    return SpilledCSRegs;
+  }
+
+  unsigned createJumpTableUId() {
+    return JumpTableUId++;
+  }
+};
+} // End llvm namespace
+
+#endif // ARMMACHINEFUNCTIONINFO_H
diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp
new file mode 100644
index 0000000..f8e10de
--- /dev/null
+++ b/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -0,0 +1,1566 @@
+//===- ARMRegisterInfo.cpp - ARM Register Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the "Instituto Nokia de Tecnologia" and
+// is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include <algorithm>
+using namespace llvm;
+
+static cl::opt<bool> ThumbRegScavenging("enable-thumb-reg-scavenging",
+                               cl::Hidden,
+                               cl::desc("Enable register scavenging on Thumb"));
+
+unsigned ARMRegisterInfo::getRegisterNumbering(unsigned RegEnum) {
+  using namespace ARM;
+  switch (RegEnum) {
+  case R0:  case S0:  case D0:  return 0;
+  case R1:  case S1:  case D1:  return 1;
+  case R2:  case S2:  case D2:  return 2;
+  case R3:  case S3:  case D3:  return 3;
+  case R4:  case S4:  case D4:  return 4;
+  case R5:  case S5:  case D5:  return 5;
+  case R6:  case S6:  case D6:  return 6;
+  case R7:  case S7:  case D7:  return 7;
+  case R8:  case S8:  case D8:  return 8;
+  case R9:  case S9:  case D9:  return 9;
+  case R10: case S10: case D10: return 10;
+  case R11: case S11: case D11: return 11;
+  case R12: case S12: case D12: return 12;
+  case SP:  case S13: case D13: return 13;
+  case LR:  case S14: case D14: return 14;
+  case PC:  case S15: case D15: return 15;
+  case S16: return 16;
+  case S17: return 17;
+  case S18: return 18;
+  case S19: return 19;
+  case S20: return 20;
+  case S21: return 21;
+  case S22: return 22;
+  case S23: return 23;
+  case S24: return 24;
+  case S25: return 25;
+  case S26: return 26;
+  case S27: return 27;
+  case S28: return 28;
+  case S29: return 29;
+  case S30: return 30;
+  case S31: return 31;
+  default:
+    assert(0 && "Unknown ARM register!");
+    abort();
+  }
+}
+
+ARMRegisterInfo::ARMRegisterInfo(const TargetInstrInfo &tii,
+                                 const ARMSubtarget &sti)
+  : ARMGenRegisterInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
+    TII(tii), STI(sti),
+    FramePtr((STI.useThumbBacktraces() || STI.isThumb()) ? ARM::R7 : ARM::R11) {
+}
+
+bool ARMRegisterInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                                MachineBasicBlock::iterator MI,
+                                const std::vector<CalleeSavedInfo> &CSI) const {
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  if (!AFI->isThumbFunction() || CSI.empty())
+    return false;
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, TII.get(ARM::tPUSH));
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+    MIB.addReg(Reg, false/*isDef*/,false/*isImp*/,true/*isKill*/);
+  }
+  return true;
+}
+
+bool ARMRegisterInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                                 MachineBasicBlock::iterator MI,
+                                const std::vector<CalleeSavedInfo> &CSI) const {
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  if (!AFI->isThumbFunction() || CSI.empty())
+    return false;
+
+  bool isVarArg = AFI->getVarArgsRegSaveSize() > 0;
+  MachineInstr *PopMI = new MachineInstr(TII.get(ARM::tPOP));
+  MBB.insert(MI, PopMI);
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    if (Reg == ARM::LR) {
+      // Special epilogue for vararg functions. See emitEpilogue
+      if (isVarArg)
+        continue;
+      Reg = ARM::PC;
+      PopMI->setInstrDescriptor(TII.get(ARM::tPOP_RET));
+      MBB.erase(MI);
+    }
+    PopMI->addRegOperand(Reg, true);
+  }
+  return true;
+}
+
+void ARMRegisterInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, int FI,
+                    const TargetRegisterClass *RC) const {
+  if (RC == ARM::GPRRegisterClass) {
+    MachineFunction &MF = *MBB.getParent();
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    if (AFI->isThumbFunction())
+      BuildMI(MBB, I, TII.get(ARM::tSpill)).addReg(SrcReg, false, false, true)
+        .addFrameIndex(FI).addImm(0);
+    else
+      BuildMI(MBB, I, TII.get(ARM::STR)).addReg(SrcReg, false, false, true)
+        .addFrameIndex(FI).addReg(0).addImm(0).addImm((int64_t)ARMCC::AL)
+        .addReg(0);
+  } else if (RC == ARM::DPRRegisterClass) {
+    BuildMI(MBB, I, TII.get(ARM::FSTD)).addReg(SrcReg, false, false, true)
+      .addFrameIndex(FI).addImm(0).addImm((int64_t)ARMCC::AL).addReg(0);
+  } else {
+    assert(RC == ARM::SPRRegisterClass && "Unknown regclass!");
+    BuildMI(MBB, I, TII.get(ARM::FSTS)).addReg(SrcReg, false, false, true)
+      .addFrameIndex(FI).addImm(0).addImm((int64_t)ARMCC::AL).addReg(0);
+  }
+}
+
+void ARMRegisterInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC) const {
+  if (RC == ARM::GPRRegisterClass) {
+    MachineFunction &MF = *MBB.getParent();
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    if (AFI->isThumbFunction())
+      BuildMI(MBB, I, TII.get(ARM::tRestore), DestReg)
+        .addFrameIndex(FI).addImm(0);
+    else
+      BuildMI(MBB, I, TII.get(ARM::LDR), DestReg)
+        .addFrameIndex(FI).addReg(0).addImm(0).addImm((int64_t)ARMCC::AL)
+        .addReg(0);
+  } else if (RC == ARM::DPRRegisterClass) {
+    BuildMI(MBB, I, TII.get(ARM::FLDD), DestReg)
+      .addFrameIndex(FI).addImm(0).addImm((int64_t)ARMCC::AL).addReg(0);
+  } else {
+    assert(RC == ARM::SPRRegisterClass && "Unknown regclass!");
+    BuildMI(MBB, I, TII.get(ARM::FLDS), DestReg)
+      .addFrameIndex(FI).addImm(0).addImm((int64_t)ARMCC::AL).addReg(0);
+  }
+}
+
+void ARMRegisterInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I,
+                                   unsigned DestReg, unsigned SrcReg,
+                                   const TargetRegisterClass *RC) const {
+  if (RC == ARM::GPRRegisterClass) {
+    MachineFunction &MF = *MBB.getParent();
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    if (AFI->isThumbFunction())
+      BuildMI(MBB, I, TII.get(ARM::tMOVr), DestReg).addReg(SrcReg);
+    else
+      BuildMI(MBB, I, TII.get(ARM::MOVr), DestReg).addReg(SrcReg)
+        .addImm((int64_t)ARMCC::AL).addReg(0).addReg(0);
+  } else if (RC == ARM::SPRRegisterClass)
+    BuildMI(MBB, I, TII.get(ARM::FCPYS), DestReg).addReg(SrcReg)
+      .addImm((int64_t)ARMCC::AL).addReg(0);
+  else if (RC == ARM::DPRRegisterClass)
+    BuildMI(MBB, I, TII.get(ARM::FCPYD), DestReg).addReg(SrcReg)
+      .addImm((int64_t)ARMCC::AL).addReg(0);
+  else
+    abort();
+}
+
+/// emitLoadConstPool - Emits a load from constpool to materialize the
+/// specified immediate.
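+/// The constant is added to the function's constant pool and loaded with a
+/// pc-relative load (LDRcp for ARM, tLDRcp for Thumb).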
+static void emitLoadConstPool(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator &MBBI,
+                              unsigned DestReg, int Val,
+                              ARMCC::CondCodes Pred, unsigned PredReg,
+                              const TargetInstrInfo &TII, bool isThumb) {
+  MachineFunction &MF = *MBB.getParent();
+  MachineConstantPool *ConstantPool = MF.getConstantPool();
+  Constant *C = ConstantInt::get(Type::Int32Ty, Val);
+  unsigned Idx = ConstantPool->getConstantPoolIndex(C, 2);
+  if (isThumb)
+    BuildMI(MBB, MBBI, TII.get(ARM::tLDRcp), DestReg).addConstantPoolIndex(Idx);
+  else
+    BuildMI(MBB, MBBI, TII.get(ARM::LDRcp), DestReg).addConstantPoolIndex(Idx)
+      .addReg(0).addImm(0).addImm((unsigned)Pred).addReg(PredReg);
+}
+
+void ARMRegisterInfo::reMaterialize(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    unsigned DestReg,
+                                    const MachineInstr *Orig) const {
+  if (Orig->getOpcode() == ARM::MOVi2pieces) {
+    emitLoadConstPool(MBB, I, DestReg,
+                      Orig->getOperand(1).getImmedValue(),
+                      (ARMCC::CondCodes)Orig->getOperand(2).getImmedValue(),
+                      Orig->getOperand(3).getReg(),
+                      TII, false);
+    return;
+  }
+
+  MachineInstr *MI = Orig->clone();
+  MI->getOperand(0).setReg(DestReg);
+  MBB.insert(I, MI);
+}
+
+/// isLowRegister - Returns true if the register is a low register (r0-r7).
+///
+static bool isLowRegister(unsigned Reg) {
+  using namespace ARM;
+  switch (Reg) {
+  case R0:  case R1:  case R2:  case R3:
+  case R4:  case R5:  case R6:  case R7:
+    return true;
+  default:
+    return false;
+  }
+}
+
+MachineInstr *ARMRegisterInfo::foldMemoryOperand(MachineInstr *MI,
+                                                 unsigned OpNum, int FI) const {
+  unsigned Opc = MI->getOpcode();
+  MachineInstr *NewMI = NULL;
+  switch (Opc) {
+  default: break;
+  case ARM::MOVr: {
+    if (MI->getOperand(4).getReg() == ARM::CPSR)
+      // If it is updating CPSR, then it cannot be folded.
+      break;
+    unsigned Pred = MI->getOperand(2).getImmedValue();
+    unsigned PredReg = MI->getOperand(3).getReg();
+    if (OpNum == 0) { // move -> store
+      unsigned SrcReg = MI->getOperand(1).getReg();
+      NewMI = BuildMI(TII.get(ARM::STR)).addReg(SrcReg).addFrameIndex(FI)
+        .addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
+    } else {          // move -> load
+      unsigned DstReg = MI->getOperand(0).getReg();
+      NewMI = BuildMI(TII.get(ARM::LDR), DstReg).addFrameIndex(FI).addReg(0)
+        .addImm(0).addImm(Pred).addReg(PredReg);
+    }
+    break;
+  }
+  case ARM::tMOVr: {
+    if (OpNum == 0) { // move -> store
+      unsigned SrcReg = MI->getOperand(1).getReg();
+      if (isPhysicalRegister(SrcReg) && !isLowRegister(SrcReg))
+        // tSpill cannot take a high register operand.
+        break;
+      NewMI = BuildMI(TII.get(ARM::tSpill)).addReg(SrcReg).addFrameIndex(FI)
+        .addImm(0);
+    } else {          // move -> load
+      unsigned DstReg = MI->getOperand(0).getReg();
+      if (isPhysicalRegister(DstReg) && !isLowRegister(DstReg))
+        // tRestore cannot target a high register operand.
+        break;
+      NewMI = BuildMI(TII.get(ARM::tRestore), DstReg).addFrameIndex(FI)
+        .addImm(0);
+    }
+    break;
+  }
+  case ARM::FCPYS: {
+    unsigned Pred = MI->getOperand(2).getImmedValue();
+    unsigned PredReg = MI->getOperand(3).getReg();
+    if (OpNum == 0) { // move -> store
+      unsigned SrcReg = MI->getOperand(1).getReg();
+      NewMI = BuildMI(TII.get(ARM::FSTS)).addReg(SrcReg).addFrameIndex(FI)
+        .addImm(0).addImm(Pred).addReg(PredReg);
+    } else {          // move -> load
+      unsigned DstReg = MI->getOperand(0).getReg();
+      NewMI = BuildMI(TII.get(ARM::FLDS), DstReg).addFrameIndex(FI)
+        .addImm(0).addImm(Pred).addReg(PredReg);
+    }
+    break;
+  }
+  case ARM::FCPYD: {
+    unsigned Pred = MI->getOperand(2).getImmedValue();
+    unsigned PredReg = MI->getOperand(3).getReg();
+    if (OpNum == 0) { // move -> store
+      unsigned SrcReg = MI->getOperand(1).getReg();
+      NewMI = BuildMI(TII.get(ARM::FSTD)).addReg(SrcReg).addFrameIndex(FI)
+        .addImm(0).addImm(Pred).addReg(PredReg);
+    } else {          // move -> load
+      unsigned DstReg = MI->getOperand(0).getReg();
+      NewMI = BuildMI(TII.get(ARM::FLDD), DstReg).addFrameIndex(FI)
+        .addImm(0).addImm(Pred).addReg(PredReg);
+    }
+    break;
+  }
+  }
+
+  if (NewMI)
+    NewMI->copyKillDeadInfo(MI);
+  return NewMI;
+}
+
+const unsigned* ARMRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+                                                                         const {
+  static const unsigned CalleeSavedRegs[] = {
+    ARM::LR, ARM::R11, ARM::R10, ARM::R9, ARM::R8,
+    ARM::R7, ARM::R6,  ARM::R5,  ARM::R4,
+
+    ARM::D15, ARM::D14, ARM::D13, ARM::D12,
+    ARM::D11, ARM::D10, ARM::D9,  ARM::D8,
+    0
+  };
+
+  static const unsigned DarwinCalleeSavedRegs[] = {
+    ARM::LR,  ARM::R7,  ARM::R6, ARM::R5, ARM::R4,
+    ARM::R11, ARM::R10, ARM::R9, ARM::R8,
+
+    ARM::D15, ARM::D14, ARM::D13, ARM::D12,
+    ARM::D11, ARM::D10, ARM::D9,  ARM::D8,
+    0
+  };
+  return STI.isTargetDarwin() ? DarwinCalleeSavedRegs : CalleeSavedRegs;
+}
+
+const TargetRegisterClass* const *
+ARMRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+    &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+    &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+    &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+
+    &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+    &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+    0
+  };
+  return CalleeSavedRegClasses;
+}
+
+BitVector ARMRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  // FIXME: avoid re-calculating this every time.
+  BitVector Reserved(getNumRegs());
+  Reserved.set(ARM::SP);
+  Reserved.set(ARM::PC);
+  if (STI.isTargetDarwin() || hasFP(MF))
+    Reserved.set(FramePtr);
+  // Some targets reserve R9.
+  if (STI.isR9Reserved())
+    Reserved.set(ARM::R9);
+  return Reserved;
+}
+
+bool
+ARMRegisterInfo::isReservedReg(const MachineFunction &MF, unsigned Reg) const {
+  switch (Reg) {
+  default: break;
+  case ARM::SP:
+  case ARM::PC:
+    return true;
+  case ARM::R7:
+  case ARM::R11:
+    if (FramePtr == Reg && (STI.isTargetDarwin() || hasFP(MF)))
+      return true;
+    break;
+  case ARM::R9:
+    return STI.isR9Reserved();
+  }
+
+  return false;
+}
+
+bool
+ARMRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  return ThumbRegScavenging || !AFI->isThumbFunction();
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.  This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
+///
+bool ARMRegisterInfo::hasFP(const MachineFunction &MF) const {
+  return NoFramePointerElim || MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+// not required, we reserve argument space for call sites in the function
+// immediately on entry to the current function. This eliminates the need for
+// add/sub sp brackets around call sites. Returns true if the call frame is
+// included as part of the stack frame.
+bool ARMRegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
+  const MachineFrameInfo *FFI = MF.getFrameInfo();
+  unsigned CFSize = FFI->getMaxCallFrameSize();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  // It's not always a good idea to include the call frame as part of the
+  // stack frame. ARM (especially Thumb) has only a small immediate offset
+  // range for addressing the stack frame, so a large call frame can cause
+  // poor codegen and may even make it impossible to scavenge a register.
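+  // Concretely, with the thresholds below a call frame of 510 bytes or more
+  // (Thumb) or 2047 bytes or more (ARM) is not folded into the fixed frame.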
+  if (AFI->isThumbFunction()) {
+    if (CFSize >= ((1 << 8) - 1) * 4 / 2) // Half of imm8 * 4
+      return false;
+  } else {
+    if (CFSize >= ((1 << 12) - 1) / 2)  // Half of imm12
+      return false;
+  }
+  return !hasFP(MF);
+}
+
+/// emitARMRegPlusImmediate - Emits a series of instructions to materialize
+/// a destreg = basereg + immediate in ARM code.
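+/// For example, dst = base + 1028 cannot be encoded as a single rotated 8-bit
+/// immediate, so it is split into two chunks and emitted roughly as:
+///   add dst, base, #4
+///   add dst, dst, #1024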
+static
+void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator &MBBI,
+                             unsigned DestReg, unsigned BaseReg, int NumBytes,
+                             ARMCC::CondCodes Pred, unsigned PredReg,
+                             const TargetInstrInfo &TII) {
+  bool isSub = NumBytes < 0;
+  if (isSub) NumBytes = -NumBytes;
+
+  while (NumBytes) {
+    unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes);
+    unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt);
+    assert(ThisVal && "Didn't extract field correctly");
+    
+    // We will handle these bits from offset, clear them.
+    NumBytes &= ~ThisVal;
+    
+    // Get the properly encoded SOImmVal field.
+    int SOImmVal = ARM_AM::getSOImmVal(ThisVal);
+    assert(SOImmVal != -1 && "Bit extraction didn't work?");
+    
+    // Build the new ADD / SUB.
+    BuildMI(MBB, MBBI, TII.get(isSub ? ARM::SUBri : ARM::ADDri), DestReg)
+      .addReg(BaseReg, false, false, true).addImm(SOImmVal)
+      .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+    BaseReg = DestReg;
+  }
+}
+
+/// calcNumMI - Returns the number of instructions required to materialize
+/// the specific add / sub r, c instruction.
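+/// For example, adjusting SP by 1024 with tADDrSPi covers 1020 bytes (255 * 4)
+/// in the first instruction and needs one trailing tADDi8 for the remaining 4
+/// bytes, so this returns 2.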
+static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes,
+                          unsigned NumBits, unsigned Scale) {
+  unsigned NumMIs = 0;
+  unsigned Chunk = ((1 << NumBits) - 1) * Scale;
+
+  if (Opc == ARM::tADDrSPi) {
+    unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
+    Bytes -= ThisVal;
+    NumMIs++;
+    NumBits = 8;
+    Scale = 1;  // Followed by a number of tADDi8.
+    Chunk = ((1 << NumBits) - 1) * Scale;
+  }
+
+  NumMIs += Bytes / Chunk;
+  if ((Bytes % Chunk) != 0)
+    NumMIs++;
+  if (ExtraOpc)
+    NumMIs++;
+  return NumMIs;
+}
+
+/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize
+/// a destreg = basereg + immediate in Thumb code. Materialize the immediate
+/// in a register using mov / mvn sequences or load the immediate from a
+/// constpool entry.
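+/// For example, sp = sp + 1000 with the condition register preserved is
+/// emitted roughly as: save r3 to r12, load 1000 into r3 from a constpool
+/// entry, add sp, r3, then restore r3 from r12.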
+static
+void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator &MBBI,
+                               unsigned DestReg, unsigned BaseReg,
+                               int NumBytes, bool CanChangeCC,
+                               const TargetInstrInfo &TII) {
+    bool isHigh = !isLowRegister(DestReg) ||
+                  (BaseReg != 0 && !isLowRegister(BaseReg));
+    bool isSub = false;
+    // Subtract doesn't have a high register version. Load the negative value
+    // if either the base or dest register is a high register. Also, do not
+    // issue sub as part of the sequence if the condition register is to be
+    // preserved.
+    if (NumBytes < 0 && !isHigh && CanChangeCC) {
+      isSub = true;
+      NumBytes = -NumBytes;
+    }
+    unsigned LdReg = DestReg;
+    if (DestReg == ARM::SP) {
+      assert(BaseReg == ARM::SP && "Unexpected!");
+      LdReg = ARM::R3;
+      BuildMI(MBB, MBBI, TII.get(ARM::tMOVr), ARM::R12)
+        .addReg(ARM::R3, false, false, true);
+    }
+
+    if (NumBytes <= 255 && NumBytes >= 0)
+      BuildMI(MBB, MBBI, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes);
+    else if (NumBytes < 0 && NumBytes >= -255) {
+      BuildMI(MBB, MBBI, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes);
+      BuildMI(MBB, MBBI, TII.get(ARM::tNEG), LdReg)
+        .addReg(LdReg, false, false, true);
+    } else
+      emitLoadConstPool(MBB, MBBI, LdReg, NumBytes, ARMCC::AL, 0, TII, true);
+
+    // Emit add / sub.
+    int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr);
+    const MachineInstrBuilder MIB = BuildMI(MBB, MBBI, TII.get(Opc), DestReg);
+    if (DestReg == ARM::SP || isSub)
+      MIB.addReg(BaseReg).addReg(LdReg, false, false, true);
+    else
+      MIB.addReg(LdReg).addReg(BaseReg, false, false, true);
+    if (DestReg == ARM::SP)
+      BuildMI(MBB, MBBI, TII.get(ARM::tMOVr), ARM::R3)
+        .addReg(ARM::R12, false, false, true);
+}
+
+/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
+/// a destreg = basereg + immediate in Thumb code.
+static
+void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator &MBBI,
+                               unsigned DestReg, unsigned BaseReg,
+                               int NumBytes, const TargetInstrInfo &TII) {
+  bool isSub = NumBytes < 0;
+  unsigned Bytes = (unsigned)NumBytes;
+  if (isSub) Bytes = -NumBytes;
+  bool isMul4 = (Bytes & 3) == 0;
+  bool isTwoAddr = false;
+  bool DstNotEqBase = false;
+  unsigned NumBits = 1;
+  unsigned Scale = 1;
+  int Opc = 0;
+  int ExtraOpc = 0;
+
+  if (DestReg == BaseReg && BaseReg == ARM::SP) {
+    assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
+    NumBits = 7;
+    Scale = 4;
+    Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+    isTwoAddr = true;
+  } else if (!isSub && BaseReg == ARM::SP) {
+    // r1 = add sp, 403
+    // =>
+    // r1 = add sp, 100 * 4
+    // r1 = add r1, 3
+    if (!isMul4) {
+      Bytes &= ~3;
+      ExtraOpc = ARM::tADDi3;
+    }
+    NumBits = 8;
+    Scale = 4;
+    Opc = ARM::tADDrSPi;
+  } else {
+    // sp = sub sp, c
+    // r1 = sub sp, c
+    // r8 = sub sp, c
+    if (DestReg != BaseReg)
+      DstNotEqBase = true;
+    NumBits = 8;
+    Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+    isTwoAddr = true;
+  }
+
+  unsigned NumMIs = calcNumMI(Opc, ExtraOpc, Bytes, NumBits, Scale);
+  unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2;
+  if (NumMIs > Threshold) {
+    // This will expand into too many instructions. Load the immediate from a
+    // constpool entry.
+    emitThumbRegPlusImmInReg(MBB, MBBI, DestReg, BaseReg, NumBytes, true, TII);
+    return;
+  }
+
+  if (DstNotEqBase) {
+    if (isLowRegister(DestReg) && isLowRegister(BaseReg)) {
+      // If both are low registers, emit DestReg = add BaseReg, min(Imm, 7)
+      unsigned Chunk = (1 << 3) - 1;
+      unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
+      Bytes -= ThisVal;
+      BuildMI(MBB, MBBI, TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3), DestReg)
+        .addReg(BaseReg, false, false, true).addImm(ThisVal);
+    } else {
+      BuildMI(MBB, MBBI, TII.get(ARM::tMOVr), DestReg)
+        .addReg(BaseReg, false, false, true);
+    }
+    BaseReg = DestReg;
+  }
+
+  unsigned Chunk = ((1 << NumBits) - 1) * Scale;
+  while (Bytes) {
+    unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
+    Bytes -= ThisVal;
+    ThisVal /= Scale;
+    // Build the new tADD / tSUB.
+    if (isTwoAddr)
+      BuildMI(MBB, MBBI, TII.get(Opc), DestReg).addReg(DestReg).addImm(ThisVal);
+    else {
+      bool isKill = BaseReg != ARM::SP;
+      BuildMI(MBB, MBBI, TII.get(Opc), DestReg)
+        .addReg(BaseReg, false, false, isKill).addImm(ThisVal);
+      BaseReg = DestReg;
+
+      if (Opc == ARM::tADDrSPi) {
+        // r4 = add sp, imm
+        // r4 = add r4, imm
+        // ...
+        NumBits = 8;
+        Scale = 1;
+        Chunk = ((1 << NumBits) - 1) * Scale;
+        Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+        isTwoAddr = true;
+      }
+    }
+  }
+
+  if (ExtraOpc)
+    BuildMI(MBB, MBBI, TII.get(ExtraOpc), DestReg)
+      .addReg(DestReg, false, false, true)
+      .addImm(((unsigned)NumBytes) & 3);
+}
+
+static
+void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                  int NumBytes, ARMCC::CondCodes Pred, unsigned PredReg,
+                  bool isThumb, const TargetInstrInfo &TII) {
+  if (isThumb)
+    emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes, TII);
+  else
+    emitARMRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes,
+                            Pred, PredReg, TII);
+}
+
+void ARMRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (!hasReservedCallFrame(MF)) {
+    // If we have alloca, convert as follows:
+    // ADJCALLSTACKDOWN -> sub, sp, sp, amount
+    // ADJCALLSTACKUP   -> add, sp, sp, amount
+    MachineInstr *Old = I;
+    unsigned Amount = Old->getOperand(0).getImmedValue();
+    if (Amount != 0) {
+      ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
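+      // For example, with an 8-byte stack alignment a 20-byte argument area
+      // is rounded up to 24 bytes.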
+
+      // Replace the pseudo instruction with a new instruction...
+      unsigned Opc = Old->getOpcode();
+      bool isThumb = AFI->isThumbFunction();
+      ARMCC::CondCodes Pred = isThumb
+        ? ARMCC::AL : (ARMCC::CondCodes)Old->getOperand(1).getImmedValue();
+      unsigned PredReg = isThumb ? 0 : Old->getOperand(2).getReg();
+      if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
+        emitSPUpdate(MBB, I, -Amount, Pred, PredReg, isThumb, TII);
+      } else {
+        assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+        emitSPUpdate(MBB, I, Amount, Pred, PredReg, isThumb, TII);
+      }
+    }
+  }
+  MBB.erase(I);
+}
+
+/// emitThumbConstant - Emit a series of instructions to materialize a
+/// constant.
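+/// For example, 300 is emitted roughly as "mov dst, #255" followed by
+/// "add dst, #45"; a negative constant additionally appends a neg.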
+static void emitThumbConstant(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator &MBBI,
+                              unsigned DestReg, int Imm,
+                              const TargetInstrInfo &TII) {
+  bool isSub = Imm < 0;
+  if (isSub) Imm = -Imm;
+
+  int Chunk = (1 << 8) - 1;
+  int ThisVal = (Imm > Chunk) ? Chunk : Imm;
+  Imm -= ThisVal;
+  BuildMI(MBB, MBBI, TII.get(ARM::tMOVi8), DestReg).addImm(ThisVal);
+  if (Imm > 0) 
+    emitThumbRegPlusImmediate(MBB, MBBI, DestReg, DestReg, Imm, TII);
+  if (isSub)
+    BuildMI(MBB, MBBI, TII.get(ARM::tNEG), DestReg)
+      .addReg(DestReg, false, false, true);
+}
+
+/// findScratchRegister - Find a 'free' ARM register. If the register scavenger
+/// is not being used, R12 is available. Otherwise, try a call-clobbered
+/// register first and then a spilled callee-saved register if that fails.
+static
+unsigned findScratchRegister(RegScavenger *RS, const TargetRegisterClass *RC,
+                             ARMFunctionInfo *AFI) {
+  unsigned Reg = RS ? RS->FindUnusedReg(RC, true) : (unsigned) ARM::R12;
+  if (Reg == 0)
+    // Try an already spilled CS register.
+    Reg = RS->FindUnusedReg(RC, AFI->getSpilledCSRegisters());
+
+  return Reg;
+}
+
+void ARMRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                          int SPAdj, RegScavenger *RS) const{
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  bool isThumb = AFI->isThumbFunction();
+
+  while (!MI.getOperand(i).isFrameIndex()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+  
+  unsigned FrameReg = ARM::SP;
+  int FrameIndex = MI.getOperand(i).getFrameIndex();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + 
+               MF.getFrameInfo()->getStackSize() + SPAdj;
+
+  if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex))
+    Offset -= AFI->getGPRCalleeSavedArea1Offset();
+  else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex))
+    Offset -= AFI->getGPRCalleeSavedArea2Offset();
+  else if (AFI->isDPRCalleeSavedAreaFrame(FrameIndex))
+    Offset -= AFI->getDPRCalleeSavedAreaOffset();
+  else if (hasFP(MF)) {
+    assert(SPAdj == 0 && "Unexpected");
+    // There are alloca()'s in this function; must reference off the frame
+    // pointer instead.
+    FrameReg = getFrameRegister(MF);
+    Offset -= AFI->getFramePtrSpillOffset();
+  }
+
+  unsigned Opcode = MI.getOpcode();
+  const TargetInstrDescriptor &Desc = TII.get(Opcode);
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  bool isSub = false;
+
+  if (Opcode == ARM::ADDri) {
+    Offset += MI.getOperand(i+1).getImm();
+    if (Offset == 0) {
+      // Turn it into a move.
+      MI.setInstrDescriptor(TII.get(ARM::MOVr));
+      MI.getOperand(i).ChangeToRegister(FrameReg, false);
+      MI.RemoveOperand(i+1);
+      return;
+    } else if (Offset < 0) {
+      Offset = -Offset;
+      isSub = true;
+      MI.setInstrDescriptor(TII.get(ARM::SUBri));
+    }
+
+    // Common case: small offset, fits into instruction.
+    int ImmedOffset = ARM_AM::getSOImmVal(Offset);
+    if (ImmedOffset != -1) {
+      // Replace the FrameIndex with sp / fp
+      MI.getOperand(i).ChangeToRegister(FrameReg, false);
+      MI.getOperand(i+1).ChangeToImmediate(ImmedOffset);
+      return;
+    }
+    
+    // Otherwise, we fall back to common code below to form the imm offset with
+    // a sequence of ADDri instructions.  First though, pull as much of the imm
+    // into this ADDri as possible.
+    unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset);
+    unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt);
+    
+    // We will handle these bits from offset, clear them.
+    Offset &= ~ThisImmVal;
+    
+    // Get the properly encoded SOImmVal field.
+    int ThisSOImmVal = ARM_AM::getSOImmVal(ThisImmVal);
+    assert(ThisSOImmVal != -1 && "Bit extraction didn't work?");    
+    MI.getOperand(i+1).ChangeToImmediate(ThisSOImmVal);
+  } else if (Opcode == ARM::tADDrSPi) {
+    Offset += MI.getOperand(i+1).getImm();
+
+    // Can't use tADDrSPi if it's based off the frame pointer.
+    unsigned NumBits = 0;
+    unsigned Scale = 1;
+    if (FrameReg != ARM::SP) {
+      Opcode = ARM::tADDi3;
+      MI.setInstrDescriptor(TII.get(ARM::tADDi3));
+      NumBits = 3;
+    } else {
+      NumBits = 8;
+      Scale = 4;
+      assert((Offset & 3) == 0 &&
+             "Thumb add/sub sp, #imm immediate must be multiple of 4!");
+    }
+
+    if (Offset == 0) {
+      // Turn it into a move.
+      MI.setInstrDescriptor(TII.get(ARM::tMOVr));
+      MI.getOperand(i).ChangeToRegister(FrameReg, false);
+      MI.RemoveOperand(i+1);
+      return;
+    }
+
+    // Common case: small offset, fits into instruction.
+    unsigned Mask = (1 << NumBits) - 1;
+    if (((Offset / Scale) & ~Mask) == 0) {
+      // Replace the FrameIndex with sp / fp
+      MI.getOperand(i).ChangeToRegister(FrameReg, false);
+      MI.getOperand(i+1).ChangeToImmediate(Offset / Scale);
+      return;
+    }
+
+    unsigned DestReg = MI.getOperand(0).getReg();
+    unsigned Bytes = (Offset > 0) ? Offset : -Offset;
+    unsigned NumMIs = calcNumMI(Opcode, 0, Bytes, NumBits, Scale);
+    // MI would expand into a large number of instructions. Don't try to
+    // simplify the immediate.
+    if (NumMIs > 2) {
+      emitThumbRegPlusImmediate(MBB, II, DestReg, FrameReg, Offset, TII);
+      MBB.erase(II);
+      return;
+    }
+
+    if (Offset > 0) {
+      // Translate r0 = add sp, imm to
+      // r0 = add sp, 255*4
+      // r0 = add r0, (imm - 255*4)
+      MI.getOperand(i).ChangeToRegister(FrameReg, false);
+      MI.getOperand(i+1).ChangeToImmediate(Mask);
+      Offset = (Offset - Mask * Scale);
+      MachineBasicBlock::iterator NII = next(II);
+      emitThumbRegPlusImmediate(MBB, NII, DestReg, DestReg, Offset, TII);
+    } else {
+      // Translate r0 = add sp, -imm to
+      // r0 = -imm (this is then translated into a series of instructions)
+      // r0 = add r0, sp
+      emitThumbConstant(MBB, II, DestReg, Offset, TII);
+      MI.setInstrDescriptor(TII.get(ARM::tADDhirr));
+      MI.getOperand(i).ChangeToRegister(DestReg, false, false, true);
+      MI.getOperand(i+1).ChangeToRegister(FrameReg, false);
+    }
+    return;
+  } else {
+    unsigned ImmIdx = 0;
+    int InstrOffs = 0;
+    unsigned NumBits = 0;
+    unsigned Scale = 1;
+    switch (AddrMode) {
+    case ARMII::AddrMode2: {
+      ImmIdx = i+2;
+      InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 12;
+      break;
+    }
+    case ARMII::AddrMode3: {
+      ImmIdx = i+2;
+      InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      break;
+    }
+    case ARMII::AddrMode5: {
+      ImmIdx = i+1;
+      InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      Scale = 4;
+      break;
+    }
+    case ARMII::AddrModeTs: {
+      ImmIdx = i+1;
+      InstrOffs = MI.getOperand(ImmIdx).getImm();
+      NumBits = (FrameReg == ARM::SP) ? 8 : 5;
+      Scale = 4;
+      break;
+    }
+    default:
+      assert(0 && "Unsupported addressing mode!");
+      abort();
+      break;
+    }
+
+    Offset += InstrOffs * Scale;
+    assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+    if (Offset < 0 && !isThumb) {
+      Offset = -Offset;
+      isSub = true;
+    }
+
+    // Common case: small offset, fits into instruction.
+    MachineOperand &ImmOp = MI.getOperand(ImmIdx);
+    int ImmedOffset = Offset / Scale;
+    unsigned Mask = (1 << NumBits) - 1;
+    if ((unsigned)Offset <= Mask * Scale) {
+      // Replace the FrameIndex with sp
+      MI.getOperand(i).ChangeToRegister(FrameReg, false);
+      if (isSub)
+        ImmedOffset |= 1 << NumBits;
+      ImmOp.ChangeToImmediate(ImmedOffset);
+      return;
+    }
+
+    bool isThumbSpillRestore = Opcode == ARM::tRestore || Opcode == ARM::tSpill;
+    if (AddrMode == ARMII::AddrModeTs) {
+      // Thumb tLDRspi, tSTRspi. These will change to instructions that use
+      // a different base register.
+      NumBits = 5;
+      Mask = (1 << NumBits) - 1;
+    }
+    // If this is a thumb spill / restore, we will be using a constpool load to
+    // materialize the offset.
+    if (AddrMode == ARMII::AddrModeTs && isThumbSpillRestore)
+      ImmOp.ChangeToImmediate(0);
+    else {
+      // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
+      ImmedOffset = ImmedOffset & Mask;
+      if (isSub)
+        ImmedOffset |= 1 << NumBits;
+      ImmOp.ChangeToImmediate(ImmedOffset);
+      Offset &= ~(Mask*Scale);
+    }
+  }
+  
+  // If we get here, the immediate doesn't fit into the instruction.  We folded
+  // as much as possible above, handle the rest, providing a register that is
+  // SP+LargeImm.
+  assert(Offset && "This code isn't needed if offset already handled!");
+
+  if (isThumb) {
+    if (TII.isLoad(Opcode)) {
+      // Use the destination register to materialize sp + offset.
+      unsigned TmpReg = MI.getOperand(0).getReg();
+      bool UseRR = false;
+      if (Opcode == ARM::tRestore) {
+        if (FrameReg == ARM::SP)
+          emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,Offset,false,TII);
+        else {
+          emitLoadConstPool(MBB, II, TmpReg, Offset, ARMCC::AL, 0, TII, true);
+          UseRR = true;
+        }
+      } else
+        emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII);
+      MI.setInstrDescriptor(TII.get(ARM::tLDR));
+      MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true);
+      if (UseRR)
+        MI.addRegOperand(FrameReg, false);  // Use [reg, reg] addrmode.
+      else
+        MI.addRegOperand(0, false); // tLDR has an extra register operand.
+    } else if (TII.isStore(Opcode)) {
+      // FIXME! This is horrific!!! We need register scavenging.
+      // Our temporary workaround has marked r3 unavailable. Of course, r3 is
+      // also an ABI register so it's possible that it is the register being
+      // stored here. If that's the case, we do the following:
+      // r12 = r2
+      // Use r2 to materialize sp + offset
+      // str r3, r2
+      // r2 = r12
+      unsigned ValReg = MI.getOperand(0).getReg();
+      unsigned TmpReg = ARM::R3;
+      bool UseRR = false;
+      if (ValReg == ARM::R3) {
+        BuildMI(MBB, II, TII.get(ARM::tMOVr), ARM::R12)
+          .addReg(ARM::R2, false, false, true);
+        TmpReg = ARM::R2;
+      }
+      if (TmpReg == ARM::R3 && AFI->isR3LiveIn())
+        BuildMI(MBB, II, TII.get(ARM::tMOVr), ARM::R12)
+          .addReg(ARM::R3, false, false, true);
+      if (Opcode == ARM::tSpill) {
+        if (FrameReg == ARM::SP)
+          emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,Offset,false,TII);
+        else {
+          emitLoadConstPool(MBB, II, TmpReg, Offset, ARMCC::AL, 0, TII, true);
+          UseRR = true;
+        }
+      } else
+        emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII);
+      MI.setInstrDescriptor(TII.get(ARM::tSTR));
+      MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true);
+      if (UseRR)
+        MI.addRegOperand(FrameReg, false);  // Use [reg, reg] addrmode.
+      else
+        MI.addRegOperand(0, false); // tSTR has an extra register operand.
+
+      MachineBasicBlock::iterator NII = next(II);
+      if (ValReg == ARM::R3)
+        BuildMI(MBB, NII, TII.get(ARM::tMOVr), ARM::R2)
+          .addReg(ARM::R12, false, false, true);
+      if (TmpReg == ARM::R3 && AFI->isR3LiveIn())
+        BuildMI(MBB, NII, TII.get(ARM::tMOVr), ARM::R3)
+          .addReg(ARM::R12, false, false, true);
+    } else
+      assert(false && "Unexpected opcode!");
+  } else {
+    // Insert a set of r12 with the full address: r12 = sp + offset
+    // If the offset we have is too large to fit into the instruction, we need
+    // to form it with a series of ADDri's.  Do this by taking 8-bit chunks
+    // out of 'Offset'.
+    unsigned ScratchReg = findScratchRegister(RS, &ARM::GPRRegClass, AFI);
+    if (ScratchReg == 0)
+      // No register is "free". Scavenge a register.
+      ScratchReg = RS->scavengeRegister(&ARM::GPRRegClass, II, SPAdj);
+    int PIdx = MI.findFirstPredOperandIdx();
+    ARMCC::CondCodes Pred = (PIdx == -1)
+      ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImmedValue();
+    unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg();
+    emitARMRegPlusImmediate(MBB, II, ScratchReg, FrameReg,
+                            isSub ? -Offset : Offset, Pred, PredReg, TII);
+    MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true);
+  }
+}
+
+static unsigned estimateStackSize(MachineFunction &MF, MachineFrameInfo *MFI) {
+  const MachineFrameInfo *FFI = MF.getFrameInfo();
+  int Offset = 0;
+  for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
+    int FixedOff = -FFI->getObjectOffset(i);
+    if (FixedOff > Offset) Offset = FixedOff;
+  }
+  for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
+    Offset += FFI->getObjectSize(i);
+    unsigned Align = FFI->getObjectAlignment(i);
+    // Adjust to alignment boundary
+    Offset = (Offset+Align-1)/Align*Align;
+  }
+  return (unsigned)Offset;
+}
+
+void
+ARMRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                      RegScavenger *RS) const {
+  // This tells PEI to spill the FP as if it were any other callee-save
+  // register to take advantage of the eliminateFrameIndex machinery. This
+  // also ensures it is spilled in the order specified by getCalleeSavedRegs()
+  // to make it easier to combine multiple loads / stores.
+  bool CanEliminateFrame = true;
+  bool CS1Spilled = false;
+  bool LRSpilled = false;
+  unsigned NumGPRSpills = 0;
+  SmallVector<unsigned, 4> UnspilledCS1GPRs;
+  SmallVector<unsigned, 4> UnspilledCS2GPRs;
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Don't spill FP if the frame can be eliminated. This is determined
+  // by scanning the callee-save registers to see if any is used.
+  const unsigned *CSRegs = getCalleeSavedRegs();
+  const TargetRegisterClass* const *CSRegClasses = getCalleeSavedRegClasses();
+  for (unsigned i = 0; CSRegs[i]; ++i) {
+    unsigned Reg = CSRegs[i];
+    bool Spilled = false;
+    if (MF.isPhysRegUsed(Reg)) {
+      AFI->setCSRegisterIsSpilled(Reg);
+      Spilled = true;
+      CanEliminateFrame = false;
+    } else {
+      // Check alias registers too.
+      for (const unsigned *Aliases = getAliasSet(Reg); *Aliases; ++Aliases) {
+        if (MF.isPhysRegUsed(*Aliases)) {
+          Spilled = true;
+          CanEliminateFrame = false;
+        }
+      }
+    }
+
+    if (CSRegClasses[i] == &ARM::GPRRegClass) {
+      if (Spilled) {
+        NumGPRSpills++;
+
+        if (!STI.isTargetDarwin()) {
+          if (Reg == ARM::LR)
+            LRSpilled = true;
+          CS1Spilled = true;
+          continue;
+        }
+
+        // Keep track if LR and any of R4, R5, R6, and R7 is spilled.
+        switch (Reg) {
+        case ARM::LR:
+          LRSpilled = true;
+          // Fallthrough
+        case ARM::R4:
+        case ARM::R5:
+        case ARM::R6:
+        case ARM::R7:
+          CS1Spilled = true;
+          break;
+        default:
+          break;
+        }
+      } else { 
+        if (!STI.isTargetDarwin()) {
+          UnspilledCS1GPRs.push_back(Reg);
+          continue;
+        }
+
+        switch (Reg) {
+        case ARM::R4:
+        case ARM::R5:
+        case ARM::R6:
+        case ARM::R7:
+        case ARM::LR:
+          UnspilledCS1GPRs.push_back(Reg);
+          break;
+        default:
+          UnspilledCS2GPRs.push_back(Reg);
+          break;
+        }
+      }
+    }
+  }
+
+  bool ForceLRSpill = false;
+  if (!LRSpilled && AFI->isThumbFunction()) {
+    unsigned FnSize = ARM::GetFunctionSize(MF);
+    // Force LR to be spilled if the Thumb function size is > 2048. This
+    // enables the use of BL to implement a far jump. If it turns out that it's
+    // not needed, then the branch fixup code will undo it.
+    if (FnSize >= (1 << 11)) {
+      CanEliminateFrame = false;
+      ForceLRSpill = true;
+    }
+  }
+
+  bool ExtraCSSpill = false;
+  if (!CanEliminateFrame || hasFP(MF)) {
+    AFI->setHasStackFrame(true);
+
+    // If LR is not spilled, but at least one of R4, R5, R6, or R7 is spilled,
+    // spill LR as well so we can fold BX_RET into the register restore (LDM).
+    if (!LRSpilled && CS1Spilled) {
+      MF.setPhysRegUsed(ARM::LR);
+      AFI->setCSRegisterIsSpilled(ARM::LR);
+      NumGPRSpills++;
+      UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(),
+                                    UnspilledCS1GPRs.end(), (unsigned)ARM::LR));
+      ForceLRSpill = false;
+      ExtraCSSpill = true;
+    }
+
+    // Darwin ABI requires FP to point to the stack slot that contains the
+    // previous FP.
+    if (STI.isTargetDarwin() || hasFP(MF)) {
+      MF.setPhysRegUsed(FramePtr);
+      NumGPRSpills++;
+    }
+
+    // If the stack and doubles are 8-byte aligned and we are spilling an odd
+    // number of GPRs, spill one extra callee-saved GPR so we won't have to pad
+    // between the integer and double callee-save areas.
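+    // For example, spilling five GPRs (20 bytes) would start the double spill
+    // area at an address that is only 4-byte aligned; spilling a sixth GPR
+    // avoids the pad word.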
+    unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+    if (TargetAlign == 8 && (NumGPRSpills & 1)) {
+      if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
+        for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
+          unsigned Reg = UnspilledCS1GPRs[i];
+          // Don't spill a high register if the function is Thumb.
+          if (!AFI->isThumbFunction() || isLowRegister(Reg) || Reg == ARM::LR) {
+            MF.setPhysRegUsed(Reg);
+            AFI->setCSRegisterIsSpilled(Reg);
+            if (!isReservedReg(MF, Reg))
+              ExtraCSSpill = true;
+            break;
+          }
+        }
+      } else if (!UnspilledCS2GPRs.empty() &&
+                 !AFI->isThumbFunction()) {
+        unsigned Reg = UnspilledCS2GPRs.front();
+        MF.setPhysRegUsed(Reg);
+        AFI->setCSRegisterIsSpilled(Reg);
+        if (!isReservedReg(MF, Reg))
+          ExtraCSSpill = true;
+      }
+    }
+
+    // Estimate if we might need to scavenge a register at some point in order
+    // to materialize a stack offset. If so, either spill one additional
+    // callee-saved register or reserve a special spill slot to facilitate
+    // register scavenging.
+    if (RS && !ExtraCSSpill && !AFI->isThumbFunction()) {
+      MachineFrameInfo  *MFI = MF.getFrameInfo();
+      unsigned Size = estimateStackSize(MF, MFI);
+      unsigned Limit = (1 << 12) - 1;
+      for (MachineFunction::iterator BB = MF.begin(),E = MF.end();BB != E; ++BB)
+        for (MachineBasicBlock::iterator I= BB->begin(); I != BB->end(); ++I) {
+          for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+            if (I->getOperand(i).isFrameIndex()) {
+              unsigned Opcode = I->getOpcode();
+              const TargetInstrDescriptor &Desc = TII.get(Opcode);
+              unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+              if (AddrMode == ARMII::AddrMode3) {
+                Limit = (1 << 8) - 1;
+                goto DoneEstimating;
+              } else if (AddrMode == ARMII::AddrMode5) {
+                unsigned ThisLimit = ((1 << 8) - 1) * 4;
+                if (ThisLimit < Limit)
+                  Limit = ThisLimit;
+              }
+            }
+        }
+    DoneEstimating:
+      if (Size >= Limit) {
+        // If any non-reserved CS register isn't spilled, just spill one or two
+        // extra. That should take care of it!
+        unsigned NumExtras = TargetAlign / 4;
+        SmallVector<unsigned, 2> Extras;
+        while (NumExtras && !UnspilledCS1GPRs.empty()) {
+          unsigned Reg = UnspilledCS1GPRs.back();
+          UnspilledCS1GPRs.pop_back();
+          if (!isReservedReg(MF, Reg)) {
+            Extras.push_back(Reg);
+            NumExtras--;
+          }
+        }
+        while (NumExtras && !UnspilledCS2GPRs.empty()) {
+          unsigned Reg = UnspilledCS2GPRs.back();
+          UnspilledCS2GPRs.pop_back();
+          if (!isReservedReg(MF, Reg)) {
+            Extras.push_back(Reg);
+            NumExtras--;
+          }
+        }
+        if (Extras.size() && NumExtras == 0) {
+          for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
+            MF.setPhysRegUsed(Extras[i]);
+            AFI->setCSRegisterIsSpilled(Extras[i]);
+          }
+        } else {
+          // Reserve a slot closest to SP or frame pointer.
+          const TargetRegisterClass *RC = &ARM::GPRRegClass;
+          RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                           RC->getAlignment()));
+        }
+      }
+    }
+  }
+
+  if (ForceLRSpill) {
+    MF.setPhysRegUsed(ARM::LR);
+    AFI->setCSRegisterIsSpilled(ARM::LR);
+    AFI->setLRIsSpilledForFarJump(true);
+  }
+}
+
+/// Move the iterator past the next bunch of callee-save load / store ops for
+/// the particular spill area (1: integer area 1, 2: integer area 2,
+/// 3: fp area, 0: don't care).
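+/// Area 1 holds r4-r7 and LR (plus r8-r11 on non-Darwin targets), area 2
+/// holds r8-r11 on Darwin, and area 3 holds d8-d15.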
+static void movePastCSLoadStoreOps(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator &MBBI,
+                                   int Opc, unsigned Area,
+                                   const ARMSubtarget &STI) {
+  while (MBBI != MBB.end() &&
+         MBBI->getOpcode() == Opc && MBBI->getOperand(1).isFrameIndex()) {
+    if (Area != 0) {
+      bool Done = false;
+      unsigned Category = 0;
+      switch (MBBI->getOperand(0).getReg()) {
+      case ARM::R4:  case ARM::R5:  case ARM::R6: case ARM::R7:
+      case ARM::LR:
+        Category = 1;
+        break;
+      case ARM::R8:  case ARM::R9:  case ARM::R10: case ARM::R11:
+        Category = STI.isTargetDarwin() ? 2 : 1;
+        break;
+      case ARM::D8:  case ARM::D9:  case ARM::D10: case ARM::D11:
+      case ARM::D12: case ARM::D13: case ARM::D14: case ARM::D15:
+        Category = 3;
+        break;
+      default:
+        Done = true;
+        break;
+      }
+      if (Done || Category != Area)
+        break;
+    }
+
+    ++MBBI;
+  }
+}
+
+void ARMRegisterInfo::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo  *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  bool isThumb = AFI->isThumbFunction();
+  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+  unsigned NumBytes = MFI->getStackSize();
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+  if (isThumb) {
+    // Check if R3 is live in. It might have to be used as a scratch register.
+    for (MachineFunction::livein_iterator I=MF.livein_begin(),E=MF.livein_end();
+         I != E; ++I) {
+      if ((*I).first == ARM::R3) {
+        AFI->setR3IsLiveIn(true);
+        break;
+      }
+    }
+
+    // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4.
+    NumBytes = (NumBytes + 3) & ~3;
+    MFI->setStackSize(NumBytes);
+  }
+
+  // Determine the size of each callee-save spill area and record which frame
+  // index belongs to which callee-save spill area.
+  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+  int FramePtrSpillFI = 0;
+
+  if (VARegSaveSize)
+    emitSPUpdate(MBB, MBBI, -VARegSaveSize, ARMCC::AL, 0, isThumb, TII);
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes != 0)
+      emitSPUpdate(MBB, MBBI, -NumBytes, ARMCC::AL, 0, isThumb, TII);
+    return;
+  }
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    int FI = CSI[i].getFrameIdx();
+    switch (Reg) {
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      AFI->addGPRCalleeSavedArea1Frame(FI);
+      GPRCS1Size += 4;
+      break;
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      if (STI.isTargetDarwin()) {
+        AFI->addGPRCalleeSavedArea2Frame(FI);
+        GPRCS2Size += 4;
+      } else {
+        AFI->addGPRCalleeSavedArea1Frame(FI);
+        GPRCS1Size += 4;
+      }
+      break;
+    default:
+      AFI->addDPRCalleeSavedAreaFrame(FI);
+      DPRCSSize += 8;
+    }
+  }
+
+  if (!isThumb) {
+    // Build the new SUBri to adjust SP for integer callee-save spill area 1.
+    emitSPUpdate(MBB, MBBI, -GPRCS1Size, ARMCC::AL, 0, isThumb, TII);
+    movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, 1, STI);
+  } else if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH)
+    ++MBBI;
+
+  // Darwin ABI requires FP to point to the stack slot that contains the
+  // previous FP.
+  if (STI.isTargetDarwin() || hasFP(MF)) {
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, TII.get(isThumb ? ARM::tADDrSPi : ARM::ADDri),FramePtr)
+      .addFrameIndex(FramePtrSpillFI).addImm(0);
+    if (!isThumb) MIB.addImm(ARMCC::AL).addReg(0).addReg(0);
+  }
+
+  if (!isThumb) {
+    // Build the new SUBri to adjust SP for integer callee-save spill area 2.
+    emitSPUpdate(MBB, MBBI, -GPRCS2Size, ARMCC::AL, 0, false, TII);
+
+    // Build the new SUBri to adjust SP for FP callee-save spill area.
+    movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, 2, STI);
+    emitSPUpdate(MBB, MBBI, -DPRCSSize, ARMCC::AL, 0, false, TII);
+  }
+
+  // Determine starting offsets of spill areas.
+  unsigned DPRCSOffset  = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
+  unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
+  unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
+  AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes);
+  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+  
+  NumBytes = DPRCSOffset;
+  if (NumBytes) {
+    // Insert it after all the callee-save spills.
+    if (!isThumb)
+      movePastCSLoadStoreOps(MBB, MBBI, ARM::FSTD, 3, STI);
+    emitSPUpdate(MBB, MBBI, -NumBytes, ARMCC::AL, 0, isThumb, TII);
+  }
+
+  if (STI.isTargetELF() && hasFP(MF)) {
+    MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
+                             AFI->getFramePtrSpillOffset());
+  }
+
+  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+}
+
+static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) {
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    if (Reg == CSRegs[i])
+      return true;
+  return false;
+}
+
+static bool isCSRestore(MachineInstr *MI, const unsigned *CSRegs) {
+  return ((MI->getOpcode() == ARM::FLDD ||
+           MI->getOpcode() == ARM::LDR  ||
+           MI->getOpcode() == ARM::tRestore) &&
+          MI->getOperand(1).isFrameIndex() &&
+          isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs));
+}
+
+void ARMRegisterInfo::emitEpilogue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  assert((MBBI->getOpcode() == ARM::BX_RET ||
+          MBBI->getOpcode() == ARM::tBX_RET ||
+          MBBI->getOpcode() == ARM::tPOP_RET) &&
+         "Can only insert epilog into returning blocks");
+
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  bool isThumb = AFI->isThumbFunction();
+  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+  int NumBytes = (int)MFI->getStackSize();
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes != 0)
+      emitSPUpdate(MBB, MBBI, NumBytes, ARMCC::AL, 0, isThumb, TII);
+  } else {
+    // Unwind MBBI to point to first LDR / FLDD.
+    const unsigned *CSRegs = getCalleeSavedRegs();
+    if (MBBI != MBB.begin()) {
+      do
+        --MBBI;
+      while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs));
+      if (!isCSRestore(MBBI, CSRegs))
+        ++MBBI;
+    }
+
+    // Move SP to start of FP callee save spill area.
+    NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
+                 AFI->getGPRCalleeSavedArea2Size() +
+                 AFI->getDPRCalleeSavedAreaSize());
+    if (isThumb) {
+      if (hasFP(MF)) {
+        NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+        // Reset SP based on frame pointer only if the stack frame extends beyond
+        // frame pointer stack slot or target is ELF and the function has FP.
+        if (NumBytes)
+          emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, FramePtr, -NumBytes, TII);
+        else
+          BuildMI(MBB, MBBI, TII.get(ARM::tMOVr), ARM::SP).addReg(FramePtr);
+      } else {
+        if (MBBI->getOpcode() == ARM::tBX_RET &&
+            &MBB.front() != MBBI &&
+            prior(MBBI)->getOpcode() == ARM::tPOP) {
+          MachineBasicBlock::iterator PMBBI = prior(MBBI);
+          emitSPUpdate(MBB, PMBBI, NumBytes, ARMCC::AL, 0, isThumb, TII);
+        } else
+          emitSPUpdate(MBB, MBBI, NumBytes, ARMCC::AL, 0, isThumb, TII);
+      }
+    } else {
+      // Darwin ABI requires FP to point to the stack slot that contains the
+      // previous FP.
+      if ((STI.isTargetDarwin() && NumBytes) || hasFP(MF)) {
+        NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+        // Reset SP based on frame pointer only if the stack frame extends beyond
+        // frame pointer stack slot or target is ELF and the function has FP.
+        if (AFI->getGPRCalleeSavedArea2Size() ||
+            AFI->getDPRCalleeSavedAreaSize()  ||
+            AFI->getDPRCalleeSavedAreaOffset()||
+            hasFP(MF))
+          if (NumBytes)
+            BuildMI(MBB, MBBI, TII.get(ARM::SUBri), ARM::SP).addReg(FramePtr)
+              .addImm(NumBytes)
+              .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+          else
+            BuildMI(MBB, MBBI, TII.get(ARM::MOVr), ARM::SP).addReg(FramePtr)
+              .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+      } else if (NumBytes) {
+        emitSPUpdate(MBB, MBBI, NumBytes, ARMCC::AL, 0, false, TII);
+      }
+
+      // Move SP to start of integer callee save spill area 2.
+      movePastCSLoadStoreOps(MBB, MBBI, ARM::FLDD, 3, STI);
+      emitSPUpdate(MBB, MBBI, AFI->getDPRCalleeSavedAreaSize(), ARMCC::AL, 0,
+                   false, TII);
+
+      // Move SP to start of integer callee save spill area 1.
+      movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, 2, STI);
+      emitSPUpdate(MBB, MBBI, AFI->getGPRCalleeSavedArea2Size(), ARMCC::AL, 0,
+                   false, TII);
+
+      // Move SP to SP upon entry to the function.
+      movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, 1, STI);
+      emitSPUpdate(MBB, MBBI, AFI->getGPRCalleeSavedArea1Size(), ARMCC::AL, 0,
+                   false, TII);
+    }
+  }
+
+  if (VARegSaveSize) {
+    if (isThumb)
+      // Epilogue for vararg functions: pop LR to R3 and branch off it.
+      // FIXME: Verify this is still ok when R3 is no longer being reserved.
+      BuildMI(MBB, MBBI, TII.get(ARM::tPOP)).addReg(ARM::R3);
+
+    emitSPUpdate(MBB, MBBI, VARegSaveSize, ARMCC::AL, 0, isThumb, TII);
+
+    if (isThumb) {
+      BuildMI(MBB, MBBI, TII.get(ARM::tBX_RET_vararg)).addReg(ARM::R3);
+      MBB.erase(MBBI);
+    }
+  }
+}
+
+unsigned ARMRegisterInfo::getRARegister() const {
+  return ARM::LR;
+}
+
+unsigned ARMRegisterInfo::getFrameRegister(MachineFunction &MF) const {
+  if (STI.isTargetDarwin() || hasFP(MF))
+    return (STI.useThumbBacktraces() || STI.isThumb()) ? ARM::R7 : ARM::R11;
+  else
+    return ARM::SP;
+}
+
+unsigned ARMRegisterInfo::getEHExceptionRegister() const {
+  assert(0 && "What is the exception register");
+  return 0;
+}
+
+unsigned ARMRegisterInfo::getEHHandlerRegister() const {
+  assert(0 && "What is the exception handler register");
+  return 0;
+}
+
+#include "ARMGenRegisterInfo.inc"
+
diff --git a/lib/Target/ARM/ARMRegisterInfo.h b/lib/Target/ARM/ARMRegisterInfo.h
new file mode 100644
index 0000000..3db1d89
--- /dev/null
+++ b/lib/Target/ARM/ARMRegisterInfo.h
@@ -0,0 +1,108 @@
+//===- ARMRegisterInfo.h - ARM Register Information Impl --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the "Instituto Nokia de Tecnologia" and
+// is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMREGISTERINFO_H
+#define ARMREGISTERINFO_H
+
+#include "llvm/Target/MRegisterInfo.h"
+#include "ARMGenRegisterInfo.h.inc"
+
+namespace llvm {
+  class ARMSubtarget;
+  class TargetInstrInfo;
+  class Type;
+
+struct ARMRegisterInfo : public ARMGenRegisterInfo {
+  const TargetInstrInfo &TII;
+  const ARMSubtarget &STI;
+private:
+  /// FramePtr - ARM physical register used as frame ptr.
+  unsigned FramePtr;
+
+public:
+  ARMRegisterInfo(const TargetInstrInfo &tii, const ARMSubtarget &STI);
+
+  /// getRegisterNumbering - Given the enum value for some register, e.g.
+  /// ARM::LR, return the number that it corresponds to (e.g. 14).
+  static unsigned getRegisterNumbering(unsigned RegEnum);
+
+  /// Code Generation virtual methods...
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI) const;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI) const;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, int FrameIndex,
+                           const TargetRegisterClass *RC) const;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC) const;
+
+  void copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                    unsigned DestReg, unsigned SrcReg,
+                    const TargetRegisterClass *RC) const;
+
+  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                     unsigned DestReg, const MachineInstr *Orig) const;
+
+  MachineInstr* foldMemoryOperand(MachineInstr* MI, unsigned OpNum,
+                                  int FrameIndex) const;
+
+  const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+  const TargetRegisterClass* const* getCalleeSavedRegClasses(
+                                     const MachineFunction *MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+
+  bool hasReservedCallFrame(MachineFunction &MF) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, RegScavenger *RS = NULL) const;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS = NULL) const;
+
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(MachineFunction &MF) const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
new file mode 100644
index 0000000..3d2646e
--- /dev/null
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -0,0 +1,196 @@
+//===- ARMRegisterInfo.td - ARM Register defs -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the "Instituto Nokia de Tecnologia" and
+// is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the ARM register file
+//===----------------------------------------------------------------------===//
+
+// Registers are identified with 4-bit ID numbers.
+class ARMReg<bits<4> num, string n, list<Register> subregs = []> : Register<n> {
+  field bits<4> Num;
+  let Namespace = "ARM";
+  let SubRegs = subregs;
+}
+
+class ARMFReg<bits<5> num, string n> : Register<n> {
+  field bits<5> Num;
+  let Namespace = "ARM";
+}
+
+// Integer registers
+def R0  : ARMReg< 0, "r0">,  DwarfRegNum<0>;
+def R1  : ARMReg< 1, "r1">,  DwarfRegNum<1>;
+def R2  : ARMReg< 2, "r2">,  DwarfRegNum<2>;
+def R3  : ARMReg< 3, "r3">,  DwarfRegNum<3>;
+def R4  : ARMReg< 4, "r4">,  DwarfRegNum<4>;
+def R5  : ARMReg< 5, "r5">,  DwarfRegNum<5>;
+def R6  : ARMReg< 6, "r6">,  DwarfRegNum<6>;
+def R7  : ARMReg< 7, "r7">,  DwarfRegNum<7>;
+def R8  : ARMReg< 8, "r8">,  DwarfRegNum<8>;
+def R9  : ARMReg< 9, "r9">,  DwarfRegNum<9>;
+def R10 : ARMReg<10, "r10">, DwarfRegNum<10>;
+def R11 : ARMReg<11, "r11">, DwarfRegNum<11>;
+def R12 : ARMReg<12, "r12">, DwarfRegNum<12>;
+def SP  : ARMReg<13, "sp">,  DwarfRegNum<13>;
+def LR  : ARMReg<14, "lr">,  DwarfRegNum<14>;
+def PC  : ARMReg<15, "pc">,  DwarfRegNum<15>;
+
+// Float registers
+def S0  : ARMFReg< 0, "s0">;  def S1  : ARMFReg< 1, "s1">;
+def S2  : ARMFReg< 2, "s2">;  def S3  : ARMFReg< 3, "s3">;
+def S4  : ARMFReg< 4, "s4">;  def S5  : ARMFReg< 5, "s5">;
+def S6  : ARMFReg< 6, "s6">;  def S7  : ARMFReg< 7, "s7">;
+def S8  : ARMFReg< 8, "s8">;  def S9  : ARMFReg< 9, "s9">;
+def S10 : ARMFReg<10, "s10">; def S11 : ARMFReg<11, "s11">;
+def S12 : ARMFReg<12, "s12">; def S13 : ARMFReg<13, "s13">;
+def S14 : ARMFReg<14, "s14">; def S15 : ARMFReg<15, "s15">;
+def S16 : ARMFReg<16, "s16">; def S17 : ARMFReg<17, "s17">;
+def S18 : ARMFReg<18, "s18">; def S19 : ARMFReg<19, "s19">;
+def S20 : ARMFReg<20, "s20">; def S21 : ARMFReg<21, "s21">;
+def S22 : ARMFReg<22, "s22">; def S23 : ARMFReg<23, "s23">;
+def S24 : ARMFReg<24, "s24">; def S25 : ARMFReg<25, "s25">;
+def S26 : ARMFReg<26, "s26">; def S27 : ARMFReg<27, "s27">;
+def S28 : ARMFReg<28, "s28">; def S29 : ARMFReg<29, "s29">;
+def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">;
+
+// Aliases of the F* registers used to hold 64-bit fp values (doubles)
+def D0  : ARMReg< 0,  "d0", [S0,   S1]>;
+def D1  : ARMReg< 1,  "d1", [S2,   S3]>; 
+def D2  : ARMReg< 2,  "d2", [S4,   S5]>;
+def D3  : ARMReg< 3,  "d3", [S6,   S7]>;
+def D4  : ARMReg< 4,  "d4", [S8,   S9]>;
+def D5  : ARMReg< 5,  "d5", [S10, S11]>;
+def D6  : ARMReg< 6,  "d6", [S12, S13]>;
+def D7  : ARMReg< 7,  "d7", [S14, S15]>;
+def D8  : ARMReg< 8,  "d8", [S16, S17]>;
+def D9  : ARMReg< 9,  "d9", [S18, S19]>;
+def D10 : ARMReg<10, "d10", [S20, S21]>;
+def D11 : ARMReg<11, "d11", [S22, S23]>;
+def D12 : ARMReg<12, "d12", [S24, S25]>;
+def D13 : ARMReg<13, "d13", [S26, S27]>;
+def D14 : ARMReg<14, "d14", [S28, S29]>;
+def D15 : ARMReg<15, "d15", [S30, S31]>;
+
+// Current Program Status Register.
+def CPSR : ARMReg<0, "cpsr">;
+
+// Register classes.
+//
+// pc  == Program Counter
+// lr  == Link Register
+// sp  == Stack Pointer
+// r12 == ip (scratch)
+// r7  == Frame Pointer (thumb-style backtraces)
+// r11 == Frame Pointer (arm-style backtraces)
+// r10 == Stack Limit
+//
+def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
+                                           R7, R8, R9, R10, R12, R11,
+                                           LR, SP, PC]> {
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  // FIXME: We are reserving r12 in case the PEI needs to use it to
+  // generate large stack offset. Make it available once we have register
+  // scavenging. Similarly r3 is reserved in Thumb mode for now.
+  let MethodBodies = [{
+    // FP is R11, R9 is available.
+    static const unsigned ARM_GPR_AO_1[] = {
+      ARM::R3, ARM::R2, ARM::R1, ARM::R0,
+      ARM::R12,ARM::LR,
+      ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+      ARM::R8, ARM::R9, ARM::R10,
+      ARM::R11 };
+    // FP is R11, R9 is not available.
+    static const unsigned ARM_GPR_AO_2[] = {
+      ARM::R3, ARM::R2, ARM::R1, ARM::R0,
+      ARM::R12,ARM::LR,
+      ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+      ARM::R8, ARM::R10,
+      ARM::R11 };
+    // FP is R7, R9 is available.
+    static const unsigned ARM_GPR_AO_3[] = {
+      ARM::R3, ARM::R2, ARM::R1, ARM::R0,
+      ARM::R12,ARM::LR,
+      ARM::R4, ARM::R5, ARM::R6,
+      ARM::R8, ARM::R9, ARM::R10,ARM::R11,
+      ARM::R7 };
+    // FP is R7, R9 is not available.
+    static const unsigned ARM_GPR_AO_4[] = {
+      ARM::R3, ARM::R2, ARM::R1, ARM::R0,
+      ARM::R12,ARM::LR,
+      ARM::R4, ARM::R5, ARM::R6,
+      ARM::R8, ARM::R10,ARM::R11,
+      ARM::R7 };
+
+    // FP is R7, only low registers available.
+    static const unsigned THUMB_GPR_AO[] = {
+      ARM::R2, ARM::R1, ARM::R0,
+      ARM::R4, ARM::R5, ARM::R6, ARM::R7 };
+
+    GPRClass::iterator
+    GPRClass::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
+      if (Subtarget.isThumb())
+        return THUMB_GPR_AO;
+      if (Subtarget.useThumbBacktraces()) {
+        if (Subtarget.isR9Reserved())
+          return ARM_GPR_AO_4;
+        else
+          return ARM_GPR_AO_3;
+      } else {
+        if (Subtarget.isR9Reserved())
+          return ARM_GPR_AO_2;
+        else
+          return ARM_GPR_AO_1;
+      }
+    }
+
+    GPRClass::iterator
+    GPRClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MRegisterInfo *RI = TM.getRegisterInfo();
+      const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
+      GPRClass::iterator I;
+      if (Subtarget.isThumb())
+        I = THUMB_GPR_AO + (sizeof(THUMB_GPR_AO)/sizeof(unsigned));
+      else if (Subtarget.useThumbBacktraces()) {
+        if (Subtarget.isR9Reserved()) {
+          I = ARM_GPR_AO_4 + (sizeof(ARM_GPR_AO_4)/sizeof(unsigned));
+        } else {
+          I = ARM_GPR_AO_3 + (sizeof(ARM_GPR_AO_3)/sizeof(unsigned));
+        }
+      } else {
+        if (Subtarget.isR9Reserved()) {
+          I = ARM_GPR_AO_2 + (sizeof(ARM_GPR_AO_2)/sizeof(unsigned));
+        } else {
+          I = ARM_GPR_AO_1 + (sizeof(ARM_GPR_AO_1)/sizeof(unsigned));
+        }
+      }
+
+      // Mac OS X requires FP not to be clobbered for backtracing purposes.
+      return (Subtarget.isTargetDarwin() || RI->hasFP(MF)) ? I-1 : I;
+    }
+  }];
+}
+
+def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8,
+  S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22,
+  S23, S24, S25, S26, S27, S28, S29, S30, S31]>;
+
+// ARM requires only word alignment for doubles, though double-word alignment
+// gives better performance.
+def DPR : RegisterClass<"ARM", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7, D8,
+  D9, D10, D11, D12, D13, D14, D15]>;
+
+// Condition code registers.
+def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>;
diff --git a/lib/Target/ARM/ARMRelocations.h b/lib/Target/ARM/ARMRelocations.h
new file mode 100644
index 0000000..beea52b
--- /dev/null
+++ b/lib/Target/ARM/ARMRelocations.h
@@ -0,0 +1,28 @@
+//===- ARMRelocations.h - ARM Code Relocations ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Raul Herbster and is distributed under the
+// University of Illinois Open Source License.  See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMRELOCATIONS_H
+#define ARMRELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace ARM {
+    enum RelocationType {
+
+    };
+  }
+}
+
+#endif
+
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
new file mode 100644
index 0000000..6db36df
--- /dev/null
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -0,0 +1,57 @@
+//===-- ARMSubtarget.cpp - ARM Subtarget Information ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMSubtarget.h"
+#include "ARMGenSubtarget.inc"
+#include "llvm/Module.h"
+using namespace llvm;
+
+ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS, bool thumb)
+  : ARMArchVersion(V4T)
+  , HasVFP2(false)
+  , IsThumb(thumb)
+  , UseThumbBacktraces(false)
+  , IsR9Reserved(false)
+  , stackAlignment(4)
+  , TargetType(isELF) // Default to ELF unless otherwise specified.
+  , TargetABI(ARM_ABI_APCS) {
+
+  // Determine default and user specified characteristics
+  std::string CPU = "generic";
+
+  // Parse features string.
+  ParseSubtargetFeatures(FS, CPU);
+
+  // Set the boolean corresponding to the current target triple, or the default
+  // if one cannot be determined, to true.
+  const std::string& TT = M.getTargetTriple();
+  if (TT.length() > 5) {
+    if (TT.find("-darwin") != std::string::npos)
+      TargetType = isDarwin;
+  } else if (TT.empty()) {
+#if defined(__APPLE__)
+    TargetType = isDarwin;
+#endif
+  }
+
+  if (TT.find("eabi") != std::string::npos)
+    TargetABI = ARM_ABI_AAPCS;
+
+  if (isAAPCS_ABI())
+    stackAlignment = 8;
+
+  if (isTargetDarwin()) {
+    UseThumbBacktraces = true;
+    IsR9Reserved = true;
+  }
+}
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
new file mode 100644
index 0000000..62367ca
--- /dev/null
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -0,0 +1,94 @@
+//=====---- ARMSubtarget.h - Define Subtarget for the ARM -----*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMSUBTARGET_H
+#define ARMSUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+#include <string>
+
+namespace llvm {
+class Module;
+
+class ARMSubtarget : public TargetSubtarget {
+protected:
+  enum ARMArchEnum {
+    V4T, V5T, V5TE, V6
+  };
+
+  /// ARMArchVersion - ARM architecture version: V4T (base), V5T, V5TE,
+  /// and V6.
+  ARMArchEnum ARMArchVersion;
+
+  /// HasVFP2 - True if the processor supports Vector Floating Point (VFP) V2
+  /// instructions.
+  bool HasVFP2;
+
+  /// IsThumb - True if we are in thumb mode, false if in ARM mode.
+  bool IsThumb;
+
+  /// UseThumbBacktraces - True if we use thumb style backtraces.
+  bool UseThumbBacktraces;
+
+  /// IsR9Reserved - True if R9 is not available as a general purpose register.
+  bool IsR9Reserved;
+
+  /// stackAlignment - The minimum alignment known to hold for the stack frame
+  /// on entry to the function, and which must be maintained by every function.
+  unsigned stackAlignment;
+
+ public:
+  enum {
+    isELF, isDarwin
+  } TargetType;
+
+  enum {
+    ARM_ABI_APCS,
+    ARM_ABI_AAPCS // ARM EABI
+  } TargetABI;
+
+  /// This constructor initializes the data members to match that
+  /// of the specified module.
+  ///
+  ARMSubtarget(const Module &M, const std::string &FS, bool thumb);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified 
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(const std::string &FS, const std::string &CPU);
+
+  bool hasV4TOps()  const { return ARMArchVersion >= V4T; }
+  bool hasV5TOps()  const { return ARMArchVersion >= V5T; }
+  bool hasV5TEOps() const { return ARMArchVersion >= V5TE; }
+  bool hasV6Ops()   const { return ARMArchVersion >= V6; }
+
+  bool hasVFP2() const { return HasVFP2; }
+  
+  bool isTargetDarwin() const { return TargetType == isDarwin; }
+  bool isTargetELF() const { return TargetType == isELF; }
+
+  bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; }
+  bool isAAPCS_ABI() const { return TargetABI == ARM_ABI_AAPCS; }
+
+  bool isThumb() const { return IsThumb; }
+
+  bool useThumbBacktraces() const { return UseThumbBacktraces; }
+  bool isR9Reserved() const { return IsR9Reserved; }
+
+  /// getStackAlignment - Returns the minimum alignment known to hold for the
+  /// stack frame on entry to the function, and which must be maintained by
+  /// every function for this subtarget.
+  unsigned getStackAlignment() const { return stackAlignment; }
+};
+} // End llvm namespace
+
+#endif  // ARMSUBTARGET_H
diff --git a/lib/Target/ARM/ARMTargetAsmInfo.cpp b/lib/Target/ARM/ARMTargetAsmInfo.cpp
new file mode 100644
index 0000000..1dea1c1
--- /dev/null
+++ b/lib/Target/ARM/ARMTargetAsmInfo.cpp
@@ -0,0 +1,276 @@
+//===-- ARMTargetAsmInfo.cpp - ARM asm properties ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the ARMTargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetAsmInfo.h"
+#include "ARMTargetMachine.h"
+#include <cstring>
+#include <cctype>
+using namespace llvm;
+
+static const char* arm_asm_table[] = {"{r0}", "r0",
+                                      "{r1}", "r1",
+                                      "{r2}", "r2",
+                                      "{r3}", "r3",
+                                      "{r4}", "r4",
+                                      "{r5}", "r5",
+                                      "{r6}", "r6",
+                                      "{r7}", "r7",
+                                      "{r8}", "r8",
+                                      "{r9}", "r9",
+                                      "{r10}", "r10",
+                                      "{r11}", "r11",
+                                      "{r12}", "r12",
+                                      "{r13}", "r13",
+                                      "{r14}", "r14",
+                                      "{lr}", "lr",
+                                      "{sp}", "sp",
+                                      "{ip}", "ip",
+                                      "{fp}", "fp",
+                                      "{sl}", "sl",
+                                      "{memory}", "memory",
+                                      "{cc}", "cc",
+                                      0,0};
+
+ARMTargetAsmInfo::ARMTargetAsmInfo(const ARMTargetMachine &TM) {
+  Subtarget = &TM.getSubtarget<ARMSubtarget>();
+  AsmTransCBE = arm_asm_table;
+  if (Subtarget->isTargetDarwin()) {
+    GlobalPrefix = "_";
+    PrivateGlobalPrefix = "L";
+    BSSSection = 0;                       // no BSS section.
+    ZeroFillDirective = "\t.zerofill\t";  // Uses .zerofill
+    SetDirective = "\t.set";
+    WeakRefDirective = "\t.weak_reference\t";
+    HiddenDirective = "\t.private_extern\t";
+    ProtectedDirective = NULL;
+    JumpTableDataSection = ".const";
+    CStringSection = "\t.cstring";
+    FourByteConstantSection = "\t.literal4\n";
+    EightByteConstantSection = "\t.literal8\n";
+    ReadOnlySection = "\t.const\n";
+    HasDotTypeDotSizeDirective = false;
+    if (TM.getRelocationModel() == Reloc::Static) {
+      StaticCtorsSection = ".constructor";
+      StaticDtorsSection = ".destructor";
+    } else {
+      StaticCtorsSection = ".mod_init_func";
+      StaticDtorsSection = ".mod_term_func";
+    }
+    
+    // In non-PIC modes, emit a special label before jump tables so that the
+    // linker can perform more accurate dead code stripping.
+    if (TM.getRelocationModel() != Reloc::PIC_) {
+      // Emit a local label that is preserved until the linker runs.
+      JumpTableSpecialLabelPrefix = "l";
+    }
+    
+    NeedsSet = true;
+    DwarfAbbrevSection = ".section __DWARF,__debug_abbrev,regular,debug";
+    DwarfInfoSection = ".section __DWARF,__debug_info,regular,debug";
+    DwarfLineSection = ".section __DWARF,__debug_line,regular,debug";
+    DwarfFrameSection = ".section __DWARF,__debug_frame,regular,debug";
+    DwarfPubNamesSection = ".section __DWARF,__debug_pubnames,regular,debug";
+    DwarfPubTypesSection = ".section __DWARF,__debug_pubtypes,regular,debug";
+    DwarfStrSection = ".section __DWARF,__debug_str,regular,debug";
+    DwarfLocSection = ".section __DWARF,__debug_loc,regular,debug";
+    DwarfARangesSection = ".section __DWARF,__debug_aranges,regular,debug";
+    DwarfRangesSection = ".section __DWARF,__debug_ranges,regular,debug";
+    DwarfMacInfoSection = ".section __DWARF,__debug_macinfo,regular,debug";
+  } else {
+    NeedsSet = false;
+    HasLEB128 = true;
+    AbsoluteDebugSectionOffsets = true;
+    ReadOnlySection = "\t.section\t.rodata\n";
+    PrivateGlobalPrefix = ".L";
+    WeakRefDirective = "\t.weak\t";
+    SetDirective = "\t.set\t";
+    DwarfRequiresFrameSection = false;
+    DwarfAbbrevSection =  "\t.section\t.debug_abbrev,\"\",%progbits";
+    DwarfInfoSection =    "\t.section\t.debug_info,\"\",%progbits";
+    DwarfLineSection =    "\t.section\t.debug_line,\"\",%progbits";
+    DwarfFrameSection =   "\t.section\t.debug_frame,\"\",%progbits";
+    DwarfPubNamesSection ="\t.section\t.debug_pubnames,\"\",%progbits";
+    DwarfPubTypesSection ="\t.section\t.debug_pubtypes,\"\",%progbits";
+    DwarfStrSection =     "\t.section\t.debug_str,\"\",%progbits";
+    DwarfLocSection =     "\t.section\t.debug_loc,\"\",%progbits";
+    DwarfARangesSection = "\t.section\t.debug_aranges,\"\",%progbits";
+    DwarfRangesSection =  "\t.section\t.debug_ranges,\"\",%progbits";
+    DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"\",%progbits";
+
+    if (Subtarget->isAAPCS_ABI()) {
+      StaticCtorsSection = "\t.section .init_array,\"aw\",%init_array";
+      StaticDtorsSection = "\t.section .fini_array,\"aw\",%fini_array";
+    } else {
+      StaticCtorsSection = "\t.section .ctors,\"aw\",%progbits";
+      StaticDtorsSection = "\t.section .dtors,\"aw\",%progbits";
+    }
+    TLSDataSection = "\t.section .tdata,\"awT\",%progbits";
+    TLSBSSSection = "\t.section .tbss,\"awT\",%nobits";
+  }
+
+  ZeroDirective = "\t.space\t";
+  AlignmentIsInBytes = false;
+  Data64bitsDirective = 0;
+  CommentString = "@";
+  DataSection = "\t.data";
+  ConstantPoolSection = "\t.text\n";
+  COMMDirectiveTakesAlignment = false;
+  InlineAsmStart = "@ InlineAsm Start";
+  InlineAsmEnd = "@ InlineAsm End";
+  LCOMMDirective = "\t.lcomm\t";
+}
+
+/// Count the number of comma-separated arguments.
+/// Do not try to detect errors.
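+/// For example, " r0, r1, r2" yields 3 and "r0 @ comment" yields 1.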
+unsigned ARMTargetAsmInfo::countArguments(const char* p) const {
+  unsigned count = 0;
+  while (*p && isspace(*p) && *p != '\n')
+    p++;
+  count++;
+  while (*p && *p!='\n' && 
+         strncmp(p, CommentString, strlen(CommentString))!=0) {
+    if (*p==',')
+      count++;
+    p++;
+  }
+  return count;
+}
+
+/// Count the length of a string enclosed in quote characters.
+/// Do not try to detect errors.
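+/// For example, it returns 5 for the input  "hello"  (quotes included).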
+unsigned ARMTargetAsmInfo::countString(const char* p) const {
+  unsigned count = 0;
+  while (*p && isspace(*p) && *p!='\n')
+    p++;
+  if (!*p || *p != '\"')
+    return count;
+  while (*++p && *p != '\"')
+    count++;
+  return count;
+}
+
+/// ARM-specific version of TargetAsmInfo::getInlineAsmLength.
+unsigned ARMTargetAsmInfo::getInlineAsmLength(const char *Str) const {
+  // Count the number of bytes in the asm.
+  bool atInsnStart = true;
+  bool inTextSection = true;
+  unsigned Length = 0;
+  for (; *Str; ++Str) {
+    if (atInsnStart) {
+      // Skip whitespace
+      while (*Str && isspace(*Str) && *Str != '\n')
+        Str++;
+      // Skip label
+      for (const char* p = Str; *p && !isspace(*p); p++)
+        if (*p == ':') {
+          Str = p+1;
+          while (*Str && isspace(*Str) && *Str != '\n')
+            Str++;
+          break;
+        }
+      // Ignore everything from comment char(s) to EOL
+      if (strncmp(Str, CommentString, strlen(CommentString))==0)
+        atInsnStart = false;
+      // FIXME do something like the following for non-Darwin
+      else if (*Str == '.' && Subtarget->isTargetDarwin()) {
+        // Directive.
+        atInsnStart = false;
+        // Some change the section, but don't generate code.
+        if (strncasecmp(Str, ".literal4", strlen(".literal4"))==0 ||
+            strncasecmp(Str, ".literal8", strlen(".literal8"))==0 ||
+            strncasecmp(Str, ".const", strlen(".const"))==0 ||
+            strncasecmp(Str, ".constructor", strlen(".constructor"))==0 ||
+            strncasecmp(Str, ".cstring", strlen(".cstring"))==0 ||
+            strncasecmp(Str, ".data", strlen(".data"))==0 ||
+            strncasecmp(Str, ".destructor", strlen(".destructor"))==0 ||
+            strncasecmp(Str, ".fvmlib_init0", strlen(".fvmlib_init0"))==0 ||
+            strncasecmp(Str, ".fvmlib_init1", strlen(".fvmlib_init1"))==0 ||
+            strncasecmp(Str, ".mod_init_func", strlen(".mod_init_func"))==0 ||
+            strncasecmp(Str, ".mod_term_func", strlen(".mod_term_func"))==0 ||
+            strncasecmp(Str, ".picsymbol_stub", strlen(".picsymbol_stub"))==0 ||
+            strncasecmp(Str, ".symbol_stub", strlen(".symbol_stub"))==0 ||
+            strncasecmp(Str, ".static_data", strlen(".static_data"))==0 ||
+            strncasecmp(Str, ".section", strlen(".section"))==0 ||
+            strncasecmp(Str, ".lazy_symbol_pointer", strlen(".lazy_symbol_pointer"))==0 ||
+            strncasecmp(Str, ".non_lazy_symbol_pointer", strlen(".non_lazy_symbol_pointer"))==0 ||
+            strncasecmp(Str, ".dyld", strlen(".dyld"))==0 ||
+            strncasecmp(Str, ".const_data", strlen(".const_data"))==0 ||
+            strncasecmp(Str, ".objc", strlen(".objc"))==0 ||       //// many directives
+            strncasecmp(Str, ".static_const", strlen(".static_const"))==0)
+          inTextSection=false;
+        else if (strncasecmp(Str, ".text", strlen(".text"))==0)
+          inTextSection = true;
+        // Some can't really be handled without implementing significant pieces
+        // of an assembler.  Others require dynamic adjustment of block sizes in
+        // AdjustBBOffsetsAfter; it's a big compile-time speed hit to check every
+        // instruction in there, and none of these are currently used in the kernel.
+        else if (strncasecmp(Str, ".macro", strlen(".macro"))==0 ||
+                 strncasecmp(Str, ".if", strlen(".if"))==0 ||
+                 strncasecmp(Str, ".align", strlen(".align"))==0 ||
+                 strncasecmp(Str, ".fill", strlen(".fill"))==0 ||
+                 strncasecmp(Str, ".space", strlen(".space"))==0 ||
+                 strncasecmp(Str, ".zerofill", strlen(".zerofill"))==0 ||
+                 strncasecmp(Str, ".p2align", strlen(".p2align"))==0 ||
+                 strncasecmp(Str, ".p2alignw", strlen(".p2alignw"))==0 ||
+                 strncasecmp(Str, ".p2alignl", strlen(".p2alignl"))==0 ||
+                 strncasecmp(Str, ".align32", strlen(".p2align32"))==0 ||
+                 strncasecmp(Str, ".include", strlen(".include"))==0)
+          cerr << "Directive " << Str << " in asm may lead to invalid offsets for" <<
+                   " constant pools (the assembler will tell you if this happens).\n";
+        // Some generate code, but this is only interesting in the text section.
+        else if (inTextSection) {
+          if (strncasecmp(Str, ".long", strlen(".long"))==0)
+            Length += 4*countArguments(Str+strlen(".long"));
+          else if (strncasecmp(Str, ".short", strlen(".short"))==0)
+            Length += 2*countArguments(Str+strlen(".short"));
+          else if (strncasecmp(Str, ".byte", strlen(".byte"))==0)
+            Length += 1*countArguments(Str+strlen(".byte"));
+          else if (strncasecmp(Str, ".single", strlen(".single"))==0)
+            Length += 4*countArguments(Str+strlen(".single"));
+          else if (strncasecmp(Str, ".double", strlen(".double"))==0)
+            Length += 8*countArguments(Str+strlen(".double"));
+          else if (strncasecmp(Str, ".quad", strlen(".quad"))==0)
+            Length += 8*countArguments(Str+strlen(".quad"));
+          else if (strncasecmp(Str, ".ascii", strlen(".ascii"))==0)
+            Length += countString(Str+strlen(".ascii"));
+          else if (strncasecmp(Str, ".asciz", strlen(".asciz"))==0)
+            Length += countString(Str+strlen(".asciz"))+1;
+        }
+      } else if (inTextSection) {
+        // An instruction
+        atInsnStart = false;
+        if (Subtarget->isThumb()) {
+          // BL and BLX <non-reg> are 4 bytes, all others 2.
+          if (strncasecmp(Str, "blx", strlen("blx"))==0) {
+            const char* p = Str+3;
+            while (*p && isspace(*p))
+              p++;
+            if (*p == 'r' || *p=='R')
+              Length += 2;    // BLX reg
+            else
+              Length += 4;    // BLX non-reg
+          } else if (strncasecmp(Str, "bl", strlen("bl"))==0)
+            Length += 4;    // BL
+          else
+            Length += 2;    // Thumb anything else
+        }
+        else
+          Length += 4;    // ARM
+      }
+    }
+    if (*Str == '\n' || *Str == SeparatorChar)
+      atInsnStart = true;
+  }
+  return Length;
+}
diff --git a/lib/Target/ARM/ARMTargetAsmInfo.h b/lib/Target/ARM/ARMTargetAsmInfo.h
new file mode 100644
index 0000000..9dd45e5
--- /dev/null
+++ b/lib/Target/ARM/ARMTargetAsmInfo.h
@@ -0,0 +1,38 @@
+//=====-- ARMTargetAsmInfo.h - ARM asm properties -------------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the ARMTargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMTARGETASMINFO_H
+#define ARMTARGETASMINFO_H
+
+#include "llvm/Target/TargetAsmInfo.h"
+#include "ARMSubtarget.h"
+
+namespace llvm {
+
+  // Forward declaration.
+  class ARMTargetMachine;
+
+  struct ARMTargetAsmInfo : public TargetAsmInfo {
+    ARMTargetAsmInfo(const ARMTargetMachine &TM);
+
+    const ARMSubtarget *Subtarget;
+
+    virtual unsigned getInlineAsmLength(const char *Str) const;
+    unsigned countArguments(const char *p) const;
+    unsigned countString(const char *p) const;
+  };
+
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
new file mode 100644
index 0000000..58b3ab9
--- /dev/null
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -0,0 +1,160 @@
+//===-- ARMTargetMachine.cpp - Define TargetMachine for ARM ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the "Instituto Nokia de Tecnologia" and
+// is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetMachine.h"
+#include "ARMTargetAsmInfo.h"
+#include "ARMFrameInfo.h"
+#include "ARM.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+static cl::opt<bool> DisableLdStOpti("disable-arm-loadstore-opti", cl::Hidden,
+                              cl::desc("Disable load store optimization pass"));
+static cl::opt<bool> EnableIfConversion("enable-arm-if-conversion", cl::Hidden,
+                              cl::desc("Enable if-conversion pass"));
+
+namespace {
+  // Register the target.
+  RegisterTarget<ARMTargetMachine>   X("arm",   "  ARM");
+  RegisterTarget<ThumbTargetMachine> Y("thumb", "  Thumb");
+}
+
+/// ThumbTargetMachine - Create a Thumb architecture model.
+///
+unsigned ThumbTargetMachine::getJITMatchQuality() {
+#if defined(__thumb__)
+  return 10;
+#endif
+  return 0;
+}
+
+unsigned ThumbTargetMachine::getModuleMatchQuality(const Module &M) {
+  std::string TT = M.getTargetTriple();
+  if (TT.size() >= 6 && std::string(TT.begin(), TT.begin()+6) == "thumb-")
+    return 20;
+
+  // If the target triple is something non-thumb, we don't match.
+  if (!TT.empty()) return 0;
+
+  if (M.getEndianness()  == Module::LittleEndian &&
+      M.getPointerSize() == Module::Pointer32)
+    return 10;                                   // Weak match
+  else if (M.getEndianness() != Module::AnyEndianness ||
+           M.getPointerSize() != Module::AnyPointerSize)
+    return 0;                                    // Match for some other target
+
+  return getJITMatchQuality()/2;
+}
+
+ThumbTargetMachine::ThumbTargetMachine(const Module &M, const std::string &FS) 
+  : ARMTargetMachine(M, FS, true) {
+}
+
+/// TargetMachine ctor - Create an ARM architecture model.
+///
+ARMTargetMachine::ARMTargetMachine(const Module &M, const std::string &FS,
+                                   bool isThumb)
+  : Subtarget(M, FS, isThumb),
+    DataLayout(Subtarget.isAPCS_ABI() ?
+               // APCS ABI
+          (isThumb ?
+           std::string("e-p:32:32-f64:32:32-i64:32:32-"
+                       "i16:16:32-i8:8:32-i1:8:32-a:0:32") :
+           std::string("e-p:32:32-f64:32:32-i64:32:32")) :
+               // AAPCS ABI
+          (isThumb ?
+           std::string("e-p:32:32-f64:64:64-i64:64:64-"
+                       "i16:16:32-i8:8:32-i1:8:32-a:0:32") :
+           std::string("e-p:32:32-f64:64:64-i64:64:64"))),
+    InstrInfo(Subtarget),
+    FrameInfo(Subtarget),
+    JITInfo(*this),
+    TLInfo(*this) {}
+
+unsigned ARMTargetMachine::getJITMatchQuality() {
+#if defined(__arm__)
+  return 10;
+#endif
+  return 0;
+}
+
+unsigned ARMTargetMachine::getModuleMatchQuality(const Module &M) {
+  std::string TT = M.getTargetTriple();
+  if (TT.size() >= 4 && std::string(TT.begin(), TT.begin()+4) == "arm-")
+    return 20;
+  // If the target triple is something non-arm, we don't match.
+  if (!TT.empty()) return 0;
+
+  if (M.getEndianness()  == Module::LittleEndian &&
+      M.getPointerSize() == Module::Pointer32)
+    return 10;                                   // Weak match
+  else if (M.getEndianness() != Module::AnyEndianness ||
+           M.getPointerSize() != Module::AnyPointerSize)
+    return 0;                                    // Match for some other target
+
+  return getJITMatchQuality()/2;
+}
+
+
+const TargetAsmInfo *ARMTargetMachine::createTargetAsmInfo() const {
+  return new ARMTargetAsmInfo(*this);
+}
+
+
+// Pass Pipeline Configuration
+bool ARMTargetMachine::addInstSelector(FunctionPassManager &PM, bool Fast) {
+  PM.add(createARMISelDag(*this));
+  return false;
+}
+
+bool ARMTargetMachine::addPreEmitPass(FunctionPassManager &PM, bool Fast) {
+  // FIXME: temporarily disabling load / store optimization pass for Thumb mode.
+  if (!Fast && !DisableLdStOpti && !Subtarget.isThumb())
+    PM.add(createARMLoadStoreOptimizationPass());
+  
+  if (!Fast && EnableIfConversion && !Subtarget.isThumb())
+    PM.add(createIfConverterPass());
+
+  PM.add(createARMConstantIslandPass());
+  return true;
+}
+
+bool ARMTargetMachine::addAssemblyEmitter(FunctionPassManager &PM, bool Fast, 
+                                          std::ostream &Out) {
+  // Output assembly language.
+  PM.add(createARMCodePrinterPass(Out, *this));
+  return false;
+}
+
+
+bool ARMTargetMachine::addCodeEmitter(FunctionPassManager &PM, bool Fast,
+                                      MachineCodeEmitter &MCE) {
+  // FIXME: Move this to TargetJITInfo!
+  setRelocationModel(Reloc::Static);
+
+  // Machine code emitter pass for ARM.
+  PM.add(createARMCodeEmitterPass(*this, MCE));
+  return false;
+}
+
+bool ARMTargetMachine::addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast,
+                                            MachineCodeEmitter &MCE) {
+  // Machine code emitter pass for ARM.
+  PM.add(createARMCodeEmitterPass(*this, MCE));
+  return false;
+}
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
new file mode 100644
index 0000000..183a582
--- /dev/null
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -0,0 +1,81 @@
+//===-- ARMTargetMachine.h - Define TargetMachine for ARM -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the "Instituto Nokia de Tecnologia" and
+// is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMTARGETMACHINE_H
+#define ARMTARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "ARMInstrInfo.h"
+#include "ARMFrameInfo.h"
+#include "ARMJITInfo.h"
+#include "ARMSubtarget.h"
+#include "ARMISelLowering.h"
+
+namespace llvm {
+
+class Module;
+
+class ARMTargetMachine : public LLVMTargetMachine {
+  ARMSubtarget      Subtarget;
+  const TargetData  DataLayout;       // Calculates type size & alignment
+  ARMInstrInfo      InstrInfo;
+  ARMFrameInfo      FrameInfo;
+  ARMJITInfo        JITInfo;
+  ARMTargetLowering TLInfo;
+
+public:
+  ARMTargetMachine(const Module &M, const std::string &FS, bool isThumb = false);
+
+  virtual const ARMInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  virtual const TargetFrameInfo  *getFrameInfo() const { return &FrameInfo; }
+  virtual       TargetJITInfo    *getJITInfo()         { return &JITInfo; }
+  virtual const MRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  virtual const TargetData       *getTargetData() const { return &DataLayout; }
+  virtual const ARMSubtarget  *getSubtargetImpl() const { return &Subtarget; }
+  virtual       ARMTargetLowering *getTargetLowering() const { 
+    return const_cast<ARMTargetLowering*>(&TLInfo); 
+  }
+  static unsigned getModuleMatchQuality(const Module &M);
+  static unsigned getJITMatchQuality();
+
+  virtual const TargetAsmInfo *createTargetAsmInfo() const;
+  
+  // Pass Pipeline Configuration
+  virtual bool addInstSelector(FunctionPassManager &PM, bool Fast);
+  virtual bool addPreEmitPass(FunctionPassManager &PM, bool Fast);
+  virtual bool addAssemblyEmitter(FunctionPassManager &PM, bool Fast, 
+                                  std::ostream &Out);
+  virtual bool addCodeEmitter(FunctionPassManager &PM, bool Fast,
+                              MachineCodeEmitter &MCE);
+  virtual bool addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast,
+                                    MachineCodeEmitter &MCE);
+};
+
+/// ThumbTargetMachine - Thumb target machine.
+///
+class ThumbTargetMachine : public ARMTargetMachine {
+public:
+  ThumbTargetMachine(const Module &M, const std::string &FS);
+
+  static unsigned getJITMatchQuality();
+  static unsigned getModuleMatchQuality(const Module &M);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/Makefile b/lib/Target/ARM/Makefile
new file mode 100644
index 0000000..77300a1
--- /dev/null
+++ b/lib/Target/ARM/Makefile
@@ -0,0 +1,21 @@
+##===- lib/Target/ARM/Makefile -----------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file was developed by the "Instituto Nokia de Tecnologia" and
+# is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMARM
+TARGET = ARM
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = ARMGenRegisterInfo.h.inc ARMGenRegisterNames.inc \
+                ARMGenRegisterInfo.inc ARMGenInstrNames.inc \
+                ARMGenInstrInfo.inc ARMGenAsmWriter.inc \
+                ARMGenDAGISel.inc ARMGenSubtarget.inc
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/README-Thumb.txt b/lib/Target/ARM/README-Thumb.txt
new file mode 100644
index 0000000..380097d
--- /dev/null
+++ b/lib/Target/ARM/README-Thumb.txt
@@ -0,0 +1,223 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the ARM backend (Thumb specific).
+//===---------------------------------------------------------------------===//
+
+* Add support for compiling functions in both ARM and Thumb mode, then taking
+  the smallest.
+* Add support for compiling individual basic blocks in thumb mode, when in a 
+  larger ARM function.  This can be used for presumed cold code, like paths
+  to abort (failure path of asserts), EH handling code, etc.
+
+* Thumb doesn't have normal pre/post increment addressing modes, but you can
+  load/store 32-bit integers with pre/postinc by using load/store multiple
+  instrs with a single register.
+
+* Make better use of high registers r8, r10, r11, r12 (ip). Some variants of add
+  and cmp instructions can use high registers. Also, we can use them as
+  temporaries to spill values into.
+
+* In thumb mode, short, byte, and bool preferred alignments are currently set
+  to 4 to accommodate an ISA restriction (i.e. for add sp, #imm, imm must be a
+  multiple of 4).
+
+//===---------------------------------------------------------------------===//
+
+Potential jumptable improvements:
+
+* If we know function size is less than (1 << 16) * 2 bytes, we can use 16-bit
+  jumptable entries (e.g. (L1 - L2) >> 1). Or even smaller entries if the
+  function is even smaller. This also applies to ARM.
+
+* Thumb jumptable codegen can be improved given some help from the assembler.
+  This is what we generate right now:
+
+	.set PCRELV0, (LJTI1_0_0-(LPCRELL0+4))
+LPCRELL0:
+	mov r1, #PCRELV0
+	add r1, pc
+	ldr r0, [r0, r1]
+	cpy pc, r0 
+	.align	2
+LJTI1_0_0:
+	.long	 LBB1_3
+        ...
+
+Note there is another pc relative add that we can take advantage of.
+     add r1, pc, #imm_8 * 4
+
+We should be able to generate:
+
+LPCRELL0:
+	add r1, LJTI1_0_0
+	ldr r0, [r0, r1]
+	cpy pc, r0 
+	.align	2
+LJTI1_0_0:
+	.long	 LBB1_3
+
+if the assembler can translate the add to:
+       add r1, pc, #((LJTI1_0_0-(LPCRELL0+4))&0xfffffffc)
+
+Note the assembler also does something similar to constpool load:
+LPCRELL0:
+     ldr r0, LCPI1_0
+=>
+     ldr r0, pc, #((LCPI1_0-(LPCRELL0+4))&0xfffffffc)
+
+
+//===---------------------------------------------------------------------===//
+
+We compile the following:
+
+define i16 @func_entry_2E_ce(i32 %i) {
+        switch i32 %i, label %bb12.exitStub [
+                 i32 0, label %bb4.exitStub
+                 i32 1, label %bb9.exitStub
+                 i32 2, label %bb4.exitStub
+                 i32 3, label %bb4.exitStub
+                 i32 7, label %bb9.exitStub
+                 i32 8, label %bb.exitStub
+                 i32 9, label %bb9.exitStub
+        ]
+
+bb12.exitStub:
+        ret i16 0
+
+bb4.exitStub:
+        ret i16 1
+
+bb9.exitStub:
+        ret i16 2
+
+bb.exitStub:
+        ret i16 3
+}
+
+into:
+
+_func_entry_2E_ce:
+        mov r2, #1
+        lsl r2, r0
+        cmp r0, #9
+        bhi LBB1_4      @bb12.exitStub
+LBB1_1: @newFuncRoot
+        mov r1, #13
+        tst r2, r1
+        bne LBB1_5      @bb4.exitStub
+LBB1_2: @newFuncRoot
+        ldr r1, LCPI1_0
+        tst r2, r1
+        bne LBB1_6      @bb9.exitStub
+LBB1_3: @newFuncRoot
+        mov r1, #1
+        lsl r1, r1, #8
+        tst r2, r1
+        bne LBB1_7      @bb.exitStub
+LBB1_4: @bb12.exitStub
+        mov r0, #0
+        bx lr
+LBB1_5: @bb4.exitStub
+        mov r0, #1
+        bx lr
+LBB1_6: @bb9.exitStub
+        mov r0, #2
+        bx lr
+LBB1_7: @bb.exitStub
+        mov r0, #3
+        bx lr
+LBB1_8:
+        .align  2
+LCPI1_0:
+        .long   642
+
+
+gcc compiles to:
+
+	cmp	r0, #9
+	@ lr needed for prologue
+	bhi	L2
+	ldr	r3, L11
+	mov	r2, #1
+	mov	r1, r2, asl r0
+	ands	r0, r3, r2, asl r0
+	movne	r0, #2
+	bxne	lr
+	tst	r1, #13
+	beq	L9
+L3:
+	mov	r0, r2
+	bx	lr
+L9:
+	tst	r1, #256
+	movne	r0, #3
+	bxne	lr
+L2:
+	mov	r0, #0
+	bx	lr
+L12:
+	.align 2
+L11:
+	.long	642
+        
+
+GCC is doing a couple of clever things here:
+  1. It is predicating one of the returns.  This isn't a clear win though: in
+     cases where that return isn't taken, it is replacing one condbranch with
+     two 'ne' predicated instructions.
+  2. It is sinking the shift of "1 << i" into the tst, and using ands instead of
+     tst.  This will probably require whole function isel.
+  3. GCC emits:
+  	tst	r1, #256
+     we emit:
+        mov r1, #1
+        lsl r1, r1, #8
+        tst r2, r1
+  
+
+//===---------------------------------------------------------------------===//
+
+When spilling in thumb mode and the sp offset is too large to fit in the ldr /
+str offset field, we load the offset from a constpool entry and add it to sp:
+
+ldr r2, LCPI
+add r2, sp
+ldr r2, [r2]
+
+These instructions preserve the condition code which is important if the spill
+is between a cmp and a bcc instruction. However, we can use the (potentially)
+cheaper sequence if we know it's ok to clobber the condition register.
+
+add r2, sp, #255 * 4
+add r2, #132
+ldr r2, [r2, #7 * 4]
+
+This is especially bad when dynamic alloca is used: all the fixed size stack
+objects are then referenced off the frame pointer with negative offsets. See
+oggenc for an example.
+
+//===---------------------------------------------------------------------===//
+
+We are reserving R3 as a scratch register under thumb mode. So if it is live
+into the function, we save / restore R3 to / from R12. Until register
+scavenging is done, we should save R3 to a high callee saved reg at
+emitPrologue time (when hasFP is true or stack size is large) and restore R3
+from that register instead. This allows us to at least get rid of the save to
+r12 every time it is used.
+
+//===---------------------------------------------------------------------===//
+
+Poor codegen test/CodeGen/ARM/select.ll f7:
+
+	ldr r5, LCPI1_0
+LPC0:
+	add r5, pc
+	ldr r6, LCPI1_1
+	ldr r2, LCPI1_2
+	cpy r3, r6
+	cpy lr, pc
+	bx r5
+
+//===---------------------------------------------------------------------===//
+
+Make register allocator / spiller smarter so we can re-materialize "mov r, imm",
+etc. Almost all Thumb instructions clobber condition code.
diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt
new file mode 100644
index 0000000..3db8f54
--- /dev/null
+++ b/lib/Target/ARM/README.txt
@@ -0,0 +1,530 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the ARM backend.
+//===---------------------------------------------------------------------===//
+
+Reimplement 'select' in terms of 'SEL'.
+
+* We would really like to support UXTAB16, but we need to prove that the
+  add doesn't need to overflow between the two 16-bit chunks (see the C
+  sketch after this list).
+
+* Implement pre/post increment support.  (e.g. PR935)
+* Coalesce stack slots!
+* Implement smarter constant generation for binops with large immediates.
+
+* Consider materializing FP constants like 0.0f and 1.0f using integer 
+  immediate instructions then copy to FPU.  Slower than load into FPU?
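+
+A minimal C model of the UXTAB16 case above (non-rotated form; an illustrative
+sketch, not backend code): each 16-bit half of the result is the corresponding
+half of the first operand plus a zero-extended byte of the second, with no
+carry between the halves. That is why we must prove the plain 32-bit add never
+overflows from the low chunk into the high one before selecting it.
+
+unsigned uxtab16_model(unsigned a, unsigned b) {
+  unsigned lo = (a & 0xffff)         + (b & 0xff);          /* a[15:0]  + zext(b[7:0])   */
+  unsigned hi = ((a >> 16) & 0xffff) + ((b >> 16) & 0xff);  /* a[31:16] + zext(b[23:16]) */
+  return ((hi & 0xffff) << 16) | (lo & 0xffff);             /* halves never interact     */
+}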
+
+//===---------------------------------------------------------------------===//
+
+Crazy idea:  Consider code that uses lots of 8-bit or 16-bit values.  By the
+time regalloc happens, these values are now in a 32-bit register, usually with
+the top-bits known to be sign or zero extended.  If spilled, we should be able
+to spill these to a 8-bit or 16-bit stack slot, zero or sign extending as part
+of the reload.
+
+Doing this reduces the size of the stack frame (important for thumb etc), and
+also increases the likelihood that we will be able to reload multiple values
+from the stack with a single load.
+
+//===---------------------------------------------------------------------===//
+
+The constant island pass is in good shape.  Some cleanups might be desirable,
+but there is unlikely to be much improvement in the generated code.
+
+1.  There may be some advantage to trying to be smarter about the initial
+placement, rather than putting everything at the end.
+
+2.  There might be some compile-time efficiency to be had by representing
+consecutive islands as a single block rather than multiple blocks.
+
+3.  Use a priority queue to sort constant pool users in inverse order of
+    position so we always process the one closest to the end of the function
+    first. This may simplify CreateNewWater.
+
+//===---------------------------------------------------------------------===//
+
+Eliminate copysign custom expansion. We are still generating crappy code with
+default expansion + if-conversion.
+
+//===---------------------------------------------------------------------===//
+
+Eliminate one instruction from:
+
+define i32 @_Z6slow4bii(i32 %x, i32 %y) {
+        %tmp = icmp sgt i32 %x, %y
+        %retval = select i1 %tmp, i32 %x, i32 %y
+        ret i32 %retval
+}
+
+__Z6slow4bii:
+        cmp r0, r1
+        movgt r1, r0
+        mov r0, r1
+        bx lr
+=>
+
+__Z6slow4bii:
+        cmp r0, r1
+        movle r0, r1
+        bx lr
+
+//===---------------------------------------------------------------------===//
+
+Implement long long "X-3" with instructions that fold the immediate in.  These
+were disabled due to badness with the ARM carry flag on subtracts.
+
+//===---------------------------------------------------------------------===//
+
+We currently compile abs:
+int foo(int p) { return p < 0 ? -p : p; }
+
+into:
+
+_foo:
+        rsb r1, r0, #0
+        cmn r0, #1
+        movgt r1, r0
+        mov r0, r1
+        bx lr
+
+This is very, uh, literal.  This could be a 3 operation sequence:
+  t = (p sra 31); 
+  res = (p xor t)-t
+
+Which would be better.  This occurs in png decode.
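+
+A branchless C version of that 3 operation sequence, as a sketch (it assumes
+the arithmetic right shift of signed ints that gcc provides on ARM, and has
+the usual INT_MIN caveat):
+
+int iabs(int p) {
+  int t = p >> 31;      /* t = p sra 31: 0 if p >= 0, -1 if p < 0 */
+  return (p ^ t) - t;   /* identity when t == 0, negate when t == -1 */
+}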
+
+//===---------------------------------------------------------------------===//
+
+More load / store optimizations:
+1) Look past instructions without side-effects (not load, store, branch, etc.)
+   when forming the list of loads / stores to optimize.
+
+2) Smarter register allocation?
+We are probably missing some opportunities to use ldm / stm. Consider:
+
+ldr r5, [r0]
+ldr r4, [r0, #4]
+
+This cannot be merged into a ldm. Perhaps we will need to do the transformation
+before register allocation. Then teach the register allocator to allocate a
+chunk of consecutive registers.
+
+3) Better representation for block transfer? This is from Olden/power:
+
+	fldd d0, [r4]
+	fstd d0, [r4, #+32]
+	fldd d0, [r4, #+8]
+	fstd d0, [r4, #+40]
+	fldd d0, [r4, #+16]
+	fstd d0, [r4, #+48]
+	fldd d0, [r4, #+24]
+	fstd d0, [r4, #+56]
+
+If we can spare the registers, it would be better to use fldm and fstm here.
+Need major register allocator enhancement though.
+
+4) Can we recognize the relative position of constantpool entries? i.e. Treat
+
+	ldr r0, LCPI17_3
+	ldr r1, LCPI17_4
+	ldr r2, LCPI17_5
+
+   as
+	ldr r0, LCPI17
+	ldr r1, LCPI17+4
+	ldr r2, LCPI17+8
+
+   Then the ldr's can be combined into a single ldm. See Olden/power.
+
+Note that for ARM v4, gcc uses ldmia to load a pair of 32-bit values
+representing a 64-bit double FP constant:
+
+	adr	r0, L6
+	ldmia	r0, {r0-r1}
+
+	.align 2
+L6:
+	.long	-858993459
+	.long	1074318540
+
+5) Can we make use of ldrd and strd? Instead of generating ldm / stm, use
+ldrd/strd if there are only two destination registers that form an
+odd/even pair. However, we would probably pay a penalty if the address is not
+aligned on an 8-byte boundary. This requires more information on load / store
+nodes (and MI's?) than we currently carry.
+
+6) struct copies appear to be done field by field 
+instead of by words, at least sometimes:
+
+struct foo { int x; short s; char c1; char c2; };
+void cpy(struct foo*a, struct foo*b) { *a = *b; }
+
+llvm code (-O2)
+        ldrb r3, [r1, #+6]
+        ldr r2, [r1]
+        ldrb r12, [r1, #+7]
+        ldrh r1, [r1, #+4]
+        str r2, [r0]
+        strh r1, [r0, #+4]
+        strb r3, [r0, #+6]
+        strb r12, [r0, #+7]
+gcc code (-O2)
+        ldmia   r1, {r1-r2}
+        stmia   r0, {r1-r2}
+
+In this benchmark poor handling of aggregate copies has shown up as
+having a large effect on size, and possibly speed as well (we don't have
+a good way to measure on ARM).
+
+//===---------------------------------------------------------------------===//
+
+* Consider this silly example:
+
+double bar(double x) {  
+  double r = foo(3.1);
+  return x+r;
+}
+
+_bar:
+	sub sp, sp, #16
+	str r4, [sp, #+12]
+	str r5, [sp, #+8]
+	str lr, [sp, #+4]
+	mov r4, r0
+	mov r5, r1
+	ldr r0, LCPI2_0
+	bl _foo
+	fmsr f0, r0
+	fcvtsd d0, f0
+	fmdrr d1, r4, r5
+	faddd d0, d0, d1
+	fmrrd r0, r1, d0
+	ldr lr, [sp, #+4]
+	ldr r5, [sp, #+8]
+	ldr r4, [sp, #+12]
+	add sp, sp, #16
+	bx lr
+
+Ignore the prologue and epilogue stuff for a second. Note 
+	mov r4, r0
+	mov r5, r1
+the copies to callee-save registers and the fact that they are only used by the
+fmdrr instruction. It would have been better had the fmdrr been scheduled
+before the call, placing the result in a callee-save DPR register. The two
+mov ops would not have been necessary.
+
+//===---------------------------------------------------------------------===//
+
+Calling convention related stuff:
+
+* gcc's parameter passing implementation is terrible and we suffer as a result:
+
+e.g.
+struct s {
+  double d1;
+  int s1;
+};
+
+void foo(struct s S) {
+  printf("%g, %d\n", S.d1, S.s1);
+}
+
+'S' is passed via registers r0, r1, r2. But gcc stores them to the stack, and
+then reloads them into r1, r2, and r3 before issuing the call (r0 contains the
+address of the format string):
+
+	stmfd	sp!, {r7, lr}
+	add	r7, sp, #0
+	sub	sp, sp, #12
+	stmia	sp, {r0, r1, r2}
+	ldmia	sp, {r1-r2}
+	ldr	r0, L5
+	ldr	r3, [sp, #8]
+L2:
+	add	r0, pc, r0
+	bl	L_printf$stub
+
+Instead of a stmia, ldmia, and a ldr, wouldn't it be better to do three moves?
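+
+Something like the following perhaps (moving in reverse order so nothing is
+clobbered before the format-string address goes into r0):
+
+	mov	r3, r2
+	mov	r2, r1
+	mov	r1, r0
+	ldr	r0, L5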
+
+* Returning an aggregate type is even worse:
+
+e.g.
+struct s foo(void) {
+  struct s S = {1.1, 2};
+  return S;
+}
+
+	mov	ip, r0
+	ldr	r0, L5
+	sub	sp, sp, #12
+L2:
+	add	r0, pc, r0
+	@ lr needed for prologue
+	ldmia	r0, {r0, r1, r2}
+	stmia	sp, {r0, r1, r2}
+	stmia	ip, {r0, r1, r2}
+	mov	r0, ip
+	add	sp, sp, #12
+	bx	lr
+
+r0 (and later ip) is the hidden parameter from the caller telling us where to
+store the value. The first ldmia loads the constants into r0, r1, r2. The last
+stmia stores r0, r1, r2 into the address passed in. However, there is one
+additional stmia that stores r0, r1, and r2 to some stack location; that store
+is dead.
+
+The llvm-gcc generated code looks like this:
+
+csretcc void %foo(%struct.s* %agg.result) {
+entry:
+	%S = alloca %struct.s, align 4		; <%struct.s*> [#uses=1]
+	%memtmp = alloca %struct.s		; <%struct.s*> [#uses=1]
+	cast %struct.s* %S to sbyte*		; <sbyte*>:0 [#uses=2]
+	call void %llvm.memcpy.i32( sbyte* %0, sbyte* cast ({ double, int }* %C.0.904 to sbyte*), uint 12, uint 4 )
+	cast %struct.s* %agg.result to sbyte*		; <sbyte*>:1 [#uses=2]
+	call void %llvm.memcpy.i32( sbyte* %1, sbyte* %0, uint 12, uint 0 )
+	cast %struct.s* %memtmp to sbyte*		; <sbyte*>:2 [#uses=1]
+	call void %llvm.memcpy.i32( sbyte* %2, sbyte* %1, uint 12, uint 0 )
+	ret void
+}
+
+llc ends up issuing two memcpy's (the first memcpy becomes 3 loads from the
+constantpool). Perhaps we should 1) fix llvm-gcc so the memcpy is translated
+into a number of loads and stores, or 2) custom lower small memcpys to
+ldmia / stmia. I think option 2 is better, but the current register
+allocator cannot allocate a chunk of registers at a time.
+
+A feasible temporary solution is to use specific physical registers at
+lowering time for small (<= 4 words?) transfer sizes.
+
+* ARM CSRet calling convention requires the hidden argument to be returned by
+the callee.
+
+//===---------------------------------------------------------------------===//
+
+We can definitely do a better job on BB placements to eliminate some branches.
+It's very common to see llvm generated assembly code that looks like this:
+
+LBB3:
+ ...
+LBB4:
+...
+  beq LBB3
+  b LBB2
+
+If LBB4 is the only predecessor of LBB3, then we can emit LBB3 right after
+LBB4. We can then eliminate the beq and turn the unconditional branch to LBB2
+into a bne.
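+
+For the fragment above, the layout after the transform would look roughly like:
+
+LBB4:
+...
+  bne LBB2
+LBB3:
+ ...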
+
+See McCat/18-imp/ComputeBoundingBoxes for an example.
+
+//===---------------------------------------------------------------------===//
+
+Register scavenging is now implemented.  The example in the previous version
+of this document produces optimal code at -O2.
+
+//===---------------------------------------------------------------------===//
+
+Pre-/post- indexed load / stores:
+
+1) We should not perform the pre/post-indexed load/store transform if the base
+ptr is guaranteed to be live beyond the load/store. This can happen if the base
+ptr is live out of the block in which we are performing the optimization. e.g.
+
+mov r1, r2
+ldr r3, [r1], #4
+...
+
+vs.
+
+ldr r3, [r2]
+add r1, r2, #4
+...
+
+In most cases, this is just a wasted optimization. However, sometimes it can
+negatively impact performance because two-address code is more restrictive
+when it comes to scheduling.
+
+Unfortunately, liveout information is currently unavailable during DAG combine
+time.
+
+2) Consider splitting an indexed load / store into a pair of add/sub + load/store
+   to solve #1 (in TwoAddressInstructionPass.cpp).
+
+3) Enhance LSR to generate more opportunities for indexed ops.
+
+4) Once we have added support for multiple result patterns, write indexed load
+   patterns instead of C++ instruction selection code.
+
+5) Use FLDM / FSTM to emulate indexed FP load / store.
+
+//===---------------------------------------------------------------------===//
+
+We should add i64 support to take advantage of the 64-bit load / stores.
+We can add a pseudo i64 register class containing pseudo registers that are
+register pairs. All other ops (e.g. add, sub) would be expanded as usual.
+
+We need to add pseudo instructions (i.e. gethi / getlo) to extract i32 registers
+from the i64 register. These are single moves which can be eliminated if the
+destination register is a sub-register of the source. We should implement proper
+subreg support in the register allocator to coalesce these away.
+
+There are other minor issues such as multiple instructions for a spill / restore
+/ move.
+
+//===---------------------------------------------------------------------===//
+
+Implement support for some more tricky ways to materialize immediates.  For
+example, to get 0xffff8000, we can use:
+
+mov r9, #&3f8000
+sub r9, r9, #&400000
+
+//===---------------------------------------------------------------------===//
+
+We sometimes generate multiple add / sub instructions to update sp in prologue
+and epilogue if the inc / dec value is too large to fit in a single immediate
+operand. In some cases, it might be better to load the value from a
+constantpool entry instead.
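+
+For example, a 4100-byte frame cannot be encoded in one immediate, so we emit
+something like:
+
+	sub	sp, sp, #4096
+	sub	sp, sp, #4
+
+whereas a single constantpool load could feed one sub (LCPI_FrameSize is a
+hypothetical label):
+
+	ldr	r12, LCPI_FrameSize
+	sub	sp, sp, r12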
+
+//===---------------------------------------------------------------------===//
+
+GCC generates significantly better code for this function.
+
+int foo(int StackPtr, unsigned char *Line, unsigned char *Stack, int LineLen) {
+    int i = 0;
+
+    if (StackPtr != 0) {
+       while (StackPtr != 0 && i < (((LineLen) < (32768))? (LineLen) : (32768)))
+          Line[i++] = Stack[--StackPtr];
+        if (LineLen > 32768)
+        {
+            while (StackPtr != 0 && i < LineLen)
+            {
+                i++;
+                --StackPtr;
+            }
+        }
+    }
+    return StackPtr;
+}
+
+//===---------------------------------------------------------------------===//
+
+This should compile to the mlas instruction:
+int mlas(int x, int y, int z) { return ((x * y + z) < 0) ? 7 : 13; }
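+
+One plausible sequence (an untested sketch; mla computes Rm * Rs + Rn and the
+"s" form sets the condition flags):
+
+_mlas:
+        mlas r3, r0, r1, r2     @ r3 = x*y + z, sets N
+        mov r0, #13
+        movmi r0, #7
+        bx lr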
+
+//===---------------------------------------------------------------------===//
+
+At some point, we should triage these to see if they still apply to us:
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19598
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18560
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=27016
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11831
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11826
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11825
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11824
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11823
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11820
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10982
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10242
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9831
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9760
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9759
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9703
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9702
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9663
+
+http://www.inf.u-szeged.hu/gcc-arm/
+http://citeseer.ist.psu.edu/debus04linktime.html
+
+//===---------------------------------------------------------------------===//
+
+gcc generates smaller code for this function at -O2 or -Os:
+
+void foo(signed char* p) {
+  if (*p == 3)
+     bar();
+   else if (*p == 4)
+    baz();
+  else if (*p == 5)
+    quux();
+}
+
+llvm decides it's a good idea to turn the repeated if...else into a
+binary tree, as if it were a switch; the resulting code needs one fewer
+compare-and-branch when *p<=2 or *p==5, the same number when *p==4
+or *p>6, and one more when *p==3.  So it should be a speed win
+(on balance).  However, the revised code is larger, with 4 conditional
+branches instead of 3.
+
+More seriously, there is a byte->word extend before
+each comparison, where there should be only one, and the condition codes
+are not remembered when the same two values are compared twice.
+
+//===---------------------------------------------------------------------===//
+
+More register scavenging work:
+
+1. Use the register scavenger to track frame indices materialized into registers
+   (those that do not fit in addressing modes) to allow reuse in the same BB.
+2. Finish scavenging for Thumb.
+3. We know some spills and restores are unnecessary. The issue is that once
+   live intervals are merged, they are never split. So every def is spilled
+   and every use requires a restore if the register allocator decides the
+   resulting live interval is not assigned a physical register. It may be
+   possible (with the help of the scavenger) to turn some spill / restore
+   pairs into register copies.
+
+//===---------------------------------------------------------------------===//
+
+More LSR enhancements possible:
+
+1. Teach LSR about pre- and post-indexed ops to allow the iv increment to be
+   merged into a load / store.
+2. Allow iv reuse even when a type conversion is required. For example, i8
+   and i32 load / store addressing modes are identical.
+
+//===---------------------------------------------------------------------===//
+
+This:
+
+int foo(int a, int b, int c, int d) {
+  long long acc = (long long)a * (long long)b;
+  acc += (long long)c * (long long)d;
+  return (int)(acc >> 32);
+}
+
+Should compile to use SMLAL (Signed Multiply Accumulate Long) which multiplies 
+two signed 32-bit values to produce a 64-bit value, and accumulates this with 
+a 64-bit value.
+
+We currently get this with v6:
+
+_foo:
+        mul r12, r1, r0
+        smmul r1, r1, r0
+        smmul r0, r3, r2
+        mul r3, r3, r2
+        adds r3, r3, r12
+        adc r0, r0, r1
+        bx lr
+
+and this with v4:
+
+_foo:
+        stmfd sp!, {r7, lr}
+        mov r7, sp
+        mul r12, r1, r0
+        smull r0, r1, r1, r0
+        smull lr, r0, r3, r2
+        mul r3, r3, r2
+        adds r3, r3, r12
+        adc r0, r0, r1
+        ldmfd sp!, {r7, pc}
+
+This apparently occurs in real code.
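+
+A sketch of the SMLAL form we would like (untested; smull/smlal operands are
+RdLo, RdHi, Rm, Rs, with r12 used as a scratch low half):
+
+_foo:
+        smull r12, r0, r1, r0   @ {r0,r12} = (long long)a * (long long)b
+        smlal r12, r0, r3, r2   @ {r0,r12} += (long long)c * (long long)d
+        bx lr                   @ the high word (acc >> 32) is already in r0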
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/Alpha/Alpha.h b/lib/Target/Alpha/Alpha.h
new file mode 100644
index 0000000..a1acde4
--- /dev/null
+++ b/lib/Target/Alpha/Alpha.h
@@ -0,0 +1,48 @@
+//===-- Alpha.h - Top-level interface for Alpha representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Alpha back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_ALPHA_H
+#define TARGET_ALPHA_H
+
+#include <iosfwd>
+
+namespace llvm {
+
+  class AlphaTargetMachine;
+  class FunctionPass;
+  class TargetMachine;
+  class MachineCodeEmitter;
+
+  FunctionPass *createAlphaSimpleInstructionSelector(TargetMachine &TM);
+  FunctionPass *createAlphaISelDag(TargetMachine &TM);
+  FunctionPass *createAlphaCodePrinterPass(std::ostream &OS,
+                                             TargetMachine &TM);
+  FunctionPass *createAlphaPatternInstructionSelector(TargetMachine &TM);
+  FunctionPass *createAlphaCodeEmitterPass(AlphaTargetMachine &TM,
+                                           MachineCodeEmitter &MCE);
+  FunctionPass *createAlphaLLRPPass(AlphaTargetMachine &tm);
+  FunctionPass *createAlphaBranchSelectionPass();
+
+} // end namespace llvm;
+
+// Defines symbolic names for Alpha registers.  This defines a mapping from
+// register name to register number.
+//
+#include "AlphaGenRegisterNames.inc"
+
+// Defines symbolic names for the Alpha instructions.
+//
+#include "AlphaGenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/Alpha/Alpha.td b/lib/Target/Alpha/Alpha.td
new file mode 100644
index 0000000..fbf7ed9
--- /dev/null
+++ b/lib/Target/Alpha/Alpha.td
@@ -0,0 +1,66 @@
+//===- Alpha.td - Describe the Alpha Target Machine --------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "../Target.td"
+
+//Alpha is little endian
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features
+//===----------------------------------------------------------------------===//
+
+def FeatureCIX : SubtargetFeature<"cix", "HasCT", "true",
+                                  "Enable CIX extentions">;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "AlphaRegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Schedule Description
+//===----------------------------------------------------------------------===//
+
+include "AlphaSchedule.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "AlphaInstrInfo.td"
+
+def AlphaInstrInfo : InstrInfo {
+  // Define how we want to layout our target-specific information field.
+ // let TSFlagsFields = [];
+ // let TSFlagsShifts = [];
+}
+
+//===----------------------------------------------------------------------===//
+// Alpha Processor Definitions
+//===----------------------------------------------------------------------===//
+
+def : Processor<"generic", Alpha21264Itineraries, []>;
+def : Processor<"ev6"    , Alpha21264Itineraries, []>;
+def : Processor<"ev67"   , Alpha21264Itineraries, [FeatureCIX]>;
+
+//===----------------------------------------------------------------------===//
+// The Alpha Target
+//===----------------------------------------------------------------------===//
+
+
+def Alpha : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = AlphaInstrInfo;
+}
diff --git a/lib/Target/Alpha/AlphaAsmPrinter.cpp b/lib/Target/Alpha/AlphaAsmPrinter.cpp
new file mode 100644
index 0000000..0494777
--- /dev/null
+++ b/lib/Target/Alpha/AlphaAsmPrinter.cpp
@@ -0,0 +1,297 @@
+//===-- AlphaAsmPrinter.cpp - Alpha LLVM assembly writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format Alpha assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "Alpha.h"
+#include "AlphaInstrInfo.h"
+#include "AlphaTargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+namespace {
+  struct VISIBILITY_HIDDEN AlphaAsmPrinter : public AsmPrinter {
+
+    /// Unique incrementer for label values for referencing Global values.
+    ///
+
+    AlphaAsmPrinter(std::ostream &o, TargetMachine &tm, const TargetAsmInfo *T)
+      : AsmPrinter(o, tm, T) {
+    }
+
+    virtual const char *getPassName() const {
+      return "Alpha Assembly Printer";
+    }
+    bool printInstruction(const MachineInstr *MI);
+    void printOp(const MachineOperand &MO, bool IsCallOp = false);
+    void printOperand(const MachineInstr *MI, int opNum);
+    void printBaseOffsetPair (const MachineInstr *MI, int i, bool brackets=true);
+    bool runOnMachineFunction(MachineFunction &F);
+    bool doInitialization(Module &M);
+    bool doFinalization(Module &M);
+    
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode);
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, 
+                               unsigned OpNo,
+                               unsigned AsmVariant, 
+                               const char *ExtraCode);
+  };
+} // end of anonymous namespace
+
+/// createAlphaCodePrinterPass - Returns a pass that prints the Alpha
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description.  This should work
+/// regardless of whether the function is in SSA form.
+///
+FunctionPass *llvm::createAlphaCodePrinterPass(std::ostream &o,
+                                               TargetMachine &tm) {
+  return new AlphaAsmPrinter(o, tm, tm.getTargetAsmInfo());
+}
+
+#include "AlphaGenAsmWriter.inc"
+
+void AlphaAsmPrinter::printOperand(const MachineInstr *MI, int opNum)
+{
+  const MachineOperand &MO = MI->getOperand(opNum);
+  if (MO.getType() == MachineOperand::MO_Register) {
+    assert(MRegisterInfo::isPhysicalRegister(MO.getReg())&&"Not physreg??");
+    O << TM.getRegisterInfo()->get(MO.getReg()).Name;
+  } else if (MO.isImmediate()) {
+    O << MO.getImmedValue();
+    assert(MO.getImmedValue() < (1 << 30));
+  } else {
+    printOp(MO);
+  }
+}
+
+
+void AlphaAsmPrinter::printOp(const MachineOperand &MO, bool IsCallOp) {
+  const MRegisterInfo &RI = *TM.getRegisterInfo();
+
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    O << RI.get(MO.getReg()).Name;
+    return;
+
+  case MachineOperand::MO_Immediate:
+    cerr << "printOp() does not handle immediate values\n";
+    abort();
+    return;
+
+  case MachineOperand::MO_MachineBasicBlock:
+    printBasicBlockLabel(MO.getMachineBasicBlock());
+    return;
+
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+      << MO.getConstantPoolIndex();
+    return;
+
+  case MachineOperand::MO_ExternalSymbol:
+    O << MO.getSymbolName();
+    return;
+
+  case MachineOperand::MO_GlobalAddress: {
+    GlobalValue *GV = MO.getGlobal();
+    O << Mang->getValueName(GV);
+    if (GV->isDeclaration() && GV->hasExternalWeakLinkage())
+      ExtWeakSymbols.insert(GV);
+    return;
+  }
+
+  case MachineOperand::MO_JumpTableIndex:
+    O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+      << '_' << MO.getJumpTableIndex();
+    return;
+
+  default:
+    O << "<unknown operand type: " << MO.getType() << ">";
+    return;
+  }
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool AlphaAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  SetupMachineFunction(MF);
+  O << "\n\n";
+
+  // Print out constants referenced by the function
+  EmitConstantPool(MF.getConstantPool());
+
+  // Print out jump tables referenced by the function
+  EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+  // Print out labels for the function.
+  const Function *F = MF.getFunction();
+  SwitchToTextSection(getSectionForFunction(*F).c_str(), F);
+  
+  EmitAlignment(4, F);
+  switch (F->getLinkage()) {
+  default: assert(0 && "Unknown linkage type!");
+  case Function::InternalLinkage:  // Symbols default to internal.
+    break;
+   case Function::ExternalLinkage:
+     O << "\t.globl " << CurrentFnName << "\n";
+     break;
+  case Function::WeakLinkage:
+  case Function::LinkOnceLinkage:
+    O << TAI->getWeakRefDirective() << CurrentFnName << "\n";
+    break;
+  }
+
+  O << "\t.ent " << CurrentFnName << "\n";
+
+  O << CurrentFnName << ":\n";
+
+  // Print out code for the function.
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    if (I != MF.begin()) {
+      printBasicBlockLabel(I, true);
+      O << '\n';
+    }
+    for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+         II != E; ++II) {
+      // Print the assembly for the instruction.
+      ++EmittedInsts;
+      O << "\t";
+      if (!printInstruction(II)) {
+        assert(0 && "Unhandled instruction in asm writer!");
+        abort();
+      }
+    }
+  }
+
+  O << "\t.end " << CurrentFnName << "\n";
+
+  // We didn't modify anything.
+  return false;
+}
+
+bool AlphaAsmPrinter::doInitialization(Module &M)
+{
+  if(TM.getSubtarget<AlphaSubtarget>().hasCT())
+    O << "\t.arch ev6\n"; //This might need to be ev67, so leave this test here
+  else
+    O << "\t.arch ev6\n";
+  O << "\t.set noat\n";
+  AsmPrinter::doInitialization(M);
+  return false;
+}
+
+bool AlphaAsmPrinter::doFinalization(Module &M) {
+  const TargetData *TD = TM.getTargetData();
+
+  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) {
+
+    if (!I->hasInitializer()) continue;  // External globals require no code
+    
+    // Check to see if this is a special global used by LLVM, if so, emit it.
+    if (EmitSpecialLLVMGlobal(I))
+      continue;
+    
+    std::string name = Mang->getValueName(I);
+    Constant *C = I->getInitializer();
+    unsigned Size = TD->getTypeSize(C->getType());
+    unsigned Align = TD->getPreferredAlignmentLog(I);
+    
+    //1: hidden?
+    if (I->hasHiddenVisibility())
+      O << TAI->getHiddenDirective() << name << "\n";
+    
+    //2: kind
+    switch (I->getLinkage()) {
+    case GlobalValue::LinkOnceLinkage:
+    case GlobalValue::WeakLinkage:
+      O << TAI->getWeakRefDirective() << name << '\n';
+      break;
+    case GlobalValue::AppendingLinkage:
+    case GlobalValue::ExternalLinkage:
+      O << "\t.globl " << name << "\n";
+      break;
+    case GlobalValue::InternalLinkage:
+      break;
+    default:
+      assert(0 && "Unknown linkage type!");
+      cerr << "Unknown linkage type!\n";
+      abort();
+    }
+    
+    //3: Section (if changed)
+    if (I->hasSection() &&
+        (I->getSection() == ".ctors" ||
+         I->getSection() == ".dtors")) {
+      std::string SectionName = ".section\t" + I->getSection()
+        + ",\"aw\",@progbits";
+      SwitchToDataSection(SectionName.c_str());
+    } else {
+      if (C->isNullValue())
+        SwitchToDataSection("\t.section\t.bss", I);
+      else
+        SwitchToDataSection("\t.section\t.data", I);
+    }
+    
+    //4: Type, Size, Align
+    O << "\t.type\t" << name << ", @object\n";
+    O << "\t.size\t" << name << ", " << Size << "\n";
+    EmitAlignment(Align, I);
+    
+    O << name << ":\n";
+    
+    // If the initializer is an extern weak symbol, remember to emit the weak
+    // reference!
+    if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+      if (GV->hasExternalWeakLinkage())
+        ExtWeakSymbols.insert(GV);
+    
+    EmitGlobalConstant(C);
+    O << '\n';
+  }
+
+  AsmPrinter::doFinalization(M);
+  return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool AlphaAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned AsmVariant, 
+                                      const char *ExtraCode) {
+  printOperand(MI, OpNo);
+  return false;
+}
+
+bool AlphaAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, 
+                                            unsigned OpNo,
+                                            unsigned AsmVariant, 
+                                            const char *ExtraCode) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+  O << "0(";
+  printOperand(MI, OpNo);
+  O << ")";
+  return false;
+}
diff --git a/lib/Target/Alpha/AlphaBranchSelector.cpp b/lib/Target/Alpha/AlphaBranchSelector.cpp
new file mode 100644
index 0000000..ac789b3
--- /dev/null
+++ b/lib/Target/Alpha/AlphaBranchSelector.cpp
@@ -0,0 +1,67 @@
+//===-- AlphaBranchSelector.cpp - Convert Pseudo branches ---------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Andrew Lenharth and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Replace Pseudo COND_BRANCH_* with their appropriate real branches.
+// This is a simplified version of the PPC Branch Selector.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Alpha.h"
+#include "AlphaInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetAsmInfo.h"
+using namespace llvm;
+
+namespace {
+  struct VISIBILITY_HIDDEN AlphaBSel : public MachineFunctionPass {
+    static char ID;
+    AlphaBSel() : MachineFunctionPass((intptr_t)&ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "Alpha Branch Selection";
+    }
+  };
+  char AlphaBSel::ID = 0;
+}
+
+/// createAlphaBranchSelectionPass - returns an instance of the Branch Selection
+/// Pass
+///
+FunctionPass *llvm::createAlphaBranchSelectionPass() {
+  return new AlphaBSel();
+}
+
+bool AlphaBSel::runOnMachineFunction(MachineFunction &Fn) {
+
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock *MBB = MFI;
+    
+    for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end();
+         MBBI != EE; ++MBBI) {
+      if (MBBI->getOpcode() == Alpha::COND_BRANCH_I ||
+          MBBI->getOpcode() == Alpha::COND_BRANCH_F) {
+        
+        // condbranch operands:
+        // 0. bc opcode
+        // 1. reg
+        // 2. target MBB
+        const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+        MBBI->setInstrDescriptor(TII->get(MBBI->getOperand(0).getImm()));
+      }
+    }
+  }
+  
+  return true;
+}
+
diff --git a/lib/Target/Alpha/AlphaCodeEmitter.cpp b/lib/Target/Alpha/AlphaCodeEmitter.cpp
new file mode 100644
index 0000000..3549551
--- /dev/null
+++ b/lib/Target/Alpha/AlphaCodeEmitter.cpp
@@ -0,0 +1,222 @@
+//===-- Alpha/AlphaCodeEmitter.cpp - Convert Alpha code to machine code ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the Alpha machine instructions
+// into relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "alpha-emitter"
+#include "AlphaTargetMachine.h"
+#include "AlphaRelocations.h"
+#include "Alpha.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+namespace {
+  class AlphaCodeEmitter : public MachineFunctionPass {
+    const AlphaInstrInfo  *II;
+    TargetMachine &TM;
+    MachineCodeEmitter  &MCE;
+
+    /// getMachineOpValue - evaluates the MachineOperand of a given MachineInstr
+    ///
+    int getMachineOpValue(MachineInstr &MI, MachineOperand &MO);
+
+  public:
+    static char ID;
+    explicit AlphaCodeEmitter(TargetMachine &tm, MachineCodeEmitter &mce)
+      : MachineFunctionPass((intptr_t)&ID), II(0), TM(tm), MCE(mce) {}
+    AlphaCodeEmitter(TargetMachine &tm, MachineCodeEmitter &mce,
+                     const AlphaInstrInfo& ii)
+      : MachineFunctionPass((intptr_t)&ID), II(&ii), TM(tm), MCE(mce) {}
+
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "Alpha Machine Code Emitter";
+    }
+
+    void emitInstruction(const MachineInstr &MI);
+
+    /// getBinaryCodeForInstr - This function, generated by the
+    /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+    /// machine instructions.
+    ///
+    unsigned getBinaryCodeForInstr(MachineInstr &MI);
+
+  private:
+    void emitBasicBlock(MachineBasicBlock &MBB);
+
+  };
+  char AlphaCodeEmitter::ID = 0;
+}
+
+/// createAlphaCodeEmitterPass - Return a pass that emits the collected Alpha code
+/// to the specified MCE object.
+FunctionPass *llvm::createAlphaCodeEmitterPass(AlphaTargetMachine &TM,
+                                               MachineCodeEmitter &MCE) {
+  return new AlphaCodeEmitter(TM, MCE);
+}
+
+bool AlphaCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+  II = ((AlphaTargetMachine&)MF.getTarget()).getInstrInfo();
+
+  do {
+    MCE.startFunction(MF);
+    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+      emitBasicBlock(*I);
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+void AlphaCodeEmitter::emitBasicBlock(MachineBasicBlock &MBB) {
+  MCE.StartMachineBasicBlock(&MBB);
+  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+       I != E; ++I) {
+    MachineInstr &MI = *I;
+    switch(MI.getOpcode()) {
+    default:
+      MCE.emitWordLE(getBinaryCodeForInstr(*I));
+      break;
+    case Alpha::ALTENT:
+    case Alpha::PCLABEL:
+    case Alpha::MEMLABEL:
+    case Alpha::IDEF_I:
+    case Alpha::IDEF_F32:
+    case Alpha::IDEF_F64:
+      break; //skip these
+    }
+  }
+}
+
+static unsigned getAlphaRegNumber(unsigned Reg) {
+  switch (Reg) {
+  case Alpha::R0  : case Alpha::F0  : return 0;
+  case Alpha::R1  : case Alpha::F1  : return 1;
+  case Alpha::R2  : case Alpha::F2  : return 2;
+  case Alpha::R3  : case Alpha::F3  : return 3;
+  case Alpha::R4  : case Alpha::F4  : return 4;
+  case Alpha::R5  : case Alpha::F5  : return 5;
+  case Alpha::R6  : case Alpha::F6  : return 6;
+  case Alpha::R7  : case Alpha::F7  : return 7;
+  case Alpha::R8  : case Alpha::F8  : return 8;
+  case Alpha::R9  : case Alpha::F9  : return 9;
+  case Alpha::R10 : case Alpha::F10 : return 10;
+  case Alpha::R11 : case Alpha::F11 : return 11;
+  case Alpha::R12 : case Alpha::F12 : return 12;
+  case Alpha::R13 : case Alpha::F13 : return 13;
+  case Alpha::R14 : case Alpha::F14 : return 14;
+  case Alpha::R15 : case Alpha::F15 : return 15;
+  case Alpha::R16 : case Alpha::F16 : return 16;
+  case Alpha::R17 : case Alpha::F17 : return 17;
+  case Alpha::R18 : case Alpha::F18 : return 18;
+  case Alpha::R19 : case Alpha::F19 : return 19;
+  case Alpha::R20 : case Alpha::F20 : return 20;
+  case Alpha::R21 : case Alpha::F21 : return 21;
+  case Alpha::R22 : case Alpha::F22 : return 22;
+  case Alpha::R23 : case Alpha::F23 : return 23;
+  case Alpha::R24 : case Alpha::F24 : return 24;
+  case Alpha::R25 : case Alpha::F25 : return 25;
+  case Alpha::R26 : case Alpha::F26 : return 26;
+  case Alpha::R27 : case Alpha::F27 : return 27;
+  case Alpha::R28 : case Alpha::F28 : return 28;
+  case Alpha::R29 : case Alpha::F29 : return 29;
+  case Alpha::R30 : case Alpha::F30 : return 30;
+  case Alpha::R31 : case Alpha::F31 : return 31;
+  default:
+    assert(0 && "Unhandled reg");
+    abort();
+  }
+}
+
+int AlphaCodeEmitter::getMachineOpValue(MachineInstr &MI, MachineOperand &MO) {
+
+  int rv = 0; // Return value; defaults to 0 for unhandled cases
+              // or things that get fixed up later by the JIT.
+
+  if (MO.isRegister()) {
+    rv = getAlphaRegNumber(MO.getReg());
+  } else if (MO.isImmediate()) {
+    rv = MO.getImmedValue();
+  } else if (MO.isGlobalAddress() || MO.isExternalSymbol()
+             || MO.isConstantPoolIndex()) {
+    DOUT << MO << " is a relocated op for " << MI << "\n";
+    unsigned Reloc = 0;
+    int Offset = 0;
+    bool useGOT = false;
+    switch (MI.getOpcode()) {
+    case Alpha::BSR:
+      Reloc = Alpha::reloc_bsr;
+      break;
+    case Alpha::LDLr:
+    case Alpha::LDQr:
+    case Alpha::LDBUr:
+    case Alpha::LDWUr:
+    case Alpha::LDSr:
+    case Alpha::LDTr:
+    case Alpha::LDAr:
+    case Alpha::STQr:
+    case Alpha::STLr:
+    case Alpha::STWr:
+    case Alpha::STBr:
+    case Alpha::STSr:
+    case Alpha::STTr:
+      Reloc = Alpha::reloc_gprellow;
+      break;
+    case Alpha::LDAHr:
+      Reloc = Alpha::reloc_gprelhigh;
+      break;
+    case Alpha::LDQl:
+      Reloc = Alpha::reloc_literal;
+      useGOT = true;
+      break;
+    case Alpha::LDAg:
+    case Alpha::LDAHg:
+      Reloc = Alpha::reloc_gpdist;
+      Offset = MI.getOperand(3).getImmedValue();
+      break;
+    default:
+      assert(0 && "unknown relocatable instruction");
+      abort();
+    }
+    if (MO.isGlobalAddress())
+      MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(),
+                                          Reloc, MO.getGlobal(), Offset,
+                                          false, useGOT));
+    else if (MO.isExternalSymbol())
+      MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                          Reloc, MO.getSymbolName(), Offset,
+                                          true));
+    else
+    MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                          Reloc, MO.getConstantPoolIndex(),
+                                          Offset));
+  } else if (MO.isMachineBasicBlock()) {
+    MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                               Alpha::reloc_bsr,
+                                               MO.getMachineBasicBlock()));
+  }else {
+    cerr << "ERROR: Unknown type of MachineOperand: " << MO << "\n";
+    abort();
+  }
+
+  return rv;
+}
+
+
+#include "AlphaGenCodeEmitter.inc"
+
diff --git a/lib/Target/Alpha/AlphaISelDAGToDAG.cpp b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp
new file mode 100644
index 0000000..4f7533c
--- /dev/null
+++ b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp
@@ -0,0 +1,563 @@
+//===-- AlphaISelDAGToDAG.cpp - Alpha pattern matching inst selector ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Andrew Lenharth and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for Alpha,
+// converting from a legalized dag to an Alpha dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Alpha.h"
+#include "AlphaTargetMachine.h"
+#include "AlphaISelLowering.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <queue>
+#include <set>
+using namespace llvm;
+
+namespace {
+
+  //===--------------------------------------------------------------------===//
+  /// AlphaDAGToDAGISel - Alpha specific code to select Alpha machine
+  /// instructions for SelectionDAG operations.
+  class AlphaDAGToDAGISel : public SelectionDAGISel {
+    AlphaTargetLowering AlphaLowering;
+
+    static const int64_t IMM_LOW  = -32768;
+    static const int64_t IMM_HIGH = 32767;
+    static const int64_t IMM_MULT = 65536;
+    static const int64_t IMM_FULLHIGH = IMM_HIGH + IMM_HIGH * IMM_MULT;
+    static const int64_t IMM_FULLLOW = IMM_LOW + IMM_LOW  * IMM_MULT;
+
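+    // get_ldah16 / get_lda16 split a constant into the (LDAH, LDA) pair used
+    // to materialize it: LDAH contributes sext(imm16) * 65536 and LDA adds
+    // sext(imm16), so the high half is bumped by one whenever the low
+    // remainder would not fit in a signed 16-bit displacement.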
+    static int64_t get_ldah16(int64_t x) {
+      int64_t y = x / IMM_MULT;
+      if (x % IMM_MULT > IMM_HIGH)
+        ++y;
+      return y;
+    }
+
+    static int64_t get_lda16(int64_t x) {
+      return x - get_ldah16(x) * IMM_MULT;
+    }
+
+    /// get_zapImm - Return a zap mask if X is a valid immediate for a zapnot
+    /// instruction (if not, return 0).  Note that this code accepts partial
+    /// zap masks.  For example (and LHS, 1) is a valid zap, as long as we know
+    /// that the bits 1-7 of LHS are already zero.  If LHS is non-null, we are
+    /// in checking mode.  If LHS is null, we assume that the mask has already
+    /// been validated before.
+    uint64_t get_zapImm(SDOperand LHS, uint64_t Constant) {
+      uint64_t BitsToCheck = 0;
+      unsigned Result = 0;
+      for (unsigned i = 0; i != 8; ++i) {
+        if (((Constant >> 8*i) & 0xFF) == 0) {
+          // nothing to do.
+        } else {
+          Result |= 1 << i;
+          if (((Constant >> 8*i) & 0xFF) == 0xFF) {
+            // If the entire byte is set, zapnot the byte.
+          } else if (LHS.Val == 0) {
+            // Otherwise, if the mask was previously validated, we know it's okay
+            // to zapnot this entire byte even though all the bits aren't set.
+          } else {
+            // Otherwise we don't know that it's okay to zapnot this entire
+            // byte.  Only do this iff we can prove that the missing bits are
+            // already null, so the bytezap doesn't need to really null them.
+            BitsToCheck |= ~Constant & (0xFF << 8*i);
+          }
+        }
+      }
+      
+      // If there are missing bits in a byte (for example, X & 0xEF00), check to
+      // see if the missing bits (0x1000) are already known to be zero. If not,
+      // the zap isn't okay to do, as it won't clear all the required bits.
+      if (BitsToCheck &&
+          !CurDAG->MaskedValueIsZero(LHS, BitsToCheck))
+        return 0;
+      
+      return Result;
+    }
+    
+    static uint64_t get_zapImm(uint64_t x) {
+      unsigned build = 0;
+      for(int i = 0; i != 8; ++i) {
+        if ((x & 0x00FF) == 0x00FF)
+          build |= 1 << i;
+        else if ((x & 0x00FF) != 0)
+          return 0;
+        x >>= 8;
+      }
+      return build;
+    }
+      
+    
+    static uint64_t getNearPower2(uint64_t x) {
+      if (!x) return 0;
+      unsigned at = CountLeadingZeros_64(x);
+      // Use 64-bit arithmetic; shifting a plain int by >= 32 is undefined.
+      uint64_t complow = 1ULL << (63 - at);
+      uint64_t comphigh = 1ULL << (64 - at);
+      //cerr << x << ":" << complow << ":" << comphigh << "\n";
+      if (abs(complow - x) <= abs(comphigh - x))
+        return complow;
+      else
+        return comphigh;
+    }
+
+    static bool chkRemNearPower2(uint64_t x, uint64_t r, bool swap) {
+      uint64_t y = getNearPower2(x);
+      if (swap)
+        return (y - x) == r;
+      else
+        return (x - y) == r;
+    }
+
+    static bool isFPZ(SDOperand N) {
+      ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
+      return (CN && (CN->isExactlyValue(+0.0) || CN->isExactlyValue(-0.0)));
+    }
+    static bool isFPZn(SDOperand N) {
+      ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
+      return (CN && CN->isExactlyValue(-0.0));
+    }
+    static bool isFPZp(SDOperand N) {
+      ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
+      return (CN && CN->isExactlyValue(+0.0));
+    }
+
+  public:
+    AlphaDAGToDAGISel(TargetMachine &TM)
+      : SelectionDAGISel(AlphaLowering), 
+        AlphaLowering(*(AlphaTargetLowering*)(TM.getTargetLowering())) 
+    {}
+
+    /// getI64Imm - Return a target constant with the specified value, of type
+    /// i64.
+    inline SDOperand getI64Imm(int64_t Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i64);
+    }
+
+    // Select - Convert the specified operand from a target-independent to a
+    // target-specific node if it hasn't already been changed.
+    SDNode *Select(SDOperand Op);
+    
+    /// InstructionSelectBasicBlock - This callback is invoked by
+    /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+    virtual void InstructionSelectBasicBlock(SelectionDAG &DAG);
+    
+    virtual const char *getPassName() const {
+      return "Alpha DAG->DAG Pattern Instruction Selection";
+    } 
+
+    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+    /// inline asm expressions.
+    virtual bool SelectInlineAsmMemoryOperand(const SDOperand &Op,
+                                              char ConstraintCode,
+                                              std::vector<SDOperand> &OutOps,
+                                              SelectionDAG &DAG) {
+      SDOperand Op0;
+      switch (ConstraintCode) {
+      default: return true;
+      case 'm':   // memory
+        Op0 = Op;
+        AddToISelQueue(Op0);
+        break;
+      }
+      
+      OutOps.push_back(Op0);
+      return false;
+    }
+    
+// Include the pieces autogenerated from the target description.
+#include "AlphaGenDAGISel.inc"
+    
+private:
+    SDOperand getGlobalBaseReg();
+    SDOperand getGlobalRetAddr();
+    void SelectCALL(SDOperand Op);
+
+  };
+}
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// GOT address into a register.
+///
+SDOperand AlphaDAGToDAGISel::getGlobalBaseReg() {
+  MachineFunction* MF = BB->getParent();
+  unsigned GP = 0;
+  for(MachineFunction::livein_iterator ii = MF->livein_begin(), 
+        ee = MF->livein_end(); ii != ee; ++ii)
+    if (ii->first == Alpha::R29) {
+      GP = ii->second;
+      break;
+    }
+  assert(GP && "GOT PTR not in liveins");
+  return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), 
+                                GP, MVT::i64);
+}
+
+/// getGlobalRetAddr - Grab the return address
+///
+SDOperand AlphaDAGToDAGISel::getGlobalRetAddr() {
+  MachineFunction* MF = BB->getParent();
+  unsigned RA = 0;
+  for(MachineFunction::livein_iterator ii = MF->livein_begin(), 
+        ee = MF->livein_end(); ii != ee; ++ii)
+    if (ii->first == Alpha::R26) {
+      RA = ii->second;
+      break;
+    }
+  assert(RA && "RA PTR not in liveins");
+  return CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                RA, MVT::i64);
+}
+
+/// InstructionSelectBasicBlock - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void AlphaDAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
+  DEBUG(BB->dump());
+  
+  // Select target instructions for the DAG.
+  DAG.setRoot(SelectRoot(DAG.getRoot()));
+  DAG.RemoveDeadNodes();
+  
+  // Emit machine code to BB. 
+  ScheduleAndEmitDAG(DAG);
+}
+
+// Select - Convert the specified operand from a target-independent to a
+// target-specific node if it hasn't already been changed.
+SDNode *AlphaDAGToDAGISel::Select(SDOperand Op) {
+  SDNode *N = Op.Val;
+  if (N->getOpcode() >= ISD::BUILTIN_OP_END &&
+      N->getOpcode() < AlphaISD::FIRST_NUMBER) {
+    return NULL;   // Already selected.
+  }
+
+  switch (N->getOpcode()) {
+  default: break;
+  case AlphaISD::CALL:
+    SelectCALL(Op);
+    return NULL;
+
+  case ISD::FrameIndex: {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    return CurDAG->SelectNodeTo(N, Alpha::LDA, MVT::i64,
+                                CurDAG->getTargetFrameIndex(FI, MVT::i32),
+                                getI64Imm(0));
+  }
+  case ISD::GLOBAL_OFFSET_TABLE: {
+    SDOperand Result = getGlobalBaseReg();
+    ReplaceUses(Op, Result);
+    return NULL;
+  }
+  case AlphaISD::GlobalRetAddr: {
+    SDOperand Result = getGlobalRetAddr();
+    ReplaceUses(Op, Result);
+    return NULL;
+  }
+  
+  case AlphaISD::DivCall: {
+    SDOperand Chain = CurDAG->getEntryNode();
+    SDOperand N0 = Op.getOperand(0);
+    SDOperand N1 = Op.getOperand(1);
+    SDOperand N2 = Op.getOperand(2);
+    AddToISelQueue(N0);
+    AddToISelQueue(N1);
+    AddToISelQueue(N2);
+    Chain = CurDAG->getCopyToReg(Chain, Alpha::R24, N1, 
+                                 SDOperand(0,0));
+    Chain = CurDAG->getCopyToReg(Chain, Alpha::R25, N2, 
+                                 Chain.getValue(1));
+    Chain = CurDAG->getCopyToReg(Chain, Alpha::R27, N0, 
+                                 Chain.getValue(1));
+    SDNode *CNode =
+      CurDAG->getTargetNode(Alpha::JSRs, MVT::Other, MVT::Flag, 
+                            Chain, Chain.getValue(1));
+    Chain = CurDAG->getCopyFromReg(Chain, Alpha::R27, MVT::i64, 
+                                   SDOperand(CNode, 1));
+    return CurDAG->SelectNodeTo(N, Alpha::BISr, MVT::i64, Chain, Chain);
+  }
+
+  case ISD::READCYCLECOUNTER: {
+    SDOperand Chain = N->getOperand(0);
+    AddToISelQueue(Chain); //Select chain
+    return CurDAG->getTargetNode(Alpha::RPCC, MVT::i64, MVT::Other,
+                                 Chain);
+  }
+
+  case ISD::Constant: {
+    uint64_t uval = cast<ConstantSDNode>(N)->getValue();
+    
+    if (uval == 0) {
+      SDOperand Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                                Alpha::R31, MVT::i64);
+      ReplaceUses(Op, Result);
+      return NULL;
+    }
+
+    int64_t val = (int64_t)uval;
+    int32_t val32 = (int32_t)val;
+    if (val <= IMM_HIGH + IMM_HIGH * IMM_MULT &&
+        val >= IMM_LOW  + IMM_LOW  * IMM_MULT)
+      break; //(LDAH (LDA))
+    if ((uval >> 32) == 0 && //empty upper bits
+        val32 <= IMM_HIGH + IMM_HIGH * IMM_MULT)
+      // val32 >= IMM_LOW  + IMM_LOW  * IMM_MULT) //always true
+      break; //(zext (LDAH (LDA)))
+    //Else use the constant pool
+    ConstantInt *C = ConstantInt::get(Type::Int64Ty, uval);
+    SDOperand CPI = CurDAG->getTargetConstantPool(C, MVT::i64);
+    SDNode *Tmp = CurDAG->getTargetNode(Alpha::LDAHr, MVT::i64, CPI,
+                                        getGlobalBaseReg());
+    return CurDAG->SelectNodeTo(N, Alpha::LDQr, MVT::i64, MVT::Other, 
+                                CPI, SDOperand(Tmp, 0), CurDAG->getEntryNode());
+  }
+  case ISD::TargetConstantFP: {
+    ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N);
+    bool isDouble = N->getValueType(0) == MVT::f64;
+    MVT::ValueType T = isDouble ? MVT::f64 : MVT::f32;
+    if (CN->isExactlyValue(+0.0)) {
+      return CurDAG->SelectNodeTo(N, isDouble ? Alpha::CPYST : Alpha::CPYSS,
+                                  T, CurDAG->getRegister(Alpha::F31, T),
+                                  CurDAG->getRegister(Alpha::F31, T));
+    } else if ( CN->isExactlyValue(-0.0)) {
+      return CurDAG->SelectNodeTo(N, isDouble ? Alpha::CPYSNT : Alpha::CPYSNS,
+                                  T, CurDAG->getRegister(Alpha::F31, T),
+                                  CurDAG->getRegister(Alpha::F31, T));
+    } else {
+      abort();
+    }
+    break;
+  }
+
+  case ISD::SETCC:
+    if (MVT::isFloatingPoint(N->getOperand(0).Val->getValueType(0))) {
+      ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+
+      unsigned Opc = Alpha::WTF;
+      bool rev = false;
+      bool inv = false;
+      switch(CC) {
+      default: DEBUG(N->dump(CurDAG)); assert(0 && "Unknown FP comparison!");
+      case ISD::SETEQ: case ISD::SETOEQ: case ISD::SETUEQ:
+        Opc = Alpha::CMPTEQ; break;
+      case ISD::SETLT: case ISD::SETOLT: case ISD::SETULT: 
+        Opc = Alpha::CMPTLT; break;
+      case ISD::SETLE: case ISD::SETOLE: case ISD::SETULE: 
+        Opc = Alpha::CMPTLE; break;
+      case ISD::SETGT: case ISD::SETOGT: case ISD::SETUGT: 
+        Opc = Alpha::CMPTLT; rev = true; break;
+      case ISD::SETGE: case ISD::SETOGE: case ISD::SETUGE: 
+        Opc = Alpha::CMPTLE; rev = true; break;
+      case ISD::SETNE: case ISD::SETONE: case ISD::SETUNE:
+        Opc = Alpha::CMPTEQ; inv = true; break;
+      case ISD::SETO:
+        Opc = Alpha::CMPTUN; inv = true; break;
+      case ISD::SETUO:
+        Opc = Alpha::CMPTUN; break;
+      };
+      SDOperand tmp1 = N->getOperand(rev?1:0);
+      SDOperand tmp2 = N->getOperand(rev?0:1);
+      AddToISelQueue(tmp1);
+      AddToISelQueue(tmp2);
+      SDNode *cmp = CurDAG->getTargetNode(Opc, MVT::f64, tmp1, tmp2);
+      if (inv) 
+        cmp = CurDAG->getTargetNode(Alpha::CMPTEQ, MVT::f64, SDOperand(cmp, 0), 
+                                    CurDAG->getRegister(Alpha::F31, MVT::f64));
+      switch(CC) {
+      case ISD::SETUEQ: case ISD::SETULT: case ISD::SETULE:
+      case ISD::SETUNE: case ISD::SETUGT: case ISD::SETUGE:
+       {
+         SDNode* cmp2 = CurDAG->getTargetNode(Alpha::CMPTUN, MVT::f64,
+                                              tmp1, tmp2);
+         cmp = CurDAG->getTargetNode(Alpha::ADDT, MVT::f64, 
+                                     SDOperand(cmp2, 0), SDOperand(cmp, 0));
+         break;
+       }
+      default: break;
+      }
+
+      SDNode* LD = CurDAG->getTargetNode(Alpha::FTOIT, MVT::i64, SDOperand(cmp, 0));
+      return CurDAG->getTargetNode(Alpha::CMPULT, MVT::i64, 
+                                   CurDAG->getRegister(Alpha::R31, MVT::i64),
+                                   SDOperand(LD,0));
+    }
+    break;
+
+  case ISD::SELECT:
+    if (MVT::isFloatingPoint(N->getValueType(0)) &&
+        (N->getOperand(0).getOpcode() != ISD::SETCC ||
+         !MVT::isFloatingPoint(N->getOperand(0).getOperand(1).getValueType()))) {
+      //This should be the condition not covered by the Patterns
+      //FIXME: Don't have SelectCode die, but rather return something testable
+      // so that things like this can be caught in fall-through code
+      //move int to fp
+      bool isDouble = N->getValueType(0) == MVT::f64;
+      SDOperand cond = N->getOperand(0);
+      SDOperand TV = N->getOperand(1);
+      SDOperand FV = N->getOperand(2);
+      AddToISelQueue(cond);
+      AddToISelQueue(TV);
+      AddToISelQueue(FV);
+      
+      SDNode* LD = CurDAG->getTargetNode(Alpha::ITOFT, MVT::f64, cond);
+      return CurDAG->getTargetNode(isDouble?Alpha::FCMOVNET:Alpha::FCMOVNES,
+                                   MVT::f64, FV, TV, SDOperand(LD,0));
+    }
+    break;
+
+  case ISD::AND: {
+    ConstantSDNode* SC = NULL;
+    ConstantSDNode* MC = NULL;
+    if (N->getOperand(0).getOpcode() == ISD::SRL &&
+        (MC = dyn_cast<ConstantSDNode>(N->getOperand(1))) &&
+        (SC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1)))) {
+      uint64_t sval = SC->getValue();
+      uint64_t mval = MC->getValue();
+      // If the result is a zap, let the autogened stuff handle it.
+      if (get_zapImm(N->getOperand(0), mval))
+        break;
+      // given mask X, and shift S, we want to see if there is any zap in the
+      // mask if we play around with the botton S bits
+      uint64_t dontcare = (~0ULL) >> (64 - sval);
+      uint64_t mask = mval << sval;
+      
+      if (get_zapImm(mask | dontcare))
+        mask = mask | dontcare;
+      
+      if (get_zapImm(mask)) {
+        AddToISelQueue(N->getOperand(0).getOperand(0));
+        SDOperand Z = 
+          SDOperand(CurDAG->getTargetNode(Alpha::ZAPNOTi, MVT::i64,
+                                          N->getOperand(0).getOperand(0),
+                                          getI64Imm(get_zapImm(mask))), 0);
+        return CurDAG->getTargetNode(Alpha::SRLr, MVT::i64, Z, 
+                                     getI64Imm(sval));
+      }
+    }
+    break;
+  }
+
+  }
+
+  return SelectCode(Op);
+}
+
+void AlphaDAGToDAGISel::SelectCALL(SDOperand Op) {
+  //TODO: add flag stuff to prevent nondeterministic breakage!
+
+  SDNode *N = Op.Val;
+  SDOperand Chain = N->getOperand(0);
+  SDOperand Addr = N->getOperand(1);
+  SDOperand InFlag(0,0);  // Null incoming flag value.
+  AddToISelQueue(Chain);
+
+   std::vector<SDOperand> CallOperands;
+   std::vector<MVT::ValueType> TypeOperands;
+  
+   //grab the arguments
+   for(int i = 2, e = N->getNumOperands(); i < e; ++i) {
+     TypeOperands.push_back(N->getOperand(i).getValueType());
+     AddToISelQueue(N->getOperand(i));
+     CallOperands.push_back(N->getOperand(i));
+   }
+   int count = N->getNumOperands() - 2;
+
+   static const unsigned args_int[] = {Alpha::R16, Alpha::R17, Alpha::R18,
+                                       Alpha::R19, Alpha::R20, Alpha::R21};
+   static const unsigned args_float[] = {Alpha::F16, Alpha::F17, Alpha::F18,
+                                         Alpha::F19, Alpha::F20, Alpha::F21};
+   
+   for (int i = 6; i < count; ++i) {
+     unsigned Opc = Alpha::WTF;
+     if (MVT::isInteger(TypeOperands[i])) {
+       Opc = Alpha::STQ;
+     } else if (TypeOperands[i] == MVT::f32) {
+       Opc = Alpha::STS;
+     } else if (TypeOperands[i] == MVT::f64) {
+       Opc = Alpha::STT;
+     } else
+       assert(0 && "Unknown operand"); 
+
+     SDOperand Ops[] = { CallOperands[i],  getI64Imm((i - 6) * 8), 
+                         CurDAG->getCopyFromReg(Chain, Alpha::R30, MVT::i64),
+                         Chain };
+     Chain = SDOperand(CurDAG->getTargetNode(Opc, MVT::Other, Ops, 4), 0);
+   }
+   for (int i = 0; i < std::min(6, count); ++i) {
+     if (MVT::isInteger(TypeOperands[i])) {
+       Chain = CurDAG->getCopyToReg(Chain, args_int[i], CallOperands[i], InFlag);
+       InFlag = Chain.getValue(1);
+     } else if (TypeOperands[i] == MVT::f32 || TypeOperands[i] == MVT::f64) {
+       Chain = CurDAG->getCopyToReg(Chain, args_float[i], CallOperands[i], InFlag);
+       InFlag = Chain.getValue(1);
+     } else
+       assert(0 && "Unknown operand"); 
+   }
+
+   // Finally, once everything is in registers to pass to the call, emit the
+   // call itself.
+   if (Addr.getOpcode() == AlphaISD::GPRelLo) {
+     SDOperand GOT = getGlobalBaseReg();
+     Chain = CurDAG->getCopyToReg(Chain, Alpha::R29, GOT, InFlag);
+     InFlag = Chain.getValue(1);
+     Chain = SDOperand(CurDAG->getTargetNode(Alpha::BSR, MVT::Other, MVT::Flag, 
+                                             Addr.getOperand(0), Chain, InFlag), 0);
+   } else {
+     AddToISelQueue(Addr);
+     Chain = CurDAG->getCopyToReg(Chain, Alpha::R27, Addr, InFlag);
+     InFlag = Chain.getValue(1);
+     Chain = SDOperand(CurDAG->getTargetNode(Alpha::JSR, MVT::Other, MVT::Flag, 
+                                             Chain, InFlag), 0);
+   }
+   InFlag = Chain.getValue(1);
+
+   std::vector<SDOperand> CallResults;
+  
+   switch (N->getValueType(0)) {
+   default: assert(0 && "Unexpected ret value!");
+   case MVT::Other: break;
+   case MVT::i64:
+     Chain = CurDAG->getCopyFromReg(Chain, Alpha::R0, MVT::i64, InFlag).getValue(1);
+     CallResults.push_back(Chain.getValue(0));
+     break;
+   case MVT::f32:
+     Chain = CurDAG->getCopyFromReg(Chain, Alpha::F0, MVT::f32, InFlag).getValue(1);
+     CallResults.push_back(Chain.getValue(0));
+     break;
+   case MVT::f64:
+     Chain = CurDAG->getCopyFromReg(Chain, Alpha::F0, MVT::f64, InFlag).getValue(1);
+     CallResults.push_back(Chain.getValue(0));
+     break;
+   }
+
+   CallResults.push_back(Chain);
+   for (unsigned i = 0, e = CallResults.size(); i != e; ++i)
+     ReplaceUses(Op.getValue(i), CallResults[i]);
+}
+
+
+/// createAlphaISelDag - This pass converts a legalized DAG into a 
+/// Alpha-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createAlphaISelDag(TargetMachine &TM) {
+  return new AlphaDAGToDAGISel(TM);
+}
diff --git a/lib/Target/Alpha/AlphaISelLowering.cpp b/lib/Target/Alpha/AlphaISelLowering.cpp
new file mode 100644
index 0000000..d4777b2
--- /dev/null
+++ b/lib/Target/Alpha/AlphaISelLowering.cpp
@@ -0,0 +1,623 @@
+//===-- AlphaISelLowering.cpp - Alpha DAG Lowering Implementation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Andrew Lenharth and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AlphaISelLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AlphaISelLowering.h"
+#include "AlphaTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+/// AddLiveIn - This helper function adds the specified physical register to the
+/// MachineFunction as a live in value.  It also creates a corresponding virtual
+/// register for it.
+static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
+                          TargetRegisterClass *RC) {
+  assert(RC->contains(PReg) && "Not the correct regclass!");
+  unsigned VReg = MF.getSSARegMap()->createVirtualRegister(RC);
+  MF.addLiveIn(PReg, VReg);
+  return VReg;
+}
+
+AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM) : TargetLowering(TM) {
+  // Set up the TargetLowering object.
+  //I am having problems with shr n ubyte 1
+  setShiftAmountType(MVT::i64);
+  setSetCCResultType(MVT::i64);
+  setSetCCResultContents(ZeroOrOneSetCCResult);
+  
+  setUsesGlobalOffsetTable(true);
+  
+  addRegisterClass(MVT::i64, Alpha::GPRCRegisterClass);
+  addRegisterClass(MVT::f64, Alpha::F8RCRegisterClass);
+  addRegisterClass(MVT::f32, Alpha::F4RCRegisterClass);
+  
+  setLoadXAction(ISD::EXTLOAD, MVT::i1,  Promote);
+  setLoadXAction(ISD::EXTLOAD, MVT::f32, Expand);
+  
+  setLoadXAction(ISD::ZEXTLOAD, MVT::i1,  Promote);
+  setLoadXAction(ISD::ZEXTLOAD, MVT::i32, Expand);
+  
+  setLoadXAction(ISD::SEXTLOAD, MVT::i1,  Promote);
+  setLoadXAction(ISD::SEXTLOAD, MVT::i8,  Expand);
+  setLoadXAction(ISD::SEXTLOAD, MVT::i16, Expand);
+
+  setStoreXAction(MVT::i1, Promote);
+  
+  //  setOperationAction(ISD::BRIND,        MVT::Other,   Expand);
+  setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,        MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);  
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  setOperationAction(ISD::FREM, MVT::f32, Expand);
+  setOperationAction(ISD::FREM, MVT::f64, Expand);
+  
+  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+
+  if (!TM.getSubtarget<AlphaSubtarget>().hasCT()) {
+    setOperationAction(ISD::CTPOP    , MVT::i64  , Expand);
+    setOperationAction(ISD::CTTZ     , MVT::i64  , Expand);
+    setOperationAction(ISD::CTLZ     , MVT::i64  , Expand);
+  }
+  setOperationAction(ISD::BSWAP    , MVT::i64, Expand);
+  setOperationAction(ISD::ROTL     , MVT::i64, Expand);
+  setOperationAction(ISD::ROTR     , MVT::i64, Expand);
+  
+  setOperationAction(ISD::SREM     , MVT::i64, Custom);
+  setOperationAction(ISD::UREM     , MVT::i64, Custom);
+  setOperationAction(ISD::SDIV     , MVT::i64, Custom);
+  setOperationAction(ISD::UDIV     , MVT::i64, Custom);
+
+  setOperationAction(ISD::MEMMOVE  , MVT::Other, Expand);
+  setOperationAction(ISD::MEMSET   , MVT::Other, Expand);
+  setOperationAction(ISD::MEMCPY   , MVT::Other, Expand);
+  
+  // We don't support sin/cos/sqrt
+  setOperationAction(ISD::FSIN , MVT::f64, Expand);
+  setOperationAction(ISD::FCOS , MVT::f64, Expand);
+  setOperationAction(ISD::FSIN , MVT::f32, Expand);
+  setOperationAction(ISD::FCOS , MVT::f32, Expand);
+
+  setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+  setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+  
+  setOperationAction(ISD::SETCC, MVT::f32, Promote);
+
+  setOperationAction(ISD::BIT_CONVERT, MVT::f32, Promote);
+
+  // We don't have line number support yet.
+  setOperationAction(ISD::LOCATION, MVT::Other, Expand);
+  setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+  setOperationAction(ISD::LABEL, MVT::Other, Expand);
+
+  // Not implemented yet.
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+  // We want to legalize GlobalAddress and ConstantPool and
+  // ExternalSymbols nodes into the appropriate instructions to
+  // materialize the address.
+  setOperationAction(ISD::GlobalAddress,  MVT::i64, Custom);
+  setOperationAction(ISD::ConstantPool,   MVT::i64, Custom);
+  setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  setOperationAction(ISD::VAEND,   MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY,  MVT::Other, Custom);
+  setOperationAction(ISD::VAARG,   MVT::Other, Custom);
+  setOperationAction(ISD::VAARG,   MVT::i32,   Custom);
+
+  setOperationAction(ISD::RET,     MVT::Other, Custom);
+
+  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
+  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+
+  setStackPointerRegisterToSaveRestore(Alpha::R30);
+
+  setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+  setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+  addLegalFPImmediate(+0.0); //F31
+  addLegalFPImmediate(-0.0); //-F31
+
+  setJumpBufSize(272);
+  setJumpBufAlignment(16);
+
+  computeRegisterProperties();
+}
+
+const char *AlphaTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return 0;
+  case AlphaISD::CVTQT_: return "Alpha::CVTQT_";
+  case AlphaISD::CVTQS_: return "Alpha::CVTQS_";
+  case AlphaISD::CVTTQ_: return "Alpha::CVTTQ_";
+  case AlphaISD::GPRelHi: return "Alpha::GPRelHi";
+  case AlphaISD::GPRelLo: return "Alpha::GPRelLo";
+  case AlphaISD::RelLit: return "Alpha::RelLit";
+  case AlphaISD::GlobalRetAddr: return "Alpha::GlobalRetAddr";
+  case AlphaISD::CALL:   return "Alpha::CALL";
+  case AlphaISD::DivCall: return "Alpha::DivCall";
+  case AlphaISD::RET_FLAG: return "Alpha::RET_FLAG";
+  case AlphaISD::COND_BRANCH_I: return "Alpha::COND_BRANCH_I";
+  case AlphaISD::COND_BRANCH_F: return "Alpha::COND_BRANCH_F";
+  }
+}
+
+static SDOperand LowerJumpTable(SDOperand Op, SelectionDAG &DAG) {
+  MVT::ValueType PtrVT = Op.getValueType();
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  SDOperand JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+  SDOperand Zero = DAG.getConstant(0, PtrVT);
+  
+  SDOperand Hi = DAG.getNode(AlphaISD::GPRelHi,  MVT::i64, JTI,
+                             DAG.getNode(ISD::GLOBAL_OFFSET_TABLE, MVT::i64));
+  SDOperand Lo = DAG.getNode(AlphaISD::GPRelLo, MVT::i64, JTI, Hi);
+  return Lo;
+}
+
+//http://www.cs.arizona.edu/computer.help/policy/DIGITAL_unix/
+//AA-PY8AC-TET1_html/callCH3.html#BLOCK21
+
+//For now, just use variable size stack frame format
+
+//In a standard call, the first six items are passed in registers $16
+//- $21 and/or registers $f16 - $f21. (See Section 4.1.2 for details
+//of argument-to-register correspondence.) The remaining items are
+//collected in a memory argument list that is a naturally aligned
+//array of quadwords. In a standard call, this list, if present, must
+//be passed at 0(SP).
+//7 ... n         0(SP) ... (n-7)*8(SP)
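+
+//Illustrative example (not part of the ABI text above): for a call
+//  f(i64 a, double b, i64 c, i64 d, i64 e, i64 f, i64 g, i64 h)
+//the assignment is a -> $16, b -> $f17, c -> $18, d -> $19, e -> $20,
+//f -> $21, g -> 0(SP), h -> 8(SP); that is, argument slot i uses $16+i or
+//$f16+i depending on its type, and everything past the sixth slot lands in
+//the quadword-aligned list starting at 0(SP).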
+
+// //#define FP    $15
+// //#define RA    $26
+// //#define PV    $27
+// //#define GP    $29
+// //#define SP    $30
+
+static SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG,
+                                       int &VarArgsBase,
+                                       int &VarArgsOffset) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  std::vector<SDOperand> ArgValues;
+  SDOperand Root = Op.getOperand(0);
+
+  AddLiveIn(MF, Alpha::R29, &Alpha::GPRCRegClass); //GP
+  AddLiveIn(MF, Alpha::R26, &Alpha::GPRCRegClass); //RA
+
+  unsigned args_int[] = {
+    Alpha::R16, Alpha::R17, Alpha::R18, Alpha::R19, Alpha::R20, Alpha::R21};
+  unsigned args_float[] = {
+    Alpha::F16, Alpha::F17, Alpha::F18, Alpha::F19, Alpha::F20, Alpha::F21};
+  
+  for (unsigned ArgNo = 0, e = Op.Val->getNumValues()-1; ArgNo != e; ++ArgNo) {
+    SDOperand argt;
+    MVT::ValueType ObjectVT = Op.getValue(ArgNo).getValueType();
+    SDOperand ArgVal;
+
+    if (ArgNo  < 6) {
+      switch (ObjectVT) {
+      default:
+        cerr << "Unknown Type " << ObjectVT << "\n";
+        abort();
+      case MVT::f64:
+        args_float[ArgNo] = AddLiveIn(MF, args_float[ArgNo], 
+                                      &Alpha::F8RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Root, args_float[ArgNo], ObjectVT);
+        break;
+      case MVT::f32:
+        args_float[ArgNo] = AddLiveIn(MF, args_float[ArgNo], 
+                                      &Alpha::F4RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Root, args_float[ArgNo], ObjectVT);
+        break;
+      case MVT::i64:
+        args_int[ArgNo] = AddLiveIn(MF, args_int[ArgNo], 
+                                    &Alpha::GPRCRegClass);
+        ArgVal = DAG.getCopyFromReg(Root, args_int[ArgNo], MVT::i64);
+        break;
+      }
+    } else { //more args
+      // Create the frame index object for this incoming parameter...
+      int FI = MFI->CreateFixedObject(8, 8 * (ArgNo - 6));
+
+      // Create the SelectionDAG nodes corresponding to a load
+      //from this parameter
+      SDOperand FIN = DAG.getFrameIndex(FI, MVT::i64);
+      ArgVal = DAG.getLoad(ObjectVT, Root, FIN, NULL, 0);
+    }
+    ArgValues.push_back(ArgVal);
+  }
+
+  // If the function takes a variable number of arguments, copy all arg regs to the stack
+  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+  if (isVarArg) {
+    VarArgsOffset = (Op.Val->getNumValues()-1) * 8;
+    std::vector<SDOperand> LS;
+    for (int i = 0; i < 6; ++i) {
+      if (MRegisterInfo::isPhysicalRegister(args_int[i]))
+        args_int[i] = AddLiveIn(MF, args_int[i], &Alpha::GPRCRegClass);
+      SDOperand argt = DAG.getCopyFromReg(Root, args_int[i], MVT::i64);
+      int FI = MFI->CreateFixedObject(8, -8 * (6 - i));
+      if (i == 0) VarArgsBase = FI;
+      SDOperand SDFI = DAG.getFrameIndex(FI, MVT::i64);
+      LS.push_back(DAG.getStore(Root, argt, SDFI, NULL, 0));
+
+      if (MRegisterInfo::isPhysicalRegister(args_float[i]))
+        args_float[i] = AddLiveIn(MF, args_float[i], &Alpha::F8RCRegClass);
+      argt = DAG.getCopyFromReg(Root, args_float[i], MVT::f64);
+      FI = MFI->CreateFixedObject(8, - 8 * (12 - i));
+      SDFI = DAG.getFrameIndex(FI, MVT::i64);
+      LS.push_back(DAG.getStore(Root, argt, SDFI, NULL, 0));
+    }
+
+    //Set up a token factor with all the stack traffic
+    Root = DAG.getNode(ISD::TokenFactor, MVT::Other, &LS[0], LS.size());
+  }
+
+  ArgValues.push_back(Root);
+
+  // Return the new list of results.
+  std::vector<MVT::ValueType> RetVT(Op.Val->value_begin(),
+                                    Op.Val->value_end());
+  return DAG.getNode(ISD::MERGE_VALUES, RetVT, &ArgValues[0], ArgValues.size());
+}
+
+static SDOperand LowerRET(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand Copy = DAG.getCopyToReg(Op.getOperand(0), Alpha::R26, 
+                                    DAG.getNode(AlphaISD::GlobalRetAddr, 
+                                                MVT::i64),
+                                    SDOperand());
+  switch (Op.getNumOperands()) {
+  default:
+    assert(0 && "Do not know how to return this many arguments!");
+    abort();
+  case 1: 
+    break;
+    //return SDOperand(); // ret void is legal
+  case 3: {
+    MVT::ValueType ArgVT = Op.getOperand(1).getValueType();
+    unsigned ArgReg;
+    if (MVT::isInteger(ArgVT))
+      ArgReg = Alpha::R0;
+    else {
+      assert(MVT::isFloatingPoint(ArgVT));
+      ArgReg = Alpha::F0;
+    }
+    Copy = DAG.getCopyToReg(Copy, ArgReg, Op.getOperand(1), Copy.getValue(1));
+    if (DAG.getMachineFunction().liveout_empty())
+      DAG.getMachineFunction().addLiveOut(ArgReg);
+    break;
+  }
+  }
+  return DAG.getNode(AlphaISD::RET_FLAG, MVT::Other, Copy, Copy.getValue(1));
+}
+
+std::pair<SDOperand, SDOperand>
+AlphaTargetLowering::LowerCallTo(SDOperand Chain, const Type *RetTy, 
+                                 bool RetTyIsSigned, bool isVarArg,
+                                 unsigned CallingConv, bool isTailCall,
+                                 SDOperand Callee, ArgListTy &Args,
+                                 SelectionDAG &DAG) {
+  int NumBytes = 0;
+  if (Args.size() > 6)
+    NumBytes = (Args.size() - 6) * 8;
+
+  Chain = DAG.getCALLSEQ_START(Chain,
+                               DAG.getConstant(NumBytes, getPointerTy()));
+  std::vector<SDOperand> args_to_use;
+  for (unsigned i = 0, e = Args.size(); i != e; ++i) {
+    switch (getValueType(Args[i].Ty)) {
+    default: assert(0 && "Unexpected ValueType for argument!");
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      // Promote the integer to 64 bits.  If the input type is signed use a
+      // sign extend, otherwise use a zero extend.
+      if (Args[i].isSExt)
+        Args[i].Node = DAG.getNode(ISD::SIGN_EXTEND, MVT::i64, Args[i].Node);
+      else if (Args[i].isZExt)
+        Args[i].Node = DAG.getNode(ISD::ZERO_EXTEND, MVT::i64, Args[i].Node);
+      else
+        Args[i].Node = DAG.getNode(ISD::ANY_EXTEND, MVT::i64, Args[i].Node);
+      break;
+    case MVT::i64:
+    case MVT::f64:
+    case MVT::f32:
+      break;
+    }
+    args_to_use.push_back(Args[i].Node);
+  }
+
+  std::vector<MVT::ValueType> RetVals;
+  MVT::ValueType RetTyVT = getValueType(RetTy);
+  MVT::ValueType ActualRetTyVT = RetTyVT;
+  if (RetTyVT >= MVT::i1 && RetTyVT <= MVT::i32)
+    ActualRetTyVT = MVT::i64;
+
+  if (RetTyVT != MVT::isVoid)
+    RetVals.push_back(ActualRetTyVT);
+  RetVals.push_back(MVT::Other);
+
+  std::vector<SDOperand> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+  Ops.insert(Ops.end(), args_to_use.begin(), args_to_use.end());
+  SDOperand TheCall = DAG.getNode(AlphaISD::CALL, RetVals, &Ops[0], Ops.size());
+  Chain = TheCall.getValue(RetTyVT != MVT::isVoid);
+  Chain = DAG.getNode(ISD::CALLSEQ_END, MVT::Other, Chain,
+                      DAG.getConstant(NumBytes, getPointerTy()));
+  SDOperand RetVal = TheCall;
+
+  if (RetTyVT != ActualRetTyVT) {
+    RetVal = DAG.getNode(RetTyIsSigned ? ISD::AssertSext : ISD::AssertZext,
+                         MVT::i64, RetVal, DAG.getValueType(RetTyVT));
+    RetVal = DAG.getNode(ISD::TRUNCATE, RetTyVT, RetVal);
+  }
+
+  return std::make_pair(RetVal, Chain);
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDOperand AlphaTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
+  switch (Op.getOpcode()) {
+  default: assert(0 && "Wasn't expecting to be able to lower this!");
+  case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG, 
+                                                           VarArgsBase,
+                                                           VarArgsOffset);
+
+  case ISD::RET: return LowerRET(Op,DAG);
+  case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+
+  case ISD::SINT_TO_FP: {
+    assert(MVT::i64 == Op.getOperand(0).getValueType() && 
+           "Unhandled SINT_TO_FP type in custom expander!");
+    SDOperand LD;
+    bool isDouble = MVT::f64 == Op.getValueType();
+    LD = DAG.getNode(ISD::BIT_CONVERT, MVT::f64, Op.getOperand(0));
+    SDOperand FP = DAG.getNode(isDouble?AlphaISD::CVTQT_:AlphaISD::CVTQS_,
+                               isDouble?MVT::f64:MVT::f32, LD);
+    return FP;
+  }
+  case ISD::FP_TO_SINT: {
+    bool isDouble = MVT::f64 == Op.getOperand(0).getValueType();
+    SDOperand src = Op.getOperand(0);
+
+    if (!isDouble) //Promote
+      src = DAG.getNode(ISD::FP_EXTEND, MVT::f64, src);
+    
+    src = DAG.getNode(AlphaISD::CVTTQ_, MVT::f64, src);
+
+    return DAG.getNode(ISD::BIT_CONVERT, MVT::i64, src);
+  }
+  case ISD::ConstantPool: {
+    ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+    Constant *C = CP->getConstVal();
+    SDOperand CPI = DAG.getTargetConstantPool(C, MVT::i64, CP->getAlignment());
+    
+    SDOperand Hi = DAG.getNode(AlphaISD::GPRelHi,  MVT::i64, CPI,
+                               DAG.getNode(ISD::GLOBAL_OFFSET_TABLE, MVT::i64));
+    SDOperand Lo = DAG.getNode(AlphaISD::GPRelLo, MVT::i64, CPI, Hi);
+    return Lo;
+  }
+  case ISD::GlobalTLSAddress:
+    assert(0 && "TLS not implemented for Alpha.");
+  case ISD::GlobalAddress: {
+    GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
+    GlobalValue *GV = GSDN->getGlobal();
+    SDOperand GA = DAG.getTargetGlobalAddress(GV, MVT::i64, GSDN->getOffset());
+
+    //    if (!GV->hasWeakLinkage() && !GV->isDeclaration() && !GV->hasLinkOnceLinkage()) {
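+    // Roughly (illustrative, assuming the usual Alpha explicit-relocation
+    // assembly syntax): an internal symbol becomes a gp-relative pair,
+    //   ldah $r, sym($gp) !gprelhigh
+    //   lda  $r, sym($r)  !gprellow
+    // while anything else goes through the literal pool via RelLit,
+    //   ldq  $r, sym($gp) !literal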
+    if (GV->hasInternalLinkage()) {
+      SDOperand Hi = DAG.getNode(AlphaISD::GPRelHi,  MVT::i64, GA,
+                                DAG.getNode(ISD::GLOBAL_OFFSET_TABLE, MVT::i64));
+      SDOperand Lo = DAG.getNode(AlphaISD::GPRelLo, MVT::i64, GA, Hi);
+      return Lo;
+    } else
+      return DAG.getNode(AlphaISD::RelLit, MVT::i64, GA, 
+                         DAG.getNode(ISD::GLOBAL_OFFSET_TABLE, MVT::i64));
+  }
+  case ISD::ExternalSymbol: {
+    return DAG.getNode(AlphaISD::RelLit, MVT::i64, 
+                       DAG.getTargetExternalSymbol(cast<ExternalSymbolSDNode>(Op)
+                                                   ->getSymbol(), MVT::i64),
+                       DAG.getNode(ISD::GLOBAL_OFFSET_TABLE, MVT::i64));
+  }
+
+  case ISD::UREM:
+  case ISD::SREM:
+    // Only expand when the divisor is a constant
+    if (Op.getOperand(1).getOpcode() == ISD::Constant) {
+      MVT::ValueType VT = Op.Val->getValueType(0);
+      SDOperand Tmp1 = Op.Val->getOpcode() == ISD::UREM ?
+        BuildUDIV(Op.Val, DAG, NULL) :
+        BuildSDIV(Op.Val, DAG, NULL);
+      Tmp1 = DAG.getNode(ISD::MUL, VT, Tmp1, Op.getOperand(1));
+      Tmp1 = DAG.getNode(ISD::SUB, VT, Op.getOperand(0), Tmp1);
+      return Tmp1;
+    }
+    //fall through
+  case ISD::SDIV:
+  case ISD::UDIV:
+    if (MVT::isInteger(Op.getValueType())) {
+      if (Op.getOperand(1).getOpcode() == ISD::Constant)
+        return Op.getOpcode() == ISD::SDIV ? BuildSDIV(Op.Val, DAG, NULL) 
+          : BuildUDIV(Op.Val, DAG, NULL);
+      const char* opstr = 0;
+      switch (Op.getOpcode()) {
+      case ISD::UREM: opstr = "__remqu"; break;
+      case ISD::SREM: opstr = "__remq";  break;
+      case ISD::UDIV: opstr = "__divqu"; break;
+      case ISD::SDIV: opstr = "__divq";  break;
+      }
+      SDOperand Tmp1 = Op.getOperand(0),
+        Tmp2 = Op.getOperand(1),
+        Addr = DAG.getExternalSymbol(opstr, MVT::i64);
+      return DAG.getNode(AlphaISD::DivCall, MVT::i64, Addr, Tmp1, Tmp2);
+    }
+    break;
+
+  case ISD::VAARG: {
+    SDOperand Chain = Op.getOperand(0);
+    SDOperand VAListP = Op.getOperand(1);
+    SrcValueSDNode *VAListS = cast<SrcValueSDNode>(Op.getOperand(2));
+    
+    SDOperand Base = DAG.getLoad(MVT::i64, Chain, VAListP, VAListS->getValue(),
+                                 VAListS->getOffset());
+    SDOperand Tmp = DAG.getNode(ISD::ADD, MVT::i64, VAListP,
+                                DAG.getConstant(8, MVT::i64));
+    SDOperand Offset = DAG.getExtLoad(ISD::SEXTLOAD, MVT::i64, Base.getValue(1),
+                                      Tmp, NULL, 0, MVT::i32);
+    SDOperand DataPtr = DAG.getNode(ISD::ADD, MVT::i64, Base, Offset);
+    if (MVT::isFloatingPoint(Op.getValueType())) {
+      //if fp && Offset < 6*8, then subtract 6*8 from DataPtr
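+      // (Illustrative: LowerFORMAL_ARGUMENTS above spills the six integer
+      // argument registers starting at the va_list base and the six FP
+      // argument registers 6*8 bytes below that, so an offset below 48
+      // means a register-saved argument whose FP copy is at DataPtr - 48.)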
+      SDOperand FPDataPtr = DAG.getNode(ISD::SUB, MVT::i64, DataPtr,
+                                        DAG.getConstant(8*6, MVT::i64));
+      SDOperand CC = DAG.getSetCC(MVT::i64, Offset,
+                                  DAG.getConstant(8*6, MVT::i64), ISD::SETLT);
+      DataPtr = DAG.getNode(ISD::SELECT, MVT::i64, CC, FPDataPtr, DataPtr);
+    }
+
+    SDOperand NewOffset = DAG.getNode(ISD::ADD, MVT::i64, Offset,
+                                      DAG.getConstant(8, MVT::i64));
+    SDOperand Update = DAG.getTruncStore(Offset.getValue(1), NewOffset,
+                                         Tmp, NULL, 0, MVT::i32);
+    
+    SDOperand Result;
+    if (Op.getValueType() == MVT::i32)
+      Result = DAG.getExtLoad(ISD::SEXTLOAD, MVT::i64, Update, DataPtr,
+                              NULL, 0, MVT::i32);
+    else
+      Result = DAG.getLoad(Op.getValueType(), Update, DataPtr, NULL, 0);
+    return Result;
+  }
+  case ISD::VACOPY: {
+    SDOperand Chain = Op.getOperand(0);
+    SDOperand DestP = Op.getOperand(1);
+    SDOperand SrcP = Op.getOperand(2);
+    SrcValueSDNode *DestS = cast<SrcValueSDNode>(Op.getOperand(3));
+    SrcValueSDNode *SrcS = cast<SrcValueSDNode>(Op.getOperand(4));
+    
+    SDOperand Val = DAG.getLoad(getPointerTy(), Chain, SrcP,
+                                SrcS->getValue(), SrcS->getOffset());
+    SDOperand Result = DAG.getStore(Val.getValue(1), Val, DestP, DestS->getValue(),
+                                    DestS->getOffset());
+    SDOperand NP = DAG.getNode(ISD::ADD, MVT::i64, SrcP, 
+                               DAG.getConstant(8, MVT::i64));
+    Val = DAG.getExtLoad(ISD::SEXTLOAD, MVT::i64, Result, NP, NULL,0, MVT::i32);
+    SDOperand NPD = DAG.getNode(ISD::ADD, MVT::i64, DestP,
+                                DAG.getConstant(8, MVT::i64));
+    return DAG.getTruncStore(Val.getValue(1), Val, NPD, NULL, 0, MVT::i32);
+  }
+  case ISD::VASTART: {
+    SDOperand Chain = Op.getOperand(0);
+    SDOperand VAListP = Op.getOperand(1);
+    SrcValueSDNode *VAListS = cast<SrcValueSDNode>(Op.getOperand(2));
+    
+    // vastart stores the address of the VarArgsBase and VarArgsOffset
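+    // Conceptually the va_list behaves like the following (an illustrative
+    // sketch, not a type used anywhere in this code):
+    //   struct va_list {
+    //     char *base;   // start of the register/stack argument save area
+    //     int   offset; // bytes of arguments consumed so far
+    //   };
+    // so vastart stores the 8-byte base pointer at VAListP and the 32-bit
+    // offset at VAListP + 8, matching the loads in the VAARG lowering above.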
+    SDOperand FR  = DAG.getFrameIndex(VarArgsBase, MVT::i64);
+    SDOperand S1  = DAG.getStore(Chain, FR, VAListP, VAListS->getValue(),
+                                 VAListS->getOffset());
+    SDOperand SA2 = DAG.getNode(ISD::ADD, MVT::i64, VAListP,
+                                DAG.getConstant(8, MVT::i64));
+    return DAG.getTruncStore(S1, DAG.getConstant(VarArgsOffset, MVT::i64),
+                             SA2, NULL, 0, MVT::i32);
+  }
+  case ISD::RETURNADDR:        
+    return DAG.getNode(AlphaISD::GlobalRetAddr, MVT::i64);
+      //FIXME: implement
+  case ISD::FRAMEADDR:          break;
+  }
+  
+  return SDOperand();
+}
+
+SDOperand AlphaTargetLowering::CustomPromoteOperation(SDOperand Op, 
+                                                      SelectionDAG &DAG) {
+  assert(Op.getValueType() == MVT::i32 && 
+         Op.getOpcode() == ISD::VAARG &&
+         "Unknown node to custom promote!");
+  
+  // The code in LowerOperation already handles i32 vaarg
+  return LowerOperation(Op, DAG);
+}
+
+
+//Inline Asm
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+AlphaTargetLowering::ConstraintType 
+AlphaTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default: break;
+    case 'f':
+    case 'r':
+      return C_RegisterClass;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
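+// Illustrative use from LLVM IR (an assumed example, not part of this file):
+// the 'r' and 'f' letters select the integer and FP register classes below,
+//   %s = call i64 asm "addq $1,$2,$0", "=r,r,r"(i64 %a, i64 %b)
+//   %d = call double asm "addt $1,$2,$0", "=f,f,f"(double %x, double %y)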
+std::vector<unsigned> AlphaTargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                  MVT::ValueType VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default: break;  // Unknown constraint letter
+    case 'f': 
+      return make_vector<unsigned>(Alpha::F0 , Alpha::F1 , Alpha::F2 ,
+                                   Alpha::F3 , Alpha::F4 , Alpha::F5 ,
+                                   Alpha::F6 , Alpha::F7 , Alpha::F8 , 
+                                   Alpha::F9 , Alpha::F10, Alpha::F11, 
+                                   Alpha::F12, Alpha::F13, Alpha::F14, 
+                                   Alpha::F15, Alpha::F16, Alpha::F17, 
+                                   Alpha::F18, Alpha::F19, Alpha::F20, 
+                                   Alpha::F21, Alpha::F22, Alpha::F23, 
+                                   Alpha::F24, Alpha::F25, Alpha::F26, 
+                                   Alpha::F27, Alpha::F28, Alpha::F29, 
+                                   Alpha::F30, Alpha::F31, 0);
+    case 'r': 
+      return make_vector<unsigned>(Alpha::R0 , Alpha::R1 , Alpha::R2 , 
+                                   Alpha::R3 , Alpha::R4 , Alpha::R5 , 
+                                   Alpha::R6 , Alpha::R7 , Alpha::R8 , 
+                                   Alpha::R9 , Alpha::R10, Alpha::R11, 
+                                   Alpha::R12, Alpha::R13, Alpha::R14, 
+                                   Alpha::R15, Alpha::R16, Alpha::R17, 
+                                   Alpha::R18, Alpha::R19, Alpha::R20, 
+                                   Alpha::R21, Alpha::R22, Alpha::R23, 
+                                   Alpha::R24, Alpha::R25, Alpha::R26, 
+                                   Alpha::R27, Alpha::R28, Alpha::R29, 
+                                   Alpha::R30, Alpha::R31, 0);
+    }
+  }
+  
+  return std::vector<unsigned>();
+}
diff --git a/lib/Target/Alpha/AlphaISelLowering.h b/lib/Target/Alpha/AlphaISelLowering.h
new file mode 100644
index 0000000..24e40a5
--- /dev/null
+++ b/lib/Target/Alpha/AlphaISelLowering.h
@@ -0,0 +1,94 @@
+//===-- AlphaISelLowering.h - Alpha DAG Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Andrew Lenharth and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Alpha uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ALPHA_ALPHAISELLOWERING_H
+#define LLVM_TARGET_ALPHA_ALPHAISELLOWERING_H
+
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "Alpha.h"
+
+namespace llvm {
+
+  namespace AlphaISD {
+    enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END+Alpha::INSTRUCTION_LIST_END,
+      // These correspond to the identically named instructions
+      CVTQT_, CVTQS_, CVTTQ_,
+
+      /// GPRelHi/GPRelLo - These represent the high and low 16-bit
+      /// parts of a global address respectively.
+      GPRelHi, GPRelLo, 
+
+      /// RelLit - Literal relocation of a global
+      RelLit,
+
+      /// GlobalRetAddr - used to restore the return address
+      GlobalRetAddr,
+      
+      /// CALL - Normal call.
+      CALL,
+
+      /// DIVCALL - used for special library calls for div and rem
+      DivCall,
+      
+      /// return flag operand
+      RET_FLAG,
+
+      /// CHAIN = COND_BRANCH CHAIN, OPC, (G|F)PRC, DESTBB [, INFLAG] - This
+      /// corresponds to the COND_BRANCH pseudo instruction.  
+      /// *PRC is the input register to compare to zero,
+      /// OPC is the branch opcode to use (e.g. Alpha::BEQ),
+      /// DESTBB is the destination block to branch to, and INFLAG is
+      /// an optional input flag argument.
+      COND_BRANCH_I, COND_BRANCH_F
+
+    };
+  }
+
+  class AlphaTargetLowering : public TargetLowering {
+    int VarArgsOffset;  // What is the offset to the first vaarg
+    int VarArgsBase;    // What is the base FrameIndex
+    bool useITOF;
+  public:
+    AlphaTargetLowering(TargetMachine &TM);
+    
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    ///
+    virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
+    virtual SDOperand CustomPromoteOperation(SDOperand Op, SelectionDAG &DAG);
+
+    //Friendly names for dumps
+    const char *getTargetNodeName(unsigned Opcode) const;
+
+    /// LowerCallTo - This hook lowers an abstract call to a function into an
+    /// actual call.
+    virtual std::pair<SDOperand, SDOperand>
+    LowerCallTo(SDOperand Chain, const Type *RetTy, bool RetTyIsSigned, 
+                bool isVarArg, unsigned CC, bool isTailCall, SDOperand Callee, 
+                ArgListTy &Args, SelectionDAG &DAG);
+
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+
+    std::vector<unsigned> 
+      getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                        MVT::ValueType VT) const;
+
+    bool hasITOF() { return useITOF; }
+  };
+}
+
+#endif   // LLVM_TARGET_ALPHA_ALPHAISELLOWERING_H
diff --git a/lib/Target/Alpha/AlphaInstrFormats.td b/lib/Target/Alpha/AlphaInstrFormats.td
new file mode 100644
index 0000000..259e9af
--- /dev/null
+++ b/lib/Target/Alpha/AlphaInstrFormats.td
@@ -0,0 +1,249 @@
+//===- AlphaInstrFormats.td - Alpha Instruction Formats ----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//Instruction formats (section 3.3 of the Alpha architecture manual):
+//Memory
+//Branch
+//Operate
+//Floating-point
+//PALcode
+
+def u8imm   : Operand<i64>;
+def s14imm  : Operand<i64>;
+def s16imm  : Operand<i64>;
+def s21imm  : Operand<i64>;
+def s64imm  : Operand<i64>;
+def u64imm  : Operand<i64>;
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+// Alpha instruction baseline
+class InstAlpha<bits<6> op, string asmstr, InstrItinClass itin> : Instruction {
+  field bits<32> Inst;
+  let Namespace = "Alpha";
+  let AsmString = asmstr;
+  let Inst{31-26} = op;
+  let Itinerary = itin;
+}
+
+
+//3.3.1
+class MForm<bits<6> opcode, bit store, bit load, string asmstr, list<dag> pattern, InstrItinClass itin> 
+        : InstAlpha<opcode, asmstr, itin> {
+  let Pattern = pattern;
+  let isStore = store;
+  let isLoad = load;
+  let Defs = [R28]; //We may use this for frame index calculations, so reserve it here
+
+  bits<5> Ra;
+  bits<16> disp;
+  bits<5> Rb;
+
+  let Inst{25-21} = Ra;
+  let Inst{20-16} = Rb;
+  let Inst{15-0} = disp;
+}
+class MfcForm<bits<6> opcode, bits<16> fc, string asmstr, InstrItinClass itin> 
+        : InstAlpha<opcode, asmstr, itin> {    
+  bits<5> Ra;
+
+  let OperandList = (ops GPRC:$RA);
+  let Inst{25-21} = Ra;
+  let Inst{20-16} = 0;
+  let Inst{15-0} = fc;
+}
+
+class MbrForm<bits<6> opcode, bits<2> TB, dag OL, string asmstr, InstrItinClass itin>
+    : InstAlpha<opcode, asmstr, itin> {
+  bits<5> Ra;
+  bits<5> Rb;
+  bits<14> disp;
+
+  let OperandList = OL;
+
+  let Inst{25-21} = Ra;
+  let Inst{20-16} = Rb;
+  let Inst{15-14} = TB;
+  let Inst{13-0} = disp;
+}
+class MbrpForm<bits<6> opcode, bits<2> TB, dag OL, string asmstr, list<dag> pattern, InstrItinClass itin>
+    : InstAlpha<opcode, asmstr, itin> {
+  let Pattern=pattern;
+  bits<5> Ra;
+  bits<5> Rb;
+  bits<14> disp;
+
+  let OperandList = OL;
+
+  let Inst{25-21} = Ra;
+  let Inst{20-16} = Rb;
+  let Inst{15-14} = TB;
+  let Inst{13-0} = disp;
+}
+
+//3.3.2
+def target : Operand<OtherVT> {}
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, noResults = 1 in {
+class BFormN<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin>
+   : InstAlpha<opcode, asmstr, itin> {
+  let OperandList = OL;
+  bits<64> Opc; //dummy
+  bits<5> Ra;
+  bits<21> disp;
+
+  let Inst{25-21} = Ra;
+  let Inst{20-0} = disp;
+}
+}
+
+let isBranch = 1, isTerminator = 1 in
+class BFormD<bits<6> opcode, string asmstr, list<dag> pattern, InstrItinClass itin> 
+    : InstAlpha<opcode, asmstr, itin> {
+  let Pattern = pattern;
+  let OperandList = (ops target:$DISP);
+  bits<5> Ra;
+  bits<21> disp;
+
+  let Inst{25-21} = Ra;
+  let Inst{20-0} = disp;
+}
+
+//3.3.3
+class OForm<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin> 
+    : InstAlpha<opcode, asmstr, itin> {
+  let Pattern = pattern;
+  let OperandList = (ops GPRC:$RC, GPRC:$RA, GPRC:$RB);
+
+  bits<5> Rc;
+  bits<5> Ra;
+  bits<5> Rb;
+  bits<7> Function = fun;
+
+  let Inst{25-21} = Ra;
+  let Inst{20-16} = Rb;
+  let Inst{15-13} = 0;
+  let Inst{12} = 0;
+  let Inst{11-5} = Function;
+  let Inst{4-0} = Rc;
+}
+
+class OForm2<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin> 
+    : InstAlpha<opcode, asmstr, itin> {
+  let Pattern = pattern;
+  let OperandList = (ops GPRC:$RC, GPRC:$RB);
+
+  bits<5> Rc;
+  bits<5> Rb;
+  bits<7> Function = fun;
+
+  let Inst{25-21} = 31;
+  let Inst{20-16} = Rb;
+  let Inst{15-13} = 0;
+  let Inst{12} = 0;
+  let Inst{11-5} = Function;
+  let Inst{4-0} = Rc;
+}
+
+class OForm4<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin> 
+    : InstAlpha<opcode, asmstr, itin> {
+  let Pattern = pattern;
+  let OperandList = (ops GPRC:$RDEST, GPRC:$RCOND, GPRC:$RTRUE, GPRC:$RFALSE);
+  let Constraints = "$RFALSE = $RDEST";
+  let DisableEncoding = "$RFALSE";
+
+  bits<5> Rc;
+  bits<5> Ra;
+  bits<5> Rb;
+  bits<7> Function = fun;
+
+//  let isTwoAddress = 1;
+  let Inst{25-21} = Ra;
+  let Inst{20-16} = Rb;
+  let Inst{15-13} = 0;
+  let Inst{12} = 0;
+  let Inst{11-5} = Function;
+  let Inst{4-0} = Rc;
+}
+
+
+class OFormL<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin> 
+    : InstAlpha<opcode, asmstr, itin> {
+  let Pattern = pattern;
+  let OperandList = (ops GPRC:$RC, GPRC:$RA, u8imm:$L);
+
+  bits<5> Rc;
+  bits<5> Ra;
+  bits<8> LIT;
+  bits<7> Function = fun;
+
+  let Inst{25-21} = Ra;
+  let Inst{20-13} = LIT;
+  let Inst{12} = 1;
+  let Inst{11-5} = Function;
+  let Inst{4-0} = Rc;
+}
+
+class OForm4L<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin> 
+    : InstAlpha<opcode, asmstr, itin> {
+  let Pattern = pattern;
+  let OperandList = (ops GPRC:$RDEST, GPRC:$RCOND, s64imm:$RTRUE, GPRC:$RFALSE);
+  let Constraints = "$RFALSE = $RDEST";
+  let DisableEncoding = "$RFALSE";
+
+  bits<5> Rc;
+  bits<5> Ra;
+  bits<8> LIT;
+  bits<7> Function = fun;
+
+//  let isTwoAddress = 1;
+  let Inst{25-21} = Ra;
+  let Inst{20-13} = LIT;
+  let Inst{12} = 1;
+  let Inst{11-5} = Function;
+  let Inst{4-0} = Rc;
+}
+
+//3.3.4
+class FPForm<bits<6> opcode, bits<11> fun, string asmstr, list<dag> pattern, InstrItinClass itin> 
+    : InstAlpha<opcode, asmstr, itin> {
+  let Pattern = pattern;
+
+  bits<5> Fc;
+  bits<5> Fa;
+  bits<5> Fb;
+  bits<11> Function = fun;
+
+  let Inst{25-21} = Fa;
+  let Inst{20-16} = Fb;
+  let Inst{15-5} = Function;
+  let Inst{4-0} = Fc;
+}
+
+//3.3.5
+class PALForm<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin>
+    : InstAlpha<opcode, asmstr, itin> {
+  let OperandList = OL;
+  bits<26> Function;
+
+  let Inst{25-0} = Function;
+}
+
+
+// Pseudo instructions.
+class PseudoInstAlpha<dag OL, string nm, list<dag> pattern, InstrItinClass itin> 
+    : InstAlpha<0, nm, itin>  {
+  let OperandList = OL;
+  let Pattern = pattern;
+
+}
diff --git a/lib/Target/Alpha/AlphaInstrInfo.cpp b/lib/Target/Alpha/AlphaInstrInfo.cpp
new file mode 100644
index 0000000..718587d
--- /dev/null
+++ b/lib/Target/Alpha/AlphaInstrInfo.cpp
@@ -0,0 +1,266 @@
+//===- AlphaInstrInfo.cpp - Alpha Instruction Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Alpha implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Alpha.h"
+#include "AlphaInstrInfo.h"
+#include "AlphaGenInstrInfo.inc"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+using namespace llvm;
+
+AlphaInstrInfo::AlphaInstrInfo()
+  : TargetInstrInfo(AlphaInsts, sizeof(AlphaInsts)/sizeof(AlphaInsts[0])),
+    RI(*this) { }
+
+
+bool AlphaInstrInfo::isMoveInstr(const MachineInstr& MI,
+                                 unsigned& sourceReg,
+                                 unsigned& destReg) const {
+  MachineOpCode oc = MI.getOpcode();
+  if (oc == Alpha::BISr   || 
+      oc == Alpha::CPYSS  || 
+      oc == Alpha::CPYST  ||
+      oc == Alpha::CPYSSt || 
+      oc == Alpha::CPYSTs) {
+    // or r1, r2, r2 
+    // cpys(s|t) r1 r2 r2
+    assert(MI.getNumOperands() >= 3 &&
+           MI.getOperand(0).isRegister() &&
+           MI.getOperand(1).isRegister() &&
+           MI.getOperand(2).isRegister() &&
+           "invalid Alpha BIS instruction!");
+    if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
+      sourceReg = MI.getOperand(1).getReg();
+      destReg = MI.getOperand(0).getReg();
+      return true;
+    }
+  }
+  return false;
+}
+
+unsigned 
+AlphaInstrInfo::isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  case Alpha::LDL:
+  case Alpha::LDQ:
+  case Alpha::LDBU:
+  case Alpha::LDWU:
+  case Alpha::LDS:
+  case Alpha::LDT:
+    if (MI->getOperand(1).isFrameIndex()) {
+      FrameIndex = MI->getOperand(1).getFrameIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+unsigned 
+AlphaInstrInfo::isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  case Alpha::STL:
+  case Alpha::STQ:
+  case Alpha::STB:
+  case Alpha::STW:
+  case Alpha::STS:
+  case Alpha::STT:
+    if (MI->getOperand(1).isFrameIndex()) {
+      FrameIndex = MI->getOperand(1).getFrameIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+static bool isAlphaIntCondCode(unsigned Opcode) {
+  switch (Opcode) {
+  case Alpha::BEQ: 
+  case Alpha::BNE: 
+  case Alpha::BGE: 
+  case Alpha::BGT: 
+  case Alpha::BLE: 
+  case Alpha::BLT: 
+  case Alpha::BLBC: 
+  case Alpha::BLBS:
+    return true;
+  default:
+    return false;
+  }
+}
+
+unsigned AlphaInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
+                                  MachineBasicBlock *FBB,
+                                  const std::vector<MachineOperand> &Cond)const{
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) && 
+         "Alpha branch conditions have two components!");
+
+  // One-way branch.
+  if (FBB == 0) {
+    if (Cond.empty())   // Unconditional branch
+      BuildMI(&MBB, get(Alpha::BR)).addMBB(TBB);
+    else                // Conditional branch
+      if (isAlphaIntCondCode(Cond[0].getImm()))
+        BuildMI(&MBB, get(Alpha::COND_BRANCH_I))
+          .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+      else
+        BuildMI(&MBB, get(Alpha::COND_BRANCH_F))
+          .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+    return 1;
+  }
+  
+  // Two-way Conditional Branch.
+  if (isAlphaIntCondCode(Cond[0].getImm()))
+    BuildMI(&MBB, get(Alpha::COND_BRANCH_I))
+      .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+  else
+    BuildMI(&MBB, get(Alpha::COND_BRANCH_F))
+      .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+  BuildMI(&MBB, get(Alpha::BR)).addMBB(FBB);
+  return 2;
+}
+
+static unsigned AlphaRevCondCode(unsigned Opcode) {
+  switch (Opcode) {
+  case Alpha::BEQ: return Alpha::BNE;
+  case Alpha::BNE: return Alpha::BEQ;
+  case Alpha::BGE: return Alpha::BLT;
+  case Alpha::BGT: return Alpha::BLE;
+  case Alpha::BLE: return Alpha::BGT;
+  case Alpha::BLT: return Alpha::BGE;
+  case Alpha::BLBC: return Alpha::BLBS;
+  case Alpha::BLBS: return Alpha::BLBC;
+  case Alpha::FBEQ: return Alpha::FBNE;
+  case Alpha::FBNE: return Alpha::FBEQ;
+  case Alpha::FBGE: return Alpha::FBLT;
+  case Alpha::FBGT: return Alpha::FBLE;
+  case Alpha::FBLE: return Alpha::FBGT;
+  case Alpha::FBLT: return Alpha::FBGE;
+  default:
+    assert(0 && "Unknown opcode");
+  }
+}
+
+// Branch analysis.
+bool AlphaInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 std::vector<MachineOperand> &Cond) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+  
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (LastInst->getOpcode() == Alpha::BR) {
+      TBB = LastInst->getOperand(0).getMachineBasicBlock();
+      return false;
+    } else if (LastInst->getOpcode() == Alpha::COND_BRANCH_I ||
+               LastInst->getOpcode() == Alpha::COND_BRANCH_F) {
+      // Block ends with fall-through condbranch.
+      TBB = LastInst->getOperand(2).getMachineBasicBlock();
+      Cond.push_back(LastInst->getOperand(0));
+      Cond.push_back(LastInst->getOperand(1));
+      return false;
+    }
+    // Otherwise, don't know what this is.
+    return true;
+  }
+  
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() &&
+      isUnpredicatedTerminator(--I))
+    return true;
+  
+  // If the block ends with Alpha::BR and Alpha::COND_BRANCH_*, handle it.
+  if ((SecondLastInst->getOpcode() == Alpha::COND_BRANCH_I ||
+      SecondLastInst->getOpcode() == Alpha::COND_BRANCH_F) && 
+      LastInst->getOpcode() == Alpha::BR) {
+    TBB =  SecondLastInst->getOperand(2).getMachineBasicBlock();
+    Cond.push_back(SecondLastInst->getOperand(0));
+    Cond.push_back(SecondLastInst->getOperand(1));
+    FBB = LastInst->getOperand(0).getMachineBasicBlock();
+    return false;
+  }
+  
+  // If the block ends with two Alpha::BRs, handle it.  The second one is not
+  // executed, so remove it.
+  if (SecondLastInst->getOpcode() == Alpha::BR && 
+      LastInst->getOpcode() == Alpha::BR) {
+    TBB = SecondLastInst->getOperand(0).getMachineBasicBlock();
+    I = LastInst;
+    I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+unsigned AlphaInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  if (I->getOpcode() != Alpha::BR && 
+      I->getOpcode() != Alpha::COND_BRANCH_I &&
+      I->getOpcode() != Alpha::COND_BRANCH_F)
+    return 0;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (I->getOpcode() != Alpha::COND_BRANCH_I && 
+      I->getOpcode() != Alpha::COND_BRANCH_F)
+    return 1;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+void AlphaInstrInfo::insertNoop(MachineBasicBlock &MBB, 
+                                MachineBasicBlock::iterator MI) const {
+  BuildMI(MBB, MI, get(Alpha::BISr), Alpha::R31).addReg(Alpha::R31)
+    .addReg(Alpha::R31);
+}
+
+bool AlphaInstrInfo::BlockHasNoFallThrough(MachineBasicBlock &MBB) const {
+  if (MBB.empty()) return false;
+  
+  switch (MBB.back().getOpcode()) {
+  case Alpha::RETDAG: // Return.
+  case Alpha::RETDAGp:
+  case Alpha::BR:     // Uncond branch.
+  case Alpha::JMP:  // Indirect branch.
+    return true;
+  default: return false;
+  }
+}
+bool AlphaInstrInfo::
+ReverseBranchCondition(std::vector<MachineOperand> &Cond) const {
+  assert(Cond.size() == 2 && "Invalid Alpha branch opcode!");
+  Cond[0].setImm(AlphaRevCondCode(Cond[0].getImm()));
+  return false;
+}
+
diff --git a/lib/Target/Alpha/AlphaInstrInfo.h b/lib/Target/Alpha/AlphaInstrInfo.h
new file mode 100644
index 0000000..84009be
--- /dev/null
+++ b/lib/Target/Alpha/AlphaInstrInfo.h
@@ -0,0 +1,57 @@
+//===- AlphaInstrInfo.h - Alpha Instruction Information ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Alpha implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHAINSTRUCTIONINFO_H
+#define ALPHAINSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "AlphaRegisterInfo.h"
+
+namespace llvm {
+
+class AlphaInstrInfo : public TargetInstrInfo {
+  const AlphaRegisterInfo RI;
+public:
+  AlphaInstrInfo();
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const MRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// Return true if the instruction is a register to register move and
+  /// leave the source and dest operands in the passed parameters.
+  ///
+  virtual bool isMoveInstr(const MachineInstr &MI,
+                           unsigned &SrcReg, unsigned &DstReg) const;
+  
+  virtual unsigned isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const;
+  virtual unsigned isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const;
+  
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                            MachineBasicBlock *FBB,
+                            const std::vector<MachineOperand> &Cond) const;
+  bool AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     std::vector<MachineOperand> &Cond) const;
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  void insertNoop(MachineBasicBlock &MBB, 
+                  MachineBasicBlock::iterator MI) const;
+  bool BlockHasNoFallThrough(MachineBasicBlock &MBB) const;
+  bool ReverseBranchCondition(std::vector<MachineOperand> &Cond) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/Alpha/AlphaInstrInfo.td b/lib/Target/Alpha/AlphaInstrInfo.td
new file mode 100644
index 0000000..4a834da
--- /dev/null
+++ b/lib/Target/Alpha/AlphaInstrInfo.td
@@ -0,0 +1,1088 @@
+//===- AlphaInstrInfo.td - The Alpha Instruction Set -------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+include "AlphaInstrFormats.td"
+
+//********************
+//Custom DAG Nodes
+//********************
+
+def SDTFPUnaryOpUnC  : SDTypeProfile<1, 1, [
+  SDTCisFP<1>, SDTCisFP<0>
+]>;
+def Alpha_cvtqt   : SDNode<"AlphaISD::CVTQT_",    SDTFPUnaryOpUnC, []>;
+def Alpha_cvtqs   : SDNode<"AlphaISD::CVTQS_",    SDTFPUnaryOpUnC, []>;
+def Alpha_cvttq   : SDNode<"AlphaISD::CVTTQ_"  ,  SDTFPUnaryOp, []>;
+def Alpha_gprello : SDNode<"AlphaISD::GPRelLo",   SDTIntBinOp, []>;
+def Alpha_gprelhi : SDNode<"AlphaISD::GPRelHi",   SDTIntBinOp, []>;
+def Alpha_rellit  : SDNode<"AlphaISD::RelLit",    SDTIntBinOp, []>;
+
+def retflag       : SDNode<"AlphaISD::RET_FLAG", SDTRet,
+	                   [SDNPHasChain, SDNPOptInFlag]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_AlphaCallSeq : SDTypeProfile<0, 1, [ SDTCisVT<0, i64> ]>;
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_AlphaCallSeq,
+    		           [SDNPHasChain, SDNPOutFlag]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_AlphaCallSeq,
+                           [SDNPHasChain, SDNPOutFlag]>;
+
+//********************
+//Patterns for matching
+//********************
+def invX : SDNodeXForm<imm, [{ //invert
+  return getI64Imm(~N->getValue());
+}]>;
+def negX : SDNodeXForm<imm, [{ //negate
+  return getI64Imm(~N->getValue() + 1);
+}]>;
+def SExt32 : SDNodeXForm<imm, [{ //signed extend int to long
+  return getI64Imm(((int64_t)N->getValue() << 32) >> 32);
+}]>;
+def SExt16 : SDNodeXForm<imm, [{ //signed extend int to long
+  return getI64Imm(((int64_t)N->getValue() << 48) >> 48);
+}]>;
+def LL16 : SDNodeXForm<imm, [{ //lda part of constant
+  return getI64Imm(get_lda16(N->getValue()));
+}]>;
+def LH16 : SDNodeXForm<imm, [{ //ldah part of constant (or more if too big)
+  return getI64Imm(get_ldah16(N->getValue()));
+}]>;
+def iZAPX : SDNodeXForm<and, [{ // get imm to ZAPi
+  ConstantSDNode *RHS = cast<ConstantSDNode>(N->getOperand(1));
+  return getI64Imm(get_zapImm(SDOperand(), RHS->getValue()));
+}]>;
+def nearP2X : SDNodeXForm<imm, [{
+  return getI64Imm(Log2_64(getNearPower2((uint64_t)N->getValue())));
+}]>;
+def nearP2RemX : SDNodeXForm<imm, [{
+  uint64_t x = abs(N->getValue() - getNearPower2((uint64_t)N->getValue()));
+  return getI64Imm(Log2_64(x));
+}]>;
+
+def immUExt8  : PatLeaf<(imm), [{ //imm fits in 8 bit zero extended field
+  return (uint64_t)N->getValue() == (uint8_t)N->getValue();
+}]>;
+def immUExt8inv  : PatLeaf<(imm), [{ //inverted imm fits in 8 bit zero extended field
+  return (uint64_t)~N->getValue() == (uint8_t)~N->getValue();
+}], invX>;
+def immUExt8neg  : PatLeaf<(imm), [{ //negated imm fits in 8 bit zero extended field
+  return ((uint64_t)~N->getValue() + 1) == (uint8_t)((uint64_t)~N->getValue() + 1);
+}], negX>;
+def immSExt16  : PatLeaf<(imm), [{ //imm fits in 16 bit sign extended field
+  return ((int64_t)N->getValue() << 48) >> 48 == (int64_t)N->getValue();
+}]>;
+def immSExt16int  : PatLeaf<(imm), [{ //(int)imm fits in a 16 bit sign extended field
+  return ((int64_t)N->getValue() << 48) >> 48 == ((int64_t)N->getValue() << 32) >> 32;
+}], SExt16>;
+
+def zappat : PatFrag<(ops node:$LHS), (and node:$LHS, imm:$L), [{
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+    uint64_t build = get_zapImm(N->getOperand(0), (uint64_t)RHS->getValue());
+    return build != 0;
+  }
+  return false;
+}]>;
+
+def immFPZ  : PatLeaf<(fpimm), [{ //the only fpconstant nodes are +/- 0.0
+  (void)N; // silence warning.
+  return true;
+}]>;
+
+def immRem1  : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),1, 0);}]>;
+def immRem2  : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),2, 0);}]>;
+def immRem3  : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),3, 0);}]>;
+def immRem4  : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),4, 0);}]>;
+def immRem5  : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),5, 0);}]>;
+def immRem1n : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),1, 1);}]>;
+def immRem2n : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),2, 1);}]>;
+def immRem3n : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),3, 1);}]>;
+def immRem4n : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),4, 1);}]>;
+def immRem5n : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),5, 1);}]>;
+
+def immRemP2n : PatLeaf<(imm), [{
+  return isPowerOf2_64(getNearPower2((uint64_t)N->getValue()) - N->getValue());
+}]>;
+def immRemP2 : PatLeaf<(imm), [{
+  return isPowerOf2_64(N->getValue() - getNearPower2((uint64_t)N->getValue()));
+}]>;
+def immUExt8ME : PatLeaf<(imm), [{ //use this imm for mulqi
+  int64_t d =  abs((int64_t)N->getValue() - (int64_t)getNearPower2((uint64_t)N->getValue()));
+  if (isPowerOf2_64(d)) return false;
+  switch (d) {
+    case 1: case 3: case 5: return false; 
+    default: return (uint64_t)N->getValue() == (uint8_t)N->getValue();
+  };
+}]>;
+
+def intop : PatFrag<(ops node:$op), (sext_inreg node:$op, i32)>;
+def add4  : PatFrag<(ops node:$op1, node:$op2),
+                    (add (shl node:$op1, 2), node:$op2)>;
+def sub4  : PatFrag<(ops node:$op1, node:$op2),
+                    (sub (shl node:$op1, 2), node:$op2)>;
+def add8  : PatFrag<(ops node:$op1, node:$op2),
+                    (add (shl node:$op1, 3), node:$op2)>;
+def sub8  : PatFrag<(ops node:$op1, node:$op2),
+                    (sub (shl node:$op1, 3), node:$op2)>;
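+//Illustrative: these fragments describe the scaled add/subtract shapes, e.g.
+//s4addq computes (Ra << 2) + Rb, so (add (shl a, 2), b) matches it below.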
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class CmpOpFrag<dag res> : PatFrag<(ops node:$R), res>;
+
+//Pseudo ops for selection
+
+def IDEF_I : PseudoInstAlpha<(ops GPRC:$RA), ";#idef $RA",
+             [(set GPRC:$RA, (undef))], s_pseudo>;
+def IDEF_F32 : PseudoInstAlpha<(ops F4RC:$RA), ";#idef $RA",
+             [(set F4RC:$RA, (undef))], s_pseudo>;
+def IDEF_F64 : PseudoInstAlpha<(ops F8RC:$RA), ";#idef $RA",
+             [(set F8RC:$RA, (undef))], s_pseudo>;
+
+def WTF : PseudoInstAlpha<(ops variable_ops), "#wtf", [], s_pseudo>;
+
+let isLoad = 1, hasCtrlDep = 1 in {
+def ADJUSTSTACKUP : PseudoInstAlpha<(ops s64imm:$amt), "; ADJUP $amt", 
+                [(callseq_start imm:$amt)], s_pseudo>, Imp<[R30],[R30]>;
+def ADJUSTSTACKDOWN : PseudoInstAlpha<(ops s64imm:$amt), "; ADJDOWN $amt", 
+                [(callseq_end imm:$amt)], s_pseudo>, Imp<[R30],[R30]>;
+}
+def ALTENT : PseudoInstAlpha<(ops s64imm:$TARGET), "$$$TARGET..ng:\n", [], s_pseudo>;
+def PCLABEL : PseudoInstAlpha<(ops s64imm:$num), "PCMARKER_$num:\n",[], s_pseudo>;
+def MEMLABEL : PseudoInstAlpha<(ops s64imm:$i, s64imm:$j, s64imm:$k, s64imm:$m),
+         "LSMARKER$$$i$$$j$$$k$$$m:", [], s_pseudo>;
+
+
+//***********************
+//Real instructions
+//***********************
+
+//Operation Form:
+
+//conditional moves, int
+
+multiclass cmov_inst<bits<7> fun, string asmstr, PatFrag OpNode> {
+def r : OForm4<0x11, fun, !strconcat(asmstr, " $RCOND,$RTRUE,$RDEST"),
+             [(set GPRC:$RDEST, (select (OpNode GPRC:$RCOND), GPRC:$RTRUE, GPRC:$RFALSE))], s_cmov>;
+def i : OForm4L<0x11, fun, !strconcat(asmstr, " $RCOND,$RTRUE,$RDEST"),
+             [(set GPRC:$RDEST, (select (OpNode GPRC:$RCOND), immUExt8:$RTRUE, GPRC:$RFALSE))], s_cmov>;
+}
+
+defm CMOVEQ  : cmov_inst<0x24, "cmoveq",  CmpOpFrag<(seteq node:$R, 0)>>;
+defm CMOVNE  : cmov_inst<0x26, "cmovne",  CmpOpFrag<(setne node:$R, 0)>>;
+defm CMOVLT  : cmov_inst<0x44, "cmovlt",  CmpOpFrag<(setlt node:$R, 0)>>;
+defm CMOVLE  : cmov_inst<0x64, "cmovle",  CmpOpFrag<(setle node:$R, 0)>>;
+defm CMOVGT  : cmov_inst<0x66, "cmovgt",  CmpOpFrag<(setgt node:$R, 0)>>;
+defm CMOVGE  : cmov_inst<0x46, "cmovge",  CmpOpFrag<(setge node:$R, 0)>>;
+defm CMOVLBC : cmov_inst<0x16, "cmovlbc", CmpOpFrag<(xor   node:$R, 1)>>;
+defm CMOVLBS : cmov_inst<0x14, "cmovlbs", CmpOpFrag<(and   node:$R, 1)>>;
+
+//General pattern for cmov
+def : Pat<(select GPRC:$which, GPRC:$src1, GPRC:$src2),
+      (CMOVNEr GPRC:$src2, GPRC:$src1, GPRC:$which)>;
+def : Pat<(select GPRC:$which, GPRC:$src1, immUExt8:$src2),
+      (CMOVEQi GPRC:$src1, immUExt8:$src2, GPRC:$which)>;
+
+//Invert sense when we can for constants:
+def : Pat<(select (setne GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE),
+          (CMOVEQi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>;
+def : Pat<(select (setgt GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE),
+          (CMOVLEi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>;
+def : Pat<(select (setge GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE),
+          (CMOVLTi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>;
+def : Pat<(select (setlt GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE),
+          (CMOVGEi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>;
+def : Pat<(select (setle GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE),
+          (CMOVGTi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>;
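+//cmov's literal form only accepts the constant in one operand position, so
+//when a select carries the constant on the other side the patterns above flip
+//the comparison (ne->eq, gt->le, ge->lt, lt->ge, le->gt) instead of adding new
+//instruction forms.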
+
+multiclass all_inst<bits<6> opc, bits<7> funl, bits<7> funq, 
+                    string asmstr, PatFrag OpNode, InstrItinClass itin> {
+  def Lr : OForm< opc, funl, !strconcat(asmstr, "l $RA,$RB,$RC"),
+               [(set GPRC:$RC, (intop (OpNode GPRC:$RA, GPRC:$RB)))], itin>;
+  def Li : OFormL<opc, funl, !strconcat(asmstr, "l $RA,$L,$RC"),
+               [(set GPRC:$RC, (intop (OpNode GPRC:$RA, immUExt8:$L)))], itin>;
+  def Qr : OForm< opc, funq, !strconcat(asmstr, "q $RA,$RB,$RC"),
+               [(set GPRC:$RC, (OpNode GPRC:$RA, GPRC:$RB))], itin>;
+  def Qi : OFormL<opc, funq, !strconcat(asmstr, "q $RA,$L,$RC"),
+               [(set GPRC:$RC, (OpNode GPRC:$RA, immUExt8:$L))], itin>;
+}
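+//Each defm below expands to four instructions: longword (l) and quadword (q)
+//forms, each with either a register or an 8-bit literal second operand; the
+//longword results are wrapped in intop (sext_inreg i32) to model 32-bit
+//arithmetic on the 64-bit registers.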
+
+defm MUL   : all_inst<0x13, 0x00, 0x20, "mul",   BinOpFrag<(mul node:$LHS, node:$RHS)>, s_imul>;
+defm ADD   : all_inst<0x10, 0x00, 0x20, "add",   BinOpFrag<(add node:$LHS, node:$RHS)>, s_iadd>;
+defm S4ADD : all_inst<0x10, 0x02, 0x22, "s4add", add4, s_iadd>;
+defm S8ADD : all_inst<0x10, 0x12, 0x32, "s8add", add8, s_iadd>;
+defm S4SUB : all_inst<0x10, 0x0B, 0x2B, "s4sub", sub4, s_iadd>;
+defm S8SUB : all_inst<0x10, 0x1B, 0x3B, "s8sub", sub8, s_iadd>;
+defm SUB   : all_inst<0x10, 0x09, 0x29, "sub",   BinOpFrag<(sub node:$LHS, node:$RHS)>, s_iadd>;
+//Const cases since legalize does sub x, int -> add x, inv(int) + 1
+def : Pat<(intop (add GPRC:$RA, immUExt8neg:$L)), (SUBLi GPRC:$RA, immUExt8neg:$L)>;
+def : Pat<(add GPRC:$RA, immUExt8neg:$L), (SUBQi GPRC:$RA, immUExt8neg:$L)>;
+def : Pat<(intop (add4 GPRC:$RA, immUExt8neg:$L)), (S4SUBLi GPRC:$RA, immUExt8neg:$L)>;
+def : Pat<(add4 GPRC:$RA, immUExt8neg:$L), (S4SUBQi GPRC:$RA, immUExt8neg:$L)>;
+def : Pat<(intop (add8 GPRC:$RA, immUExt8neg:$L)), (S8SUBLi GPRC:$RA, immUExt8neg:$L)>;
+def : Pat<(add8 GPRC:$RA, immUExt8neg:$L), (S8SUBQi GPRC:$RA, immUExt8neg:$L)>;
+
+multiclass log_inst<bits<6> opc, bits<7> fun, string asmstr, SDNode OpNode, InstrItinClass itin> {
+def r : OForm<opc, fun, !strconcat(asmstr, " $RA,$RB,$RC"),
+              [(set GPRC:$RC, (OpNode GPRC:$RA, GPRC:$RB))], itin>;
+def i : OFormL<opc, fun, !strconcat(asmstr, " $RA,$L,$RC"),
+              [(set GPRC:$RC, (OpNode GPRC:$RA, immUExt8:$L))], itin>;
+}
+multiclass inv_inst<bits<6> opc, bits<7> fun, string asmstr, SDNode OpNode, InstrItinClass itin> {
+def r : OForm<opc, fun, !strconcat(asmstr, " $RA,$RB,$RC"),
+              [(set GPRC:$RC, (OpNode GPRC:$RA, (not GPRC:$RB)))], itin>;
+def i : OFormL<opc, fun, !strconcat(asmstr, " $RA,$L,$RC"),
+              [(set GPRC:$RC, (OpNode GPRC:$RA, immUExt8inv:$L))], itin>;
+}
+
+defm AND   : log_inst<0x11, 0x00, "and",   and,   s_ilog>;
+defm BIC   : inv_inst<0x11, 0x08, "bic",   and,   s_ilog>;
+defm BIS   : log_inst<0x11, 0x20, "bis",   or,    s_ilog>;
+defm ORNOT : inv_inst<0x11, 0x28, "ornot", or,    s_ilog>;
+defm XOR   : log_inst<0x11, 0x40, "xor",   xor,   s_ilog>;
+defm EQV   : inv_inst<0x11, 0x48, "eqv",   xor,   s_ilog>;
+
+defm SL    : log_inst<0x12, 0x39, "sll",   shl,   s_ishf>;
+defm SRA   : log_inst<0x12, 0x3c, "sra",   sra,   s_ishf>;
+defm SRL   : log_inst<0x12, 0x34, "srl",   srl,   s_ishf>;
+defm UMULH : log_inst<0x13, 0x30, "umulh", mulhu, s_imul>;
+
+def CTLZ     : OForm2<0x1C, 0x32, "CTLZ $RB,$RC", 
+                      [(set GPRC:$RC, (ctlz GPRC:$RB))], s_imisc>;
+def CTPOP    : OForm2<0x1C, 0x30, "CTPOP $RB,$RC", 
+                      [(set GPRC:$RC, (ctpop GPRC:$RB))], s_imisc>;
+def CTTZ     : OForm2<0x1C, 0x33, "CTTZ $RB,$RC", 
+                      [(set GPRC:$RC, (cttz GPRC:$RB))], s_imisc>;
+def EXTBL    : OForm< 0x12, 0x06, "EXTBL $RA,$RB,$RC", 
+                      [(set GPRC:$RC, (and (srl GPRC:$RA, (shl GPRC:$RB, 3)), 255))], s_ishf>;
+def EXTWL    : OForm< 0x12, 0x16, "EXTWL $RA,$RB,$RC", 
+                      [(set GPRC:$RC, (and (srl GPRC:$RA, (shl GPRC:$RB, 3)), 65535))], s_ishf>;
+def EXTLL    : OForm< 0x12, 0x26, "EXTLL $RA,$RB,$RC", 
+                      [(set GPRC:$RC, (and (srl GPRC:$RA, (shl GPRC:$RB, 3)), 4294967295))], s_ishf>;
+def SEXTB    : OForm2<0x1C, 0x00, "sextb $RB,$RC", 
+                      [(set GPRC:$RC, (sext_inreg GPRC:$RB, i8))], s_ishf>;
+def SEXTW    : OForm2<0x1C, 0x01, "sextw $RB,$RC", 
+                      [(set GPRC:$RC, (sext_inreg GPRC:$RB, i16))], s_ishf>;
+
+//def EXTBLi   : OFormL<0x12, 0x06, "EXTBL $RA,$L,$RC", []>; //Extract byte low
+//def EXTLH    : OForm< 0x12, 0x6A, "EXTLH $RA,$RB,$RC", []>; //Extract longword high
+//def EXTLHi   : OFormL<0x12, 0x6A, "EXTLH $RA,$L,$RC", []>; //Extract longword high
+//def EXTLLi   : OFormL<0x12, 0x26, "EXTLL $RA,$L,$RC", []>; //Extract longword low
+//def EXTQH    : OForm< 0x12, 0x7A, "EXTQH $RA,$RB,$RC", []>; //Extract quadword high
+//def EXTQHi   : OFormL<0x12, 0x7A, "EXTQH $RA,$L,$RC", []>; //Extract quadword high
+//def EXTQ     : OForm< 0x12, 0x36, "EXTQ $RA,$RB,$RC", []>; //Extract quadword low
+//def EXTQi    : OFormL<0x12, 0x36, "EXTQ $RA,$L,$RC", []>; //Extract quadword low
+//def EXTWH    : OForm< 0x12, 0x5A, "EXTWH $RA,$RB,$RC", []>; //Extract word high
+//def EXTWHi   : OFormL<0x12, 0x5A, "EXTWH $RA,$L,$RC", []>; //Extract word high
+//def EXTWLi   : OFormL<0x12, 0x16, "EXTWL $RA,$L,$RC", []>; //Extract word low
+
+//def INSBL    : OForm< 0x12, 0x0B, "INSBL $RA,$RB,$RC", []>; //Insert byte low
+//def INSBLi   : OFormL<0x12, 0x0B, "INSBL $RA,$L,$RC", []>; //Insert byte low
+//def INSLH    : OForm< 0x12, 0x67, "INSLH $RA,$RB,$RC", []>; //Insert longword high
+//def INSLHi   : OFormL<0x12, 0x67, "INSLH $RA,$L,$RC", []>; //Insert longword high
+//def INSLL    : OForm< 0x12, 0x2B, "INSLL $RA,$RB,$RC", []>; //Insert longword low
+//def INSLLi   : OFormL<0x12, 0x2B, "INSLL $RA,$L,$RC", []>; //Insert longword low
+//def INSQH    : OForm< 0x12, 0x77, "INSQH $RA,$RB,$RC", []>; //Insert quadword high
+//def INSQHi   : OFormL<0x12, 0x77, "INSQH $RA,$L,$RC", []>; //Insert quadword high
+//def INSQL    : OForm< 0x12, 0x3B, "INSQL $RA,$RB,$RC", []>; //Insert quadword low
+//def INSQLi   : OFormL<0x12, 0x3B, "INSQL $RA,$L,$RC", []>; //Insert quadword low
+//def INSWH    : OForm< 0x12, 0x57, "INSWH $RA,$RB,$RC", []>; //Insert word high
+//def INSWHi   : OFormL<0x12, 0x57, "INSWH $RA,$L,$RC", []>; //Insert word high
+//def INSWL    : OForm< 0x12, 0x1B, "INSWL $RA,$RB,$RC", []>; //Insert word low
+//def INSWLi   : OFormL<0x12, 0x1B, "INSWL $RA,$L,$RC", []>; //Insert word low
+
+//def MSKBL    : OForm< 0x12, 0x02, "MSKBL $RA,$RB,$RC", []>; //Mask byte low
+//def MSKBLi   : OFormL<0x12, 0x02, "MSKBL $RA,$L,$RC", []>; //Mask byte low
+//def MSKLH    : OForm< 0x12, 0x62, "MSKLH $RA,$RB,$RC", []>; //Mask longword high
+//def MSKLHi   : OFormL<0x12, 0x62, "MSKLH $RA,$L,$RC", []>; //Mask longword high
+//def MSKLL    : OForm< 0x12, 0x22, "MSKLL $RA,$RB,$RC", []>; //Mask longword low
+//def MSKLLi   : OFormL<0x12, 0x22, "MSKLL $RA,$L,$RC", []>; //Mask longword low
+//def MSKQH    : OForm< 0x12, 0x72, "MSKQH $RA,$RB,$RC", []>; //Mask quadword high
+//def MSKQHi   : OFormL<0x12, 0x72, "MSKQH $RA,$L,$RC", []>; //Mask quadword high
+//def MSKQL    : OForm< 0x12, 0x32, "MSKQL $RA,$RB,$RC", []>; //Mask quadword low
+//def MSKQLi   : OFormL<0x12, 0x32, "MSKQL $RA,$L,$RC", []>; //Mask quadword low
+//def MSKWH    : OForm< 0x12, 0x52, "MSKWH $RA,$RB,$RC", []>; //Mask word high
+//def MSKWHi   : OFormL<0x12, 0x52, "MSKWH $RA,$L,$RC", []>; //Mask word high
+//def MSKWL    : OForm< 0x12, 0x12, "MSKWL $RA,$RB,$RC", []>; //Mask word low
+//def MSKWLi   : OFormL<0x12, 0x12, "MSKWL $RA,$L,$RC", []>; //Mask word low
+                      
+def ZAPNOTi  : OFormL<0x12, 0x31, "zapnot $RA,$L,$RC", [], s_ishf>;
+
+// Define the pattern that produces ZAPNOTi.
+def : Pat<(i64 (zappat GPRC:$RA):$imm),
+          (ZAPNOTi GPRC:$RA, (iZAPX GPRC:$imm))>;
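+//zapnot zeroes every byte of $RA whose bit is clear in the 8-bit byte mask;
+//get_zapImm (used by zappat above) decides whether the AND constant is
+//expressible as such a mask, and iZAPX is presumably the transform that
+//extracts the mask for the instruction operand.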
+
+
+//Comparison, int
+//So this is a waste of what this instruction can do, but it still saves something
+def CMPBGE  : OForm< 0x10, 0x0F, "cmpbge $RA,$RB,$RC", 
+                     [(set GPRC:$RC, (setuge (and GPRC:$RA, 255), (and GPRC:$RB, 255)))], s_ilog>;
+def CMPBGEi : OFormL<0x10, 0x0F, "cmpbge $RA,$L,$RC",
+                     [(set GPRC:$RC, (setuge (and GPRC:$RA, 255), immUExt8:$L))], s_ilog>;
+def CMPEQ   : OForm< 0x10, 0x2D, "cmpeq $RA,$RB,$RC", 
+                     [(set GPRC:$RC, (seteq GPRC:$RA, GPRC:$RB))], s_iadd>;
+def CMPEQi  : OFormL<0x10, 0x2D, "cmpeq $RA,$L,$RC", 
+                     [(set GPRC:$RC, (seteq GPRC:$RA, immUExt8:$L))], s_iadd>;
+def CMPLE   : OForm< 0x10, 0x6D, "cmple $RA,$RB,$RC", 
+                     [(set GPRC:$RC, (setle GPRC:$RA, GPRC:$RB))], s_iadd>;
+def CMPLEi  : OFormL<0x10, 0x6D, "cmple $RA,$L,$RC",
+                     [(set GPRC:$RC, (setle GPRC:$RA, immUExt8:$L))], s_iadd>;
+def CMPLT   : OForm< 0x10, 0x4D, "cmplt $RA,$RB,$RC",
+                     [(set GPRC:$RC, (setlt GPRC:$RA, GPRC:$RB))], s_iadd>;
+def CMPLTi  : OFormL<0x10, 0x4D, "cmplt $RA,$L,$RC",
+                     [(set GPRC:$RC, (setlt GPRC:$RA, immUExt8:$L))], s_iadd>;
+def CMPULE  : OForm< 0x10, 0x3D, "cmpule $RA,$RB,$RC",
+                     [(set GPRC:$RC, (setule GPRC:$RA, GPRC:$RB))], s_iadd>;
+def CMPULEi : OFormL<0x10, 0x3D, "cmpule $RA,$L,$RC",
+                     [(set GPRC:$RC, (setule GPRC:$RA, immUExt8:$L))], s_iadd>;
+def CMPULT  : OForm< 0x10, 0x1D, "cmpult $RA,$RB,$RC",
+                     [(set GPRC:$RC, (setult GPRC:$RA, GPRC:$RB))], s_iadd>;
+def CMPULTi : OFormL<0x10, 0x1D, "cmpult $RA,$L,$RC", 
+                      [(set GPRC:$RC, (setult GPRC:$RA, immUExt8:$L))], s_iadd>;
+
+//Patterns for unsupported int comparisons
+def : Pat<(setueq GPRC:$X, GPRC:$Y), (CMPEQ GPRC:$X, GPRC:$Y)>;
+def : Pat<(setueq GPRC:$X, immUExt8:$Y), (CMPEQi GPRC:$X, immUExt8:$Y)>;
+
+def : Pat<(setugt GPRC:$X, GPRC:$Y), (CMPULT GPRC:$Y, GPRC:$X)>;
+def : Pat<(setugt immUExt8:$X, GPRC:$Y), (CMPULTi GPRC:$Y, immUExt8:$X)>;
+
+def : Pat<(setuge GPRC:$X, GPRC:$Y), (CMPULE GPRC:$Y, GPRC:$X)>;
+def : Pat<(setuge immUExt8:$X, GPRC:$Y), (CMPULEi GPRC:$Y, immUExt8:$X)>;
+
+def : Pat<(setgt GPRC:$X, GPRC:$Y), (CMPLT GPRC:$Y, GPRC:$X)>;
+def : Pat<(setgt immUExt8:$X, GPRC:$Y), (CMPLTi GPRC:$Y, immUExt8:$X)>;
+
+def : Pat<(setge GPRC:$X, GPRC:$Y), (CMPLE GPRC:$Y, GPRC:$X)>;
+def : Pat<(setge immUExt8:$X, GPRC:$Y), (CMPLEi GPRC:$Y, immUExt8:$X)>;
+
+def : Pat<(setne GPRC:$X, GPRC:$Y), (CMPEQi (CMPEQ GPRC:$X, GPRC:$Y), 0)>;
+def : Pat<(setne GPRC:$X, immUExt8:$Y), (CMPEQi (CMPEQi GPRC:$X, immUExt8:$Y), 0)>;
+
+def : Pat<(setune GPRC:$X, GPRC:$Y), (CMPEQi (CMPEQ GPRC:$X, GPRC:$Y), 0)>;
+def : Pat<(setune GPRC:$X, immUExt8:$Y), (CMPEQi (CMPEQi GPRC:$X, immUExt8:$Y), 0)>;
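+//Only eq/lt/le/ult/ule (and cmpbge) exist as real compares, so gt/ge and their
+//unsigned forms are synthesized above by swapping operands, and ne by testing
+//the cmpeq result against zero.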
+
+
+let isReturn = 1, isTerminator = 1, noResults = 1, Ra = 31, Rb = 26, disp = 1, Uses = [R26] in {
+  def RETDAG : MbrForm< 0x1A, 0x02, (ops), "ret $$31,($$26),1", s_jsr>; //Return from subroutine
+  def RETDAGp : MbrpForm< 0x1A, 0x02, (ops), "ret $$31,($$26),1", [(retflag)], s_jsr>; //Return from subroutine
+}
+
+let isBranch = 1, isTerminator = 1, noResults = 1, isBarrier = 1,
+Ra = 31, disp = 0 in
+def JMP : MbrpForm< 0x1A, 0x00, (ops GPRC:$RS), "jmp $$31,($RS),0", 
+          [(brind GPRC:$RS)], s_jsr>; //Jump
+
+let isCall = 1, noResults = 1, Ra = 26,
+    Defs = [R0, R1, R2, R3, R4, R5, R6, R7, R8, R16, R17, R18, R19,
+            R20, R21, R22, R23, R24, R25, R26, R27, R28, R29,
+            F0, F1,
+            F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
+            F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30], Uses = [R29] in {
+    def BSR : BFormD<0x34, "bsr $$26,$$$DISP..ng", [], s_jsr>; //Branch to subroutine
+}
+let isCall = 1, noResults = 1, Ra = 26, Rb = 27, disp = 0,
+    Defs = [R0, R1, R2, R3, R4, R5, R6, R7, R8, R16, R17, R18, R19,
+            R20, R21, R22, R23, R24, R25, R26, R27, R28, R29,
+            F0, F1,
+            F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
+            F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30], Uses = [R27, R29] in {
+    def JSR : MbrForm< 0x1A, 0x01, (ops ), "jsr $$26,($$27),0", s_jsr>; //Jump to subroutine
+}
+
+let isCall = 1, noResults = 1, Ra = 23, Rb = 27, disp = 0,
+    Defs = [R23, R24, R25, R27, R28], Uses = [R24, R25, R27] in
+  def JSRs : MbrForm< 0x1A, 0x01, (ops ), "jsr $$23,($$27),0", s_jsr>; //Jump to div or rem
+
+
+def JSR_COROUTINE : MbrForm< 0x1A, 0x03, (ops GPRC:$RD, GPRC:$RS, s14imm:$DISP), "jsr_coroutine $RD,($RS),$DISP", s_jsr>; //Jump to subroutine return
+
+
+let OperandList = (ops GPRC:$RA, s64imm:$DISP, GPRC:$RB) in {
+def LDQ   : MForm<0x29, 0, 1, "ldq $RA,$DISP($RB)",
+                 [(set GPRC:$RA, (load (add GPRC:$RB, immSExt16:$DISP)))], s_ild>;
+def LDQr  : MForm<0x29, 0, 1, "ldq $RA,$DISP($RB)\t\t!gprellow",
+                 [(set GPRC:$RA, (load (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_ild>;
+def LDL   : MForm<0x28, 0, 1, "ldl $RA,$DISP($RB)",
+                 [(set GPRC:$RA, (sextloadi32 (add GPRC:$RB, immSExt16:$DISP)))], s_ild>;
+def LDLr  : MForm<0x28, 0, 1, "ldl $RA,$DISP($RB)\t\t!gprellow",
+                 [(set GPRC:$RA, (sextloadi32 (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_ild>;
+def LDBU  : MForm<0x0A, 0, 1, "ldbu $RA,$DISP($RB)",
+                 [(set GPRC:$RA, (zextloadi8 (add GPRC:$RB, immSExt16:$DISP)))], s_ild>;
+def LDBUr : MForm<0x0A, 0, 1, "ldbu $RA,$DISP($RB)\t\t!gprellow",
+                 [(set GPRC:$RA, (zextloadi8 (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_ild>;
+def LDWU  : MForm<0x0C, 0, 1, "ldwu $RA,$DISP($RB)",
+                 [(set GPRC:$RA, (zextloadi16 (add GPRC:$RB, immSExt16:$DISP)))], s_ild>;
+def LDWUr : MForm<0x0C, 0, 1, "ldwu $RA,$DISP($RB)\t\t!gprellow",
+                 [(set GPRC:$RA, (zextloadi16 (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_ild>;
+
+
+def STB   : MForm<0x0E, 1, 0, "stb $RA,$DISP($RB)",
+		 [(truncstorei8 GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_ist>;
+def STBr  : MForm<0x0E, 1, 0, "stb $RA,$DISP($RB)\t\t!gprellow",
+		 [(truncstorei8 GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_ist>;
+def STW   : MForm<0x0D, 1, 0, "stw $RA,$DISP($RB)",
+		 [(truncstorei16 GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_ist>;
+def STWr  : MForm<0x0D, 1, 0, "stw $RA,$DISP($RB)\t\t!gprellow",
+		 [(truncstorei16 GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_ist>;
+def STL   : MForm<0x2C, 1, 0, "stl $RA,$DISP($RB)",
+		 [(truncstorei32 GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_ist>;
+def STLr  : MForm<0x2C, 1, 0, "stl $RA,$DISP($RB)\t\t!gprellow",
+		 [(truncstorei32 GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_ist>;
+def STQ   : MForm<0x2D, 1, 0, "stq $RA,$DISP($RB)",
+		 [(store GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_ist>;
+def STQr  : MForm<0x2D, 1, 0, "stq $RA,$DISP($RB)\t\t!gprellow",
+		 [(store GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_ist>;
+
+//Load address
+def LDA   : MForm<0x08, 0, 0, "lda $RA,$DISP($RB)",
+                 [(set GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_lda>;
+def LDAr  : MForm<0x08, 0, 0, "lda $RA,$DISP($RB)\t\t!gprellow",
+                 [(set GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_lda>;  //Load address
+def LDAH  : MForm<0x09, 0, 0, "ldah $RA,$DISP($RB)",
+                 [], s_lda>;  //Load address high
+def LDAHr : MForm<0x09, 0, 0, "ldah $RA,$DISP($RB)\t\t!gprelhigh",
+                 [(set GPRC:$RA, (Alpha_gprelhi tglobaladdr:$DISP, GPRC:$RB))], s_lda>;  //Load address high
+}
+
+let OperandList = (ops F4RC:$RA, s64imm:$DISP, GPRC:$RB) in {
+def STS  : MForm<0x26, 1, 0, "sts $RA,$DISP($RB)",
+		[(store F4RC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_fst>;
+def STSr : MForm<0x26, 1, 0, "sts $RA,$DISP($RB)\t\t!gprellow",
+		[(store F4RC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_fst>;
+def LDS  : MForm<0x22, 0, 1, "lds $RA,$DISP($RB)",
+		[(set F4RC:$RA, (load (add GPRC:$RB, immSExt16:$DISP)))], s_fld>;
+def LDSr : MForm<0x22, 0, 1, "lds $RA,$DISP($RB)\t\t!gprellow",
+		[(set F4RC:$RA, (load (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_fld>;
+}
+let OperandList = (ops F8RC:$RA, s64imm:$DISP, GPRC:$RB) in {
+def STT  : MForm<0x27, 1, 0, "stt $RA,$DISP($RB)",
+		[(store F8RC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_fst>;
+def STTr : MForm<0x27, 1, 0, "stt $RA,$DISP($RB)\t\t!gprellow",
+		[(store F8RC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_fst>;
+def LDT  : MForm<0x23, 0, 1, "ldt $RA,$DISP($RB)",
+		[(set F8RC:$RA, (load (add GPRC:$RB, immSExt16:$DISP)))], s_fld>;
+def LDTr : MForm<0x23, 0, 1, "ldt $RA,$DISP($RB)\t\t!gprellow",
+		[(set F8RC:$RA, (load (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_fld>;
+}
+
+
+//constpool rels
+def : Pat<(i64 (load (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+          (LDQr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (sextloadi32 (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+          (LDLr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (zextloadi8 (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+          (LDBUr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (zextloadi16 (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+          (LDWUr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (Alpha_gprello tconstpool:$DISP, GPRC:$RB)),
+          (LDAr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (Alpha_gprelhi tconstpool:$DISP, GPRC:$RB)),
+          (LDAHr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(f32 (load (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+          (LDSr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(f64 (load (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+          (LDTr tconstpool:$DISP, GPRC:$RB)>;
+
+//jumptable rels
+def : Pat<(i64 (Alpha_gprelhi tjumptable:$DISP, GPRC:$RB)),
+          (LDAHr tjumptable:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (Alpha_gprello tjumptable:$DISP, GPRC:$RB)),
+          (LDAr tjumptable:$DISP, GPRC:$RB)>;
+
+
+//misc ext patterns
+def : Pat<(i64 (extloadi8 (add GPRC:$RB, immSExt16:$DISP))),
+          (LDBU   immSExt16:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (extloadi16 (add GPRC:$RB, immSExt16:$DISP))),
+          (LDWU  immSExt16:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (extloadi32 (add GPRC:$RB, immSExt16:$DISP))),
+          (LDL   immSExt16:$DISP, GPRC:$RB)>;
+
+//0 disp patterns
+def : Pat<(i64 (load GPRC:$addr)),
+          (LDQ  0, GPRC:$addr)>;
+def : Pat<(f64 (load GPRC:$addr)),
+          (LDT  0, GPRC:$addr)>;
+def : Pat<(f32 (load GPRC:$addr)),
+          (LDS  0, GPRC:$addr)>;
+def : Pat<(i64 (sextloadi32 GPRC:$addr)),
+          (LDL  0, GPRC:$addr)>;
+def : Pat<(i64 (zextloadi16 GPRC:$addr)),
+          (LDWU 0, GPRC:$addr)>;
+def : Pat<(i64 (zextloadi8 GPRC:$addr)),
+          (LDBU 0, GPRC:$addr)>;
+def : Pat<(i64 (extloadi8 GPRC:$addr)),
+          (LDBU 0, GPRC:$addr)>;
+def : Pat<(i64 (extloadi16 GPRC:$addr)),
+          (LDWU 0, GPRC:$addr)>;
+def : Pat<(i64 (extloadi32 GPRC:$addr)),
+          (LDL  0, GPRC:$addr)>;
+
+def : Pat<(store GPRC:$DATA, GPRC:$addr),
+          (STQ  GPRC:$DATA, 0, GPRC:$addr)>;
+def : Pat<(store F8RC:$DATA, GPRC:$addr),
+          (STT  F8RC:$DATA, 0, GPRC:$addr)>;
+def : Pat<(store F4RC:$DATA, GPRC:$addr),
+          (STS  F4RC:$DATA, 0, GPRC:$addr)>;
+def : Pat<(truncstorei32 GPRC:$DATA, GPRC:$addr),
+          (STL  GPRC:$DATA, 0, GPRC:$addr)>;
+def : Pat<(truncstorei16 GPRC:$DATA, GPRC:$addr),
+          (STW GPRC:$DATA, 0, GPRC:$addr)>;
+def : Pat<(truncstorei8 GPRC:$DATA, GPRC:$addr),
+          (STB GPRC:$DATA, 0, GPRC:$addr)>;
+
+
+//load address, relocated gpdist form
+let OperandList = (ops GPRC:$RA, s16imm:$DISP, GPRC:$RB, s16imm:$NUM) in {
+def LDAg  : MForm<0x08, 0, 1, "lda $RA,0($RB)\t\t!gpdisp!$NUM", [], s_lda>;  //Load address
+def LDAHg : MForm<0x09, 0, 1, "ldah $RA,0($RB)\t\t!gpdisp!$NUM", [], s_lda>;  //Load address
+}
+
+//Load quad, relocated literal form
+let OperandList = (ops GPRC:$RA, s64imm:$DISP, GPRC:$RB) in 
+def LDQl : MForm<0x29, 0, 1, "ldq $RA,$DISP($RB)\t\t!literal",
+                 [(set GPRC:$RA, (Alpha_rellit tglobaladdr:$DISP, GPRC:$RB))], s_ild>;
+def : Pat<(Alpha_rellit texternalsym:$ext, GPRC:$RB),
+          (LDQl texternalsym:$ext, GPRC:$RB)>;
+
+
+def RPCC : MfcForm<0x18, 0xC000, "rpcc $RA", s_rpcc>; //Read process cycle counter
+
+//Basic Floating point ops
+
+//Floats
+
+let OperandList = (ops F4RC:$RC, F4RC:$RB), Fa = 31 in 
+def SQRTS : FPForm<0x14, 0x58B, "sqrts/su $RB,$RC",
+                   [(set F4RC:$RC, (fsqrt F4RC:$RB))], s_fsqrts>;
+
+let OperandList = (ops F4RC:$RC, F4RC:$RA, F4RC:$RB) in {
+def ADDS  : FPForm<0x16, 0x580, "adds/su $RA,$RB,$RC",
+                   [(set F4RC:$RC, (fadd F4RC:$RA, F4RC:$RB))], s_fadd>;
+def SUBS  : FPForm<0x16, 0x581, "subs/su $RA,$RB,$RC",
+                   [(set F4RC:$RC, (fsub F4RC:$RA, F4RC:$RB))], s_fadd>;
+def DIVS  : FPForm<0x16, 0x583, "divs/su $RA,$RB,$RC",
+                   [(set F4RC:$RC, (fdiv F4RC:$RA, F4RC:$RB))], s_fdivs>;
+def MULS  : FPForm<0x16, 0x582, "muls/su $RA,$RB,$RC",
+                   [(set F4RC:$RC, (fmul F4RC:$RA, F4RC:$RB))], s_fmul>;
+
+def CPYSS  : FPForm<0x17, 0x020, "cpys $RA,$RB,$RC",
+                   [(set F4RC:$RC, (fcopysign F4RC:$RB, F4RC:$RA))], s_fadd>;
+def CPYSES : FPForm<0x17, 0x022, "cpyse $RA,$RB,$RC",[], s_fadd>; //Copy sign and exponent
+def CPYSNS : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC",
+                   [(set F4RC:$RC, (fneg (fcopysign F4RC:$RB, F4RC:$RA)))], s_fadd>;
+}
+
+//Doubles
+
+let OperandList = (ops F8RC:$RC, F8RC:$RB), Fa = 31 in 
+def SQRTT : FPForm<0x14, 0x5AB, "sqrtt/su $RB,$RC",
+                   [(set F8RC:$RC, (fsqrt F8RC:$RB))], s_fsqrtt>;
+
+let OperandList = (ops F8RC:$RC, F8RC:$RA, F8RC:$RB) in {
+def ADDT  : FPForm<0x16, 0x5A0, "addt/su $RA,$RB,$RC",
+                   [(set F8RC:$RC, (fadd F8RC:$RA, F8RC:$RB))], s_fadd>;
+def SUBT  : FPForm<0x16, 0x5A1, "subt/su $RA,$RB,$RC",
+                   [(set F8RC:$RC, (fsub F8RC:$RA, F8RC:$RB))], s_fadd>;
+def DIVT  : FPForm<0x16, 0x5A3, "divt/su $RA,$RB,$RC",
+                   [(set F8RC:$RC, (fdiv F8RC:$RA, F8RC:$RB))], s_fdivt>;
+def MULT  : FPForm<0x16, 0x5A2, "mult/su $RA,$RB,$RC",
+                   [(set F8RC:$RC, (fmul F8RC:$RA, F8RC:$RB))], s_fmul>;
+
+def CPYST  : FPForm<0x17, 0x020, "cpys $RA,$RB,$RC",
+                   [(set F8RC:$RC, (fcopysign F8RC:$RB, F8RC:$RA))], s_fadd>;
+def CPYSET : FPForm<0x17, 0x022, "cpyse $RA,$RB,$RC",[], s_fadd>; //Copy sign and exponent
+def CPYSNT : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC",
+                   [(set F8RC:$RC, (fneg (fcopysign F8RC:$RB, F8RC:$RA)))], s_fadd>;
+
+def CMPTEQ : FPForm<0x16, 0x5A5, "cmpteq/su $RA,$RB,$RC", [], s_fadd>;
+//                    [(set F8RC:$RC, (seteq F8RC:$RA, F8RC:$RB))]>;
+def CMPTLE : FPForm<0x16, 0x5A7, "cmptle/su $RA,$RB,$RC", [], s_fadd>;
+//                    [(set F8RC:$RC, (setle F8RC:$RA, F8RC:$RB))]>;
+def CMPTLT : FPForm<0x16, 0x5A6, "cmptlt/su $RA,$RB,$RC", [], s_fadd>;
+//                    [(set F8RC:$RC, (setlt F8RC:$RA, F8RC:$RB))]>;
+def CMPTUN : FPForm<0x16, 0x5A4, "cmptun/su $RA,$RB,$RC", [], s_fadd>;
+//                    [(set F8RC:$RC, (setuo F8RC:$RA, F8RC:$RB))]>;
+}
+
+//More CPYS forms:
+let OperandList = (ops F8RC:$RC, F4RC:$RA, F8RC:$RB) in {
+def CPYSTs  : FPForm<0x17, 0x020, "cpys $RA,$RB,$RC",
+                   [(set F8RC:$RC, (fcopysign F8RC:$RB, F4RC:$RA))], s_fadd>;
+def CPYSNTs : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC",
+                   [(set F8RC:$RC, (fneg (fcopysign F8RC:$RB, F4RC:$RA)))], s_fadd>;
+}
+let OperandList = (ops F4RC:$RC, F8RC:$RA, F4RC:$RB) in {
+def CPYSSt  : FPForm<0x17, 0x020, "cpys $RA,$RB,$RC",
+                   [(set F4RC:$RC, (fcopysign F4RC:$RB, F8RC:$RA))], s_fadd>;
+def CPYSESt : FPForm<0x17, 0x022, "cpyse $RA,$RB,$RC",[], s_fadd>; //Copy sign and exponent
+def CPYSNSt : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC",
+                   [(set F4RC:$RC, (fneg (fcopysign F4RC:$RB, F8RC:$RA)))], s_fadd>;
+}
+
+//conditional moves, floats
+let OperandList = (ops F4RC:$RDEST, F4RC:$RFALSE, F4RC:$RTRUE, F8RC:$RCOND),
+    isTwoAddress = 1 in {
+def FCMOVEQS : FPForm<0x17, 0x02A, "fcmoveq $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if = zero
+def FCMOVGES : FPForm<0x17, 0x02D, "fcmovge $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if >= zero
+def FCMOVGTS : FPForm<0x17, 0x02F, "fcmovgt $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if > zero
+def FCMOVLES : FPForm<0x17, 0x02E, "fcmovle $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if <= zero
+def FCMOVLTS : FPForm<0x17, 0x02C, "fcmovlt $RCOND,$RTRUE,$RDEST",[], s_fcmov>; // FCMOVE if < zero
+def FCMOVNES : FPForm<0x17, 0x02B, "fcmovne $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if != zero
+}
+//conditional moves, doubles
+let OperandList = (ops F8RC:$RDEST, F8RC:$RFALSE, F8RC:$RTRUE, F8RC:$RCOND),
+    isTwoAddress = 1 in {
+def FCMOVEQT : FPForm<0x17, 0x02A, "fcmoveq $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+def FCMOVGET : FPForm<0x17, 0x02D, "fcmovge $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+def FCMOVGTT : FPForm<0x17, 0x02F, "fcmovgt $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+def FCMOVLET : FPForm<0x17, 0x02E, "fcmovle $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+def FCMOVLTT : FPForm<0x17, 0x02C, "fcmovlt $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+def FCMOVNET : FPForm<0x17, 0x02B, "fcmovne $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+}
+
+//misc FP selects
+//Select double
+     
+def : Pat<(select (seteq F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setoeq F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setueq F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setne F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVEQT F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setone F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVEQT F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setune F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVEQT F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setgt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setogt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setugt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+
+def : Pat<(select (setge F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setoge F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setuge F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+
+def : Pat<(select (setlt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setolt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setult F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setle F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setole F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setule F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+
+//Select single
+def : Pat<(select (seteq F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setoeq F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setueq F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setne F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVEQS F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setone F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVEQS F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setune F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVEQS F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setgt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setogt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setugt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+
+def : Pat<(select (setge F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setoge F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setuge F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+
+def : Pat<(select (setlt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setolt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setult F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setle F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setole F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setule F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+
+
+
+let OperandList = (ops GPRC:$RC, F4RC:$RA), Fb = 31 in 
+def FTOIS : FPForm<0x1C, 0x078, "ftois $RA,$RC",[], s_ftoi>; //Floating to integer move, S_floating
+let OperandList = (ops GPRC:$RC, F8RC:$RA), Fb = 31 in 
+def FTOIT : FPForm<0x1C, 0x070, "ftoit $RA,$RC",
+        [(set GPRC:$RC, (bitconvert F8RC:$RA))], s_ftoi>; //Floating to integer move
+let OperandList = (ops F4RC:$RC, GPRC:$RA), Fb = 31 in 
+def ITOFS : FPForm<0x14, 0x004, "itofs $RA,$RC",[], s_itof>; //Integer to floating move, S_floating
+let OperandList = (ops F8RC:$RC, GPRC:$RA), Fb = 31 in 
+def ITOFT : FPForm<0x14, 0x024, "itoft $RA,$RC",
+        [(set F8RC:$RC, (bitconvert GPRC:$RA))], s_itof>; //Integer to floating move
+
+
+let OperandList = (ops F4RC:$RC, F8RC:$RB), Fa = 31 in 
+def CVTQS : FPForm<0x16, 0x7BC, "cvtqs/sui $RB,$RC",
+        [(set F4RC:$RC, (Alpha_cvtqs F8RC:$RB))], s_fadd>;
+let OperandList = (ops F8RC:$RC, F8RC:$RB), Fa = 31 in 
+def CVTQT : FPForm<0x16, 0x7BE, "cvtqt/sui $RB,$RC",
+        [(set F8RC:$RC, (Alpha_cvtqt F8RC:$RB))], s_fadd>;
+let OperandList = (ops F8RC:$RC, F8RC:$RB), Fa = 31 in 
+def CVTTQ : FPForm<0x16, 0x52F, "cvttq/svc $RB,$RC",
+        [(set F8RC:$RC, (Alpha_cvttq F8RC:$RB))], s_fadd>;
+let OperandList = (ops F8RC:$RC, F4RC:$RB), Fa = 31 in 
+def CVTST : FPForm<0x16, 0x6AC, "cvtst/s $RB,$RC",
+                   [(set F8RC:$RC, (fextend F4RC:$RB))], s_fadd>;
+let OperandList = (ops F4RC:$RC, F8RC:$RB), Fa = 31 in 
+def CVTTS : FPForm<0x16, 0x7AC, "cvtts/sui $RB,$RC",
+                   [(set F4RC:$RC, (fround F8RC:$RB))], s_fadd>;
+
+
+/////////////////////////////////////////////////////////
+//Branching
+/////////////////////////////////////////////////////////
+class br_icc<bits<6> opc, string asmstr>
+  : BFormN<opc, (ops u64imm:$opc, GPRC:$R, target:$dst), 
+    !strconcat(asmstr, " $R,$dst"),  s_icbr>;
+class br_fcc<bits<6> opc, string asmstr>
+  : BFormN<opc, (ops u64imm:$opc, F8RC:$R, target:$dst), 
+    !strconcat(asmstr, " $R,$dst"),  s_fbr>;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, noResults = 1 in {
+let Ra = 31 in
+def BR : BFormD<0x30, "br $$31,$DISP", [(br bb:$DISP)], s_ubr>;
+
+def COND_BRANCH_I : BFormN<0, (ops u64imm:$opc, GPRC:$R, target:$dst), 
+                    "{:comment} COND_BRANCH imm:$opc, GPRC:$R, bb:$dst", 
+                    s_icbr>;
+def COND_BRANCH_F : BFormN<0, (ops u64imm:$opc, F8RC:$R, target:$dst), 
+                    "{:comment} COND_BRANCH imm:$opc, F8RC:$R, bb:$dst",
+                    s_fbr>;
+//Branches, int
+def BEQ  : br_icc<0x39, "beq">;
+def BGE  : br_icc<0x3E, "bge">;
+def BGT  : br_icc<0x3F, "bgt">;
+def BLBC : br_icc<0x38, "blbc">;
+def BLBS : br_icc<0x3C, "blbs">;
+def BLE  : br_icc<0x3B, "ble">;
+def BLT  : br_icc<0x3A, "blt">;
+def BNE  : br_icc<0x3D, "bne">;
+
+//Branches, float
+def FBEQ : br_fcc<0x31, "fbeq">;
+def FBGE : br_fcc<0x36, "fbge">;
+def FBGT : br_fcc<0x37, "fbgt">;
+def FBLE : br_fcc<0x33, "fble">;
+def FBLT : br_fcc<0x32, "fblt">;
+def FBNE : br_fcc<0x35, "fbne">;
+}
+
+//An ugly trick to get the opcode as an imm I can use
+def immBRCond : SDNodeXForm<imm, [{
+  switch((uint64_t)N->getValue()) {
+    case 0:  return getI64Imm(Alpha::BEQ);
+    case 1:  return getI64Imm(Alpha::BNE);
+    case 2:  return getI64Imm(Alpha::BGE);
+    case 3:  return getI64Imm(Alpha::BGT);
+    case 4:  return getI64Imm(Alpha::BLE);
+    case 5:  return getI64Imm(Alpha::BLT);
+    case 6:  return getI64Imm(Alpha::BLBS);
+    case 7:  return getI64Imm(Alpha::BLBC);
+    case 20: return getI64Imm(Alpha::FBEQ);
+    case 21: return getI64Imm(Alpha::FBNE);
+    case 22: return getI64Imm(Alpha::FBGE);
+    case 23: return getI64Imm(Alpha::FBGT);
+    case 24: return getI64Imm(Alpha::FBLE);
+    case 25: return getI64Imm(Alpha::FBLT);
+    default: assert(0 && "Unknown branch type");
+  }
+}]>;
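+//The small integers fed to immBRCond in the patterns below (0-7 for integer
+//branches, 20-25 for FP branches) are just indices into the table above; the
+//COND_BRANCH_I/COND_BRANCH_F pseudos carry the selected opcode as an
+//immediate, presumably so a later pass can emit the real branch instruction.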
+
+//Int cond patterns
+def : Pat<(brcond (seteq GPRC:$RA, 0), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 0),  GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setge GPRC:$RA, 0), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 2),  GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setgt GPRC:$RA, 0), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 3),  GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (and   GPRC:$RA, 1), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 6),  GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setle GPRC:$RA, 0), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 4),  GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setlt GPRC:$RA, 0), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 5),  GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setne GPRC:$RA, 0), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 1),  GPRC:$RA, bb:$DISP)>;
+
+def : Pat<(brcond GPRC:$RA, bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 1), GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setne GPRC:$RA, GPRC:$RB), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 0), (CMPEQ GPRC:$RA, GPRC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setne GPRC:$RA, immUExt8:$L), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 0), (CMPEQi GPRC:$RA, immUExt8:$L), bb:$DISP)>;
+
+//FP cond patterns
+def : Pat<(brcond (seteq F8RC:$RA, immFPZ), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 20),  F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setne F8RC:$RA, immFPZ), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21),  F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setge F8RC:$RA, immFPZ), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 22),  F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setgt F8RC:$RA, immFPZ), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 23),  F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setle F8RC:$RA, immFPZ), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 24),  F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setlt F8RC:$RA, immFPZ), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 25),  F8RC:$RA, bb:$DISP)>;
+
+
+def : Pat<(brcond (seteq F8RC:$RA, F8RC:$RB), bb:$DISP),  
+      (COND_BRANCH_F (immBRCond 21), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setoeq F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setueq F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+
+def : Pat<(brcond (setlt F8RC:$RA, F8RC:$RB), bb:$DISP),  
+      (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setolt F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setult F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+
+def : Pat<(brcond (setle F8RC:$RA, F8RC:$RB), bb:$DISP),  
+      (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setole F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setule F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+
+def : Pat<(brcond (setgt F8RC:$RA, F8RC:$RB), bb:$DISP),  
+      (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+def : Pat<(brcond (setogt F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+def : Pat<(brcond (setugt F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+
+def : Pat<(brcond (setge F8RC:$RA, F8RC:$RB), bb:$DISP),  
+      (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+def : Pat<(brcond (setoge F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+def : Pat<(brcond (setuge F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+
+def : Pat<(brcond (setne F8RC:$RA, F8RC:$RB), bb:$DISP),  
+      (COND_BRANCH_F (immBRCond 20), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setone F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 20), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setune F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 20), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+
+
+def : Pat<(brcond (setoeq F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 20), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setueq F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 20), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setoge F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 22), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setuge F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 22), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setogt F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 23), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setugt F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 23), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setole F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 24), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setule F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 24), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setolt F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 25), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setult F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 25), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setone F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 21), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setune F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 21), F8RC:$RA,bb:$DISP)>;
+
+//End Branches
+
+//S_floating : IEEE Single
+//T_floating : IEEE Double
+
+//Unused instructions
+//Mnemonic Format Opcode Description
+//CALL_PAL Pcd 00 Trap to PALcode
+//ECB Mfc 18.E800 Evict cache block
+//EXCB Mfc 18.0400 Exception barrier
+//FETCH Mfc 18.8000 Prefetch data
+//FETCH_M Mfc 18.A000 Prefetch data, modify intent
+//LDL_L Mem 2A Load sign-extended longword locked
+//LDQ_L Mem 2B Load quadword locked
+//LDQ_U Mem 0B Load unaligned quadword
+//MB Mfc 18.4000 Memory barrier
+//STL_C Mem 2E Store longword conditional
+//STQ_C Mem 2F Store quadword conditional
+//STQ_U Mem 0F Store unaligned quadword
+//TRAPB Mfc 18.0000 Trap barrier
+//WH64 Mfc 18.F800 Write hint  64 bytes
+//WMB Mfc 18.4400 Write memory barrier
+//MF_FPCR F-P 17.025 Move from FPCR
+//MT_FPCR F-P 17.024 Move to FPCR
+//These are in the Multimedia extensions, so let's not use them yet
+//def MAXSB8  : OForm<0x1C, 0x3E, "MAXSB8 $RA,$RB,$RC">; //Vector signed byte maximum
+//def MAXSW4 : OForm< 0x1C, 0x3F, "MAXSW4 $RA,$RB,$RC">; //Vector signed word maximum
+//def MAXUB8  : OForm<0x1C, 0x3C, "MAXUB8 $RA,$RB,$RC">; //Vector unsigned byte maximum
+//def MAXUW4 : OForm< 0x1C, 0x3D, "MAXUW4 $RA,$RB,$RC">; //Vector unsigned word maximum
+//def MINSB8 : OForm< 0x1C, 0x38, "MINSB8 $RA,$RB,$RC">; //Vector signed byte minimum
+//def MINSW4 : OForm< 0x1C, 0x39, "MINSW4 $RA,$RB,$RC">; //Vector signed word minimum
+//def MINUB8 : OForm< 0x1C, 0x3A, "MINUB8 $RA,$RB,$RC">; //Vector unsigned byte minimum
+//def MINUW4 : OForm< 0x1C, 0x3B, "MINUW4 $RA,$RB,$RC">; //Vector unsigned word minimum
+//def PERR : OForm< 0x1C, 0x31, "PERR $RA,$RB,$RC">; //Pixel error
+//def PKLB : OForm< 0x1C, 0x37, "PKLB $RA,$RB,$RC">; //Pack longwords to bytes
+//def PKWB  : OForm<0x1C, 0x36, "PKWB $RA,$RB,$RC">; //Pack words to bytes
+//def UNPKBL : OForm< 0x1C, 0x35, "UNPKBL $RA,$RB,$RC">; //Unpack bytes to longwords
+//def UNPKBW : OForm< 0x1C, 0x34, "UNPKBW $RA,$RB,$RC">; //Unpack bytes to words
+//CVTLQ F-P 17.010 Convert longword to quadword
+//CVTQL F-P 17.030 Convert quadword to longword
+
+
+//Constant handling
+
+def immConst2Part  : PatLeaf<(imm), [{
+  //true if imm fits in a LDAH LDA pair
+  int64_t val = (int64_t)N->getValue();
+  return (val <= IMM_FULLHIGH  && val >= IMM_FULLLOW);
+}]>;
+def immConst2PartInt  : PatLeaf<(imm), [{
+  //true if imm fits in a LDAH LDA pair with zeroext
+  uint64_t uval = N->getValue();
+  int32_t val32 = (int32_t)uval;
+  return ((uval >> 32) == 0 && //empty upper bits
+          val32 <= IMM_FULLHIGH);
+//          val32 >= IMM_FULLLOW  + IMM_LOW  * IMM_MULT); //Always True
+}], SExt32>;
+
+def : Pat<(i64 immConst2Part:$imm),
+          (LDA (LL16 immConst2Part:$imm), (LDAH (LH16 immConst2Part:$imm), R31))>;
+
+def : Pat<(i64 immSExt16:$imm),
+          (LDA immSExt16:$imm, R31)>;
+
+def : Pat<(i64 immSExt16int:$imm),
+          (ZAPNOTi (LDA (SExt16 immSExt16int:$imm), R31), 15)>;
+def : Pat<(i64 immConst2PartInt:$imm),
+          (ZAPNOTi (LDA (LL16 (SExt32 immConst2PartInt:$imm)), 
+                        (LDAH (LH16 (SExt32 immConst2PartInt:$imm)), R31)), 15)>;
+
+
+//TODO: I want to just define these like this!
+//def : Pat<(i64 0),
+//          (R31)>;
+//def : Pat<(f64 0.0),
+//          (F31)>;
+//def : Pat<(f64 -0.0),
+//          (CPYSNT F31, F31)>;
+//def : Pat<(f32 0.0),
+//          (F31)>;
+//def : Pat<(f32 -0.0),
+//          (CPYSNS F31, F31)>;
+
+//Misc Patterns:
+
+def : Pat<(sext_inreg GPRC:$RB, i32),
+          (ADDLi GPRC:$RB, 0)>;
+
+def : Pat<(fabs F8RC:$RB),
+          (CPYST F31, F8RC:$RB)>;
+def : Pat<(fabs F4RC:$RB),
+          (CPYSS F31, F4RC:$RB)>;
+def : Pat<(fneg F8RC:$RB),
+          (CPYSNT F8RC:$RB, F8RC:$RB)>;
+def : Pat<(fneg F4RC:$RB),
+          (CPYSNS F4RC:$RB, F4RC:$RB)>;
+
+def : Pat<(fcopysign F4RC:$A, (fneg F4RC:$B)),
+          (CPYSNS F4RC:$B, F4RC:$A)>;
+def : Pat<(fcopysign F8RC:$A, (fneg F8RC:$B)),
+          (CPYSNT F8RC:$B, F8RC:$A)>;
+def : Pat<(fcopysign F4RC:$A, (fneg F8RC:$B)),
+          (CPYSNSt F8RC:$B, F4RC:$A)>;
+def : Pat<(fcopysign F8RC:$A, (fneg F4RC:$B)),
+          (CPYSNTs F4RC:$B, F8RC:$A)>;
+
+//Yes, signed multiply high is ugly
+def : Pat<(mulhs GPRC:$RA, GPRC:$RB),
+          (SUBQr (UMULHr GPRC:$RA, GPRC:$RB), (ADDQr (CMOVGEr GPRC:$RB, R31, GPRC:$RA), 
+                                                     (CMOVGEr GPRC:$RA, R31, GPRC:$RB)))>;
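+//For two's complement values the signed high word is the unsigned high word
+//minus b when a < 0 and minus a when b < 0; the CMOVGEr/R31 operands above
+//supply exactly those "other operand or zero" correction terms.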
+
+//Stupid crazy arithmetic stuff:
+let AddedComplexity = 1 in {
+def : Pat<(mul GPRC:$RA, 5), (S4ADDQr GPRC:$RA, GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, 9), (S8ADDQr GPRC:$RA, GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, 3), (S4SUBQr GPRC:$RA, GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, 7), (S8SUBQr GPRC:$RA, GPRC:$RA)>;
+
+//slight tree expansion if we are multiplying near a power of 2
+//n is above a power of 2
+def : Pat<(mul GPRC:$RA, immRem1:$imm), 
+          (ADDQr (SLr GPRC:$RA, (nearP2X immRem1:$imm)), GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, immRem2:$imm), 
+          (ADDQr (SLr GPRC:$RA, (nearP2X immRem2:$imm)), (ADDQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRem3:$imm),
+          (ADDQr (SLr GPRC:$RA, (nearP2X immRem3:$imm)), (S4SUBQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRem4:$imm),
+          (S4ADDQr GPRC:$RA, (SLr GPRC:$RA, (nearP2X immRem4:$imm)))>;
+def : Pat<(mul GPRC:$RA, immRem5:$imm),
+          (ADDQr (SLr GPRC:$RA, (nearP2X immRem5:$imm)), (S4ADDQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRemP2:$imm),
+          (ADDQr (SLr GPRC:$RA, (nearP2X immRemP2:$imm)), (SLi GPRC:$RA, (nearP2RemX immRemP2:$imm)))>;
+
+//n is below a power of 2
+def : Pat<(mul GPRC:$RA, immRem1n:$imm), 
+          (SUBQr (SLr GPRC:$RA, (nearP2X immRem1n:$imm)), GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, immRem2n:$imm), 
+          (SUBQr (SLr GPRC:$RA, (nearP2X immRem2n:$imm)), (ADDQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRem3n:$imm),
+          (SUBQr (SLr GPRC:$RA, (nearP2X immRem3n:$imm)), (S4SUBQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRem4n:$imm),
+          (SUBQr (SLr GPRC:$RA, (nearP2X immRem4n:$imm)), (SLi GPRC:$RA, 2))>;
+def : Pat<(mul GPRC:$RA, immRem5n:$imm),
+          (SUBQr (SLr GPRC:$RA, (nearP2X immRem5n:$imm)), (S4ADDQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRemP2n:$imm),
+          (SUBQr (SLr GPRC:$RA, (nearP2X immRemP2n:$imm)), (SLi GPRC:$RA, (nearP2RemX immRemP2n:$imm)))>;
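+//e.g. x*17 (17 = 16 + 1, immRem1) becomes (x << 4) + x, and x*15 (15 = 16 - 1,
+//immRem1n) becomes (x << 4) - x, assuming getNearPower2 picks 16 as the
+//nearest power of two in both cases.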
+} //Added complexity
diff --git a/lib/Target/Alpha/AlphaJITInfo.cpp b/lib/Target/Alpha/AlphaJITInfo.cpp
new file mode 100644
index 0000000..669a2d5
--- /dev/null
+++ b/lib/Target/Alpha/AlphaJITInfo.cpp
@@ -0,0 +1,305 @@
+//===-- AlphaJITInfo.cpp - Implement the JIT interfaces for the Alpha ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the Alpha target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "AlphaJITInfo.h"
+#include "AlphaRelocations.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/Config/alloca.h"
+#include "llvm/Support/Debug.h"
+#include <cstdlib>
+#include <map>
+using namespace llvm;
+
+#define BUILD_OFormatI(Op, RA, LIT, FUN, RC) \
+  ((Op << 26) | (RA << 21) | (LIT << 13) | (1 << 12) | (FUN << 5) | (RC))
+#define BUILD_OFormat(Op, RA, RB, FUN, RC) \
+  ((Op << 26) | (RA << 21) | (RB << 16) | (FUN << 5) | (RC))
+
+#define BUILD_LDA(RD, RS, IMM16) \
+  ((0x08 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535))
+#define BUILD_LDAH(RD, RS, IMM16) \
+  ((0x09 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535))
+
+#define BUILD_LDQ(RD, RS, IMM16) \
+  ((0x29 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 0xFFFF))
+
+#define BUILD_JMP(RD, RS, IMM16) \
+  ((0x1A << 26) | ((RD) << 21) | ((RS) << 16) | (0x00 << 14) | ((IMM16) & 0x3FFF))
+#define BUILD_JSR(RD, RS, IMM16) \
+  ((0x1A << 26) | ((RD) << 21) | ((RS) << 16) | (0x01 << 14) | ((IMM16) & 0x3FFF))
+
+#define BUILD_SLLi(RD, RS, IMM8) \
+  (BUILD_OFormatI(0x12, RS, IMM8, 0x39, RD))
+
+#define BUILD_ORi(RD, RS, IMM8) \
+  (BUILD_OFormatI(0x11, RS, IMM8, 0x20, RD))
+
+#define BUILD_OR(RD, RS, RT) \
+  (BUILD_OFormat(0x11, RS, RT, 0x20, RD))
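+// These macros hand-assemble Alpha instruction words: the opcode sits in bits
+// 31-26 and Ra in 25-21; the operate forms put Rb in 20-16 (or an 8-bit
+// literal in 20-13 with bit 12 set), the function code in 11-5 and Rc in 4-0,
+// while the memory and branch forms keep a 16- or 14-bit displacement in the
+// low bits.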
+
+
+
+static void EmitBranchToAt(void *At, void *To) {
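+  // Builds a 19-word stub: materialize the 64-bit target in $27 (pv) one byte
+  // at a time with eight sll/or pairs, jump through $27, and leave the
+  // 0x00FFFFFF marker in word 18 so the compilation callback can recognize
+  // and later rewrite the stub.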
+  unsigned long Fn = (unsigned long)To;
+
+  unsigned *AtI = (unsigned*)At;
+
+  AtI[0] = BUILD_OR(0, 27, 27);
+
+  DOUT << "Stub targeting " << To << "\n";
+
+  for (int x = 1; x <= 8; ++x) {
+    AtI[2*x - 1] = BUILD_SLLi(27,27,8);
+    unsigned d = (Fn >> (64 - 8 * x)) & 0x00FF;
+    //DOUT << "outputing " << hex << d << dec << "\n";
+    AtI[2*x] = BUILD_ORi(27, 27, d);
+  }
+  AtI[17] = BUILD_JMP(31,27,0); //jump, preserving ra, and setting pv
+  AtI[18] = 0x00FFFFFF; //mark this as a stub
+}
+
+void AlphaJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  //FIXME
+  assert(0);
+}
+
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+//static AlphaJITInfo* AlphaJTI;
+
+extern "C" {
+#ifdef __alpha
+
+  void AlphaCompilationCallbackC(long* oldpv, void* CameFromStub)
+  {
+    void* Target = JITCompilerFunction(CameFromStub);
+
+    //rewrite the stub to an unconditional branch
+    if (((unsigned*)CameFromStub)[18] == 0x00FFFFFF) {
+      DOUT << "Came from a stub, rewriting\n";
+      EmitBranchToAt(CameFromStub, Target);
+    } else {
+      DOUT << "confused, didn't come from stub at " << CameFromStub
+           << " old jump vector " << oldpv
+           << " new jump vector " << Target << "\n";
+    }
+
+    //Change pv to new Target
+    *oldpv = (long)Target;
+  }
+
+  void AlphaCompilationCallback(void);
+
+  asm(
+      ".text\n"
+      ".globl AlphaComilationCallbackC\n"
+      ".align 4\n"
+      ".globl AlphaCompilationCallback\n"
+      ".ent AlphaCompilationCallback\n"
+"AlphaCompilationCallback:\n"
+      //      //get JIT's GOT
+      "ldgp $29, 0($27)\n"
+      //Save args, callee saved, and perhaps others?
+      //args: $16-$21 $f16-$f21     (12)
+      //callee: $9-$14 $f2-$f9      (14)
+      //others: fp:$15 ra:$26 pv:$27 (3)
+      "lda $30, -232($30)\n"
+      "stq $16,   0($30)\n"
+      "stq $17,   8($30)\n"
+      "stq $18,  16($30)\n"
+      "stq $19,  24($30)\n"
+      "stq $20,  32($30)\n"
+      "stq $21,  40($30)\n"
+      "stt $f16, 48($30)\n"
+      "stt $f17, 56($30)\n"
+      "stt $f18, 64($30)\n"
+      "stt $f19, 72($30)\n"
+      "stt $f20, 80($30)\n"
+      "stt $f21, 88($30)\n"
+      "stq $9,   96($30)\n"
+      "stq $10, 104($30)\n"
+      "stq $11, 112($30)\n"
+      "stq $12, 120($30)\n"
+      "stq $13, 128($30)\n"
+      "stq $14, 136($30)\n"
+      "stt $f2, 144($30)\n"
+      "stt $f3, 152($30)\n"
+      "stt $f4, 160($30)\n"
+      "stt $f5, 168($30)\n"
+      "stt $f6, 176($30)\n"
+      "stt $f7, 184($30)\n"
+      "stt $f8, 192($30)\n"
+      "stt $f9, 200($30)\n"
+      "stq $15, 208($30)\n"
+      "stq $26, 216($30)\n"
+      "stq $27, 224($30)\n"
+
+      "addq $30, 224, $16\n" //pass the addr of saved pv as the first arg
+      "bis $0, $0, $17\n" //pass the roughly stub addr in second arg
+      "jsr $26, AlphaCompilationCallbackC\n" //call without saving ra
+
+      "ldq $16,   0($30)\n"
+      "ldq $17,   8($30)\n"
+      "ldq $18,  16($30)\n"
+      "ldq $19,  24($30)\n"
+      "ldq $20,  32($30)\n"
+      "ldq $21,  40($30)\n"
+      "ldt $f16, 48($30)\n"
+      "ldt $f17, 56($30)\n"
+      "ldt $f18, 64($30)\n"
+      "ldt $f19, 72($30)\n"
+      "ldt $f20, 80($30)\n"
+      "ldt $f21, 88($30)\n"
+      "ldq $9,   96($30)\n"
+      "ldq $10, 104($30)\n"
+      "ldq $11, 112($30)\n"
+      "ldq $12, 120($30)\n"
+      "ldq $13, 128($30)\n"
+      "ldq $14, 136($30)\n"
+      "ldt $f2, 144($30)\n"
+      "ldt $f3, 152($30)\n"
+      "ldt $f4, 160($30)\n"
+      "ldt $f5, 168($30)\n"
+      "ldt $f6, 176($30)\n"
+      "ldt $f7, 184($30)\n"
+      "ldt $f8, 192($30)\n"
+      "ldt $f9, 200($30)\n"
+      "ldq $15, 208($30)\n"
+      "ldq $26, 216($30)\n"
+      "ldq $27, 224($30)\n" //this was updated in the callback with the target
+
+      "lda $30, 232($30)\n" //restore sp
+      "jmp $31, ($27)\n" //jump to the new function
+      ".end AlphaCompilationCallback\n"
+      );
+#else
+  void AlphaCompilationCallback() {
+    cerr << "Cannot call AlphaCompilationCallback() on a non-Alpha arch!\n";
+    abort();
+  }
+#endif
+}
+
+void *AlphaJITInfo::emitFunctionStub(void *Fn, MachineCodeEmitter &MCE) {
+  //assert(Fn == AlphaCompilationCallback && "Where are you going?\n");
+  //Do things in a stupid slow way!
+  MCE.startFunctionStub(19*4);
+  void* Addr = (void*)(intptr_t)MCE.getCurrentPCValue();
+  for (int x = 0; x < 19; ++ x)
+    MCE.emitWordLE(0);
+  EmitBranchToAt(Addr, Fn);
+  DOUT << "Emitting Stub to " << Fn << " at [" << Addr << "]\n";
+  return MCE.finishFunctionStub(0);
+}
+
+TargetJITInfo::LazyResolverFn
+AlphaJITInfo::getLazyResolverFunction(JITCompilerFn F) {
+  JITCompilerFunction = F;
+  //  setZerothGOTEntry((void*)AlphaCompilationCallback);
+  return AlphaCompilationCallback;
+}
+
+//These describe LDAx
+static const int IMM_LOW  = -32768;
+static const int IMM_HIGH = 32767;
+static const int IMM_MULT = 65536;
+
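+// getUpper16/getLower16 split a displacement l into a pair such that
+// upper * IMM_MULT + lower == l with lower in [IMM_LOW, IMM_HIGH]; the two
+// halves feed an LDAH/LDA sequence.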
+static long getUpper16(long l)
+{
+  long y = l / IMM_MULT;
+  if (l % IMM_MULT > IMM_HIGH)
+    ++y;
+  if (l % IMM_MULT < IMM_LOW)
+    --y;
+  assert((short)y == y && "displacement out of range");
+  return y;
+}
+
+static long getLower16(long l)
+{
+  long h = getUpper16(l);
+  long y = l - h * IMM_MULT;
+  assert(y == (short)y && "Displacement out of range");
+  return y;
+}
+
+void AlphaJITInfo::relocate(void *Function, MachineRelocation *MR,
+                            unsigned NumRelocs, unsigned char* GOTBase) {
+  //because gpdist are paired and relative to the pc of the first inst,
+  //we need to have some state
+
+  static std::map<std::pair<void*, int>, void*> gpdistmap;
+
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    unsigned *RelocPos = (unsigned*)Function + MR->getMachineCodeOffset()/4;
+    long idx = 0;
+    bool doCommon = true;
+    switch ((Alpha::RelocationType)MR->getRelocationType()) {
+    default: assert(0 && "Unknown relocation type!");
+    case Alpha::reloc_literal:
+      //This is a LDQl
+      idx = MR->getGOTIndex();
+      DOUT << "Literal relocation to slot " << idx;
+      idx = (idx - GOToffset) * 8;
+      DOUT << " offset " << idx << "\n";
+      break;
+    case Alpha::reloc_gprellow:
+      idx = (unsigned char*)MR->getResultPointer() - &GOTBase[GOToffset * 8];
+      idx = getLower16(idx);
+      DOUT << "gprellow relocation offset " << idx << "\n";
+      DOUT << " Pointer is " << (void*)MR->getResultPointer()
+           << " GOT is " << (void*)&GOTBase[GOToffset * 8] << "\n";
+      break;
+    case Alpha::reloc_gprelhigh:
+      idx = (unsigned char*)MR->getResultPointer() - &GOTBase[GOToffset * 8];
+      idx = getUpper16(idx);
+      DOUT << "gprelhigh relocation offset " << idx << "\n";
+      DOUT << " Pointer is " << (void*)MR->getResultPointer()
+           << " GOT is " << (void*)&GOTBase[GOToffset * 8] << "\n";
+      break;
+    case Alpha::reloc_gpdist:
+      switch (*RelocPos >> 26) {
+      case 0x09: //LDAH
+        idx = &GOTBase[GOToffset * 8] - (unsigned char*)RelocPos;
+        idx = getUpper16(idx);
+        DOUT << "LDAH: " << idx << "\n";
+        //add the relocation to the map
+        gpdistmap[std::make_pair(Function, MR->getConstantVal())] = RelocPos;
+        break;
+      case 0x08: //LDA
+        assert(gpdistmap[std::make_pair(Function, MR->getConstantVal())] &&
+               "LDAg without seeing LDAHg");
+        idx = &GOTBase[GOToffset * 8] -
+          (unsigned char*)gpdistmap[std::make_pair(Function, MR->getConstantVal())];
+        idx = getLower16(idx);
+        DOUT << "LDA: " << idx << "\n";
+        break;
+      default:
+        assert(0 && "Cannot handle gpdist yet");
+      }
+      break;
+    case Alpha::reloc_bsr: {
+      idx = (((unsigned char*)MR->getResultPointer() -
+             (unsigned char*)RelocPos) >> 2) + 1; //skip first 2 inst of fun
+      *RelocPos |= (idx & ((1 << 21)-1));
+      doCommon = false;
+      break;
+    }
+    }
+    if (doCommon) {
+      short x = (short)idx;
+      assert(x == idx);
+      *(short*)RelocPos = x;
+    }
+  }
+}
diff --git a/lib/Target/Alpha/AlphaJITInfo.h b/lib/Target/Alpha/AlphaJITInfo.h
new file mode 100644
index 0000000..26c45b1
--- /dev/null
+++ b/lib/Target/Alpha/AlphaJITInfo.h
@@ -0,0 +1,49 @@
+//===- AlphaJITInfo.h - Alpha impl. of the JIT interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Alpha implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHA_JITINFO_H
+#define ALPHA_JITINFO_H
+
+#include "llvm/Target/TargetJITInfo.h"
+#include "llvm/GlobalValue.h"
+#include <string>
+#include <map>
+
+namespace llvm {
+  class TargetMachine;
+
+  class AlphaJITInfo : public TargetJITInfo {
+  protected:
+    TargetMachine &TM;
+  public:
+    AlphaJITInfo(TargetMachine &tm) : TM(tm)
+    { useGOT = true; }
+
+    virtual void *emitFunctionStub(void *Fn, MachineCodeEmitter &MCE);
+    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+    virtual void relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase);
+
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+    /// code.
+    ///
+    virtual void replaceMachineCodeForFunction(void *Old, void *New);
+  private:
+    static const unsigned GOToffset = 4096;
+
+  };
+}
+
+#endif
diff --git a/lib/Target/Alpha/AlphaLLRP.cpp b/lib/Target/Alpha/AlphaLLRP.cpp
new file mode 100644
index 0000000..27c2738
--- /dev/null
+++ b/lib/Target/Alpha/AlphaLLRP.cpp
@@ -0,0 +1,162 @@
+//===-- AlphaLLRP.cpp - Alpha Load Load Replay Trap elimination pass. -- --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Andrew Lenharth and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Here we check for potential replay traps introduced by the spiller.
+// We also align some branch targets if we can do so for free.
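+// (On the 21264, memory operations to the same address that issue close
+// together can trigger replay traps, forcing the pipeline to re-execute; the
+// spiller tends to create such back-to-back stack accesses, so we space them
+// apart with NOPs.)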
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "alpha-nops"
+#include "Alpha.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+STATISTIC(nopintro, "Number of nops inserted");
+STATISTIC(nopalign, "Number of nops inserted for alignment");
+
+namespace {
+  cl::opt<bool>
+  AlignAll("alpha-align-all", cl::Hidden,
+                   cl::desc("Align all blocks"));
+
+  struct AlphaLLRPPass : public MachineFunctionPass {
+    /// Target machine description which we query for reg. names, data
+    /// layout, etc.
+    ///
+    AlphaTargetMachine &TM;
+
+    static char ID;
+    AlphaLLRPPass(AlphaTargetMachine &tm) 
+      : MachineFunctionPass((intptr_t)&ID), TM(tm) { }
+
+    virtual const char *getPassName() const {
+      return "Alpha NOP inserter";
+    }
+
+    bool runOnMachineFunction(MachineFunction &F) {
+      const TargetInstrInfo *TII = F.getTarget().getInstrInfo();
+      bool Changed = false;
+      MachineInstr* prev[3] = {0,0,0};
+      unsigned count = 0;
+      for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+           FI != FE; ++FI) {
+        MachineBasicBlock& MBB = *FI;
+        bool ub = false;
+        for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+          if (count%4 == 0)
+            prev[0] = prev[1] = prev[2] = 0; //Slots cleared at fetch boundary
+          ++count;
+          MachineInstr *MI = I++;
+          switch (MI->getOpcode()) {
+          case Alpha::LDQ:  case Alpha::LDL:
+          case Alpha::LDWU: case Alpha::LDBU:
+          case Alpha::LDT: case Alpha::LDS:
+          case Alpha::STQ:  case Alpha::STL:
+          case Alpha::STW:  case Alpha::STB:
+          case Alpha::STT: case Alpha::STS:
+           if (MI->getOperand(2).getReg() == Alpha::R30) {
+             if (prev[0] 
+                 && prev[0]->getOperand(2).getReg() == 
+                 MI->getOperand(2).getReg()
+                 && prev[0]->getOperand(1).getImmedValue() == 
+                 MI->getOperand(1).getImmedValue()) {
+               prev[0] = prev[1];
+               prev[1] = prev[2];
+               prev[2] = 0;
+               BuildMI(MBB, MI, TII->get(Alpha::BISr), Alpha::R31)
+                 .addReg(Alpha::R31)
+                 .addReg(Alpha::R31); 
+               Changed = true; nopintro += 1;
+               count += 1;
+             } else if (prev[1] 
+                        && prev[1]->getOperand(2).getReg() == 
+                        MI->getOperand(2).getReg()
+                        && prev[1]->getOperand(1).getImmedValue() == 
+                        MI->getOperand(1).getImmedValue()) {
+               prev[0] = prev[2];
+               prev[1] = prev[2] = 0;
+               BuildMI(MBB, MI, TII->get(Alpha::BISr), Alpha::R31)
+                 .addReg(Alpha::R31)
+                 .addReg(Alpha::R31); 
+               BuildMI(MBB, MI, TII->get(Alpha::BISr), Alpha::R31)
+                 .addReg(Alpha::R31)
+                 .addReg(Alpha::R31);
+               Changed = true; nopintro += 2;
+               count += 2;
+             } else if (prev[2] 
+                        && prev[2]->getOperand(2).getReg() == 
+                        MI->getOperand(2).getReg()
+                        && prev[2]->getOperand(1).getImmedValue() == 
+                        MI->getOperand(1).getImmedValue()) {
+               prev[0] = prev[1] = prev[2] = 0;
+               BuildMI(MBB, MI, TII->get(Alpha::BISr), Alpha::R31).addReg(Alpha::R31)
+                 .addReg(Alpha::R31);
+               BuildMI(MBB, MI, TII->get(Alpha::BISr), Alpha::R31).addReg(Alpha::R31)
+                 .addReg(Alpha::R31);
+               BuildMI(MBB, MI, TII->get(Alpha::BISr), Alpha::R31).addReg(Alpha::R31)
+                 .addReg(Alpha::R31);
+               Changed = true; nopintro += 3;
+               count += 3;
+             }
+             prev[0] = prev[1];
+             prev[1] = prev[2];
+             prev[2] = MI;
+             break;
+           }
+           prev[0] = prev[1];
+           prev[1] = prev[2];
+           prev[2] = 0;
+           break;
+          case Alpha::ALTENT:
+          case Alpha::MEMLABEL:
+          case Alpha::PCLABEL:
+          case Alpha::IDEF_I:
+          case Alpha::IDEF_F32:
+          case Alpha::IDEF_F64:
+            --count;
+            break;
+          case Alpha::BR:
+          case Alpha::JMP:
+            ub = true;
+            //fall through
+          default:
+            prev[0] = prev[1];
+            prev[1] = prev[2];
+            prev[2] = 0;
+            break;
+          }
+        }
+        if (ub || AlignAll) {
+          //we can align stuff for free at this point
+          while (count % 4) {
+            BuildMI(MBB, MBB.end(), TII->get(Alpha::BISr), Alpha::R31)
+              .addReg(Alpha::R31).addReg(Alpha::R31);
+            ++count;
+            ++nopalign;
+            prev[0] = prev[1];
+            prev[1] = prev[2];
+            prev[2] = 0;
+          }
+        }
+      }
+      return Changed;
+    }
+  };
+  char AlphaLLRPPass::ID = 0;
+} // end of anonymous namespace
+
+FunctionPass *llvm::createAlphaLLRPPass(AlphaTargetMachine &tm) {
+  return new AlphaLLRPPass(tm);
+}
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.cpp b/lib/Target/Alpha/AlphaRegisterInfo.cpp
new file mode 100644
index 0000000..59d3e81
--- /dev/null
+++ b/lib/Target/Alpha/AlphaRegisterInfo.cpp
@@ -0,0 +1,433 @@
+//===- AlphaRegisterInfo.cpp - Alpha Register Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Alpha implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reginfo"
+#include "Alpha.h"
+#include "AlphaRegisterInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Type.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include <cstdlib>
+using namespace llvm;
+
+//These describe LDAx
+static const int IMM_LOW  = -32768;
+static const int IMM_HIGH = 32767;
+static const int IMM_MULT = 65536;
+
+static long getUpper16(long l)
+{
+  long y = l / IMM_MULT;
+  if (l % IMM_MULT > IMM_HIGH)
+    ++y;
+  if (l % IMM_MULT < IMM_LOW)   // keep the low half in range for negative l
+    --y;
+  return y;
+}
+
+static long getLower16(long l)
+{
+  long h = getUpper16(l);
+  return l - h * IMM_MULT;
+}
+
+AlphaRegisterInfo::AlphaRegisterInfo(const TargetInstrInfo &tii)
+  : AlphaGenRegisterInfo(Alpha::ADJUSTSTACKDOWN, Alpha::ADJUSTSTACKUP),
+    TII(tii)
+{
+}
+
+void
+AlphaRegisterInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MI,
+                                       unsigned SrcReg, int FrameIdx,
+                                       const TargetRegisterClass *RC) const {
+  //cerr << "Trying to store " << getPrettyName(SrcReg) << " to "
+  //     << FrameIdx << "\n";
+  //BuildMI(MBB, MI, Alpha::WTF, 0).addReg(SrcReg);
+  if (RC == Alpha::F4RCRegisterClass)
+    BuildMI(MBB, MI, TII.get(Alpha::STS))
+      .addReg(SrcReg, false, false, true)
+      .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+  else if (RC == Alpha::F8RCRegisterClass)
+    BuildMI(MBB, MI, TII.get(Alpha::STT))
+      .addReg(SrcReg, false, false, true)
+      .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+  else if (RC == Alpha::GPRCRegisterClass)
+    BuildMI(MBB, MI, TII.get(Alpha::STQ))
+      .addReg(SrcReg, false, false, true)
+      .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+  else
+    abort();
+}
+
+void
+AlphaRegisterInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        unsigned DestReg, int FrameIdx,
+                                        const TargetRegisterClass *RC) const {
+  //cerr << "Trying to load " << getPrettyName(DestReg) << " to "
+  //     << FrameIdx << "\n";
+  if (RC == Alpha::F4RCRegisterClass)
+    BuildMI(MBB, MI, TII.get(Alpha::LDS), DestReg)
+      .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+  else if (RC == Alpha::F8RCRegisterClass)
+    BuildMI(MBB, MI, TII.get(Alpha::LDT), DestReg)
+      .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+  else if (RC == Alpha::GPRCRegisterClass)
+    BuildMI(MBB, MI, TII.get(Alpha::LDQ), DestReg)
+      .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+  else
+    abort();
+}
+
+MachineInstr *AlphaRegisterInfo::foldMemoryOperand(MachineInstr *MI,
+                                                 unsigned OpNum,
+                                                 int FrameIndex) const {
+   // Make sure this is a reg-reg copy.
+   unsigned Opc = MI->getOpcode();
+
+   MachineInstr *NewMI = NULL;
+   switch(Opc) {
+   default:
+     break;
+   case Alpha::BISr:
+   case Alpha::CPYSS:
+   case Alpha::CPYST:
+     if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
+       if (OpNum == 0) {  // move -> store
+         unsigned InReg = MI->getOperand(1).getReg();
+         Opc = (Opc == Alpha::BISr) ? Alpha::STQ : 
+           ((Opc == Alpha::CPYSS) ? Alpha::STS : Alpha::STT);
+         NewMI = BuildMI(TII.get(Opc)).addReg(InReg).addFrameIndex(FrameIndex)
+           .addReg(Alpha::F31);
+       } else {           // load -> move
+         unsigned OutReg = MI->getOperand(0).getReg();
+         Opc = (Opc == Alpha::BISr) ? Alpha::LDQ : 
+           ((Opc == Alpha::CPYSS) ? Alpha::LDS : Alpha::LDT);
+         NewMI = BuildMI(TII.get(Opc), OutReg).addFrameIndex(FrameIndex)
+           .addReg(Alpha::F31);
+       }
+     }
+     break;
+   }
+  if (NewMI)
+    NewMI->copyKillDeadInfo(MI);
+  return NewMI;
+}
+
+
+void AlphaRegisterInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI,
+                                     unsigned DestReg, unsigned SrcReg,
+                                     const TargetRegisterClass *RC) const {
+  //cerr << "copyRegToReg " << DestReg << " <- " << SrcReg << "\n";
+  if (RC == Alpha::GPRCRegisterClass) {
+    BuildMI(MBB, MI, TII.get(Alpha::BISr), DestReg).addReg(SrcReg).addReg(SrcReg);
+  } else if (RC == Alpha::F4RCRegisterClass) {
+    BuildMI(MBB, MI, TII.get(Alpha::CPYSS), DestReg).addReg(SrcReg).addReg(SrcReg);
+  } else if (RC == Alpha::F8RCRegisterClass) {
+    BuildMI(MBB, MI, TII.get(Alpha::CPYST), DestReg).addReg(SrcReg).addReg(SrcReg);
+  } else {
+    cerr << "Attempt to copy register that is not GPR or FPR";
+    abort();
+  }
+}
+
+void AlphaRegisterInfo::reMaterialize(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator I,
+                                      unsigned DestReg,
+                                      const MachineInstr *Orig) const {
+  MachineInstr *MI = Orig->clone();
+  MI->getOperand(0).setReg(DestReg);
+  MBB.insert(I, MI);
+}
+
+const unsigned* AlphaRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+                                                                         const {
+  static const unsigned CalleeSavedRegs[] = {
+    Alpha::R9, Alpha::R10,
+    Alpha::R11, Alpha::R12,
+    Alpha::R13, Alpha::R14,
+    Alpha::F2, Alpha::F3,
+    Alpha::F4, Alpha::F5,
+    Alpha::F6, Alpha::F7,
+    Alpha::F8, Alpha::F9,  0
+  };
+  return CalleeSavedRegs;
+}
+
+const TargetRegisterClass* const*
+AlphaRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+    &Alpha::GPRCRegClass, &Alpha::GPRCRegClass,
+    &Alpha::GPRCRegClass, &Alpha::GPRCRegClass,
+    &Alpha::GPRCRegClass, &Alpha::GPRCRegClass,
+    &Alpha::F8RCRegClass, &Alpha::F8RCRegClass,
+    &Alpha::F8RCRegClass, &Alpha::F8RCRegClass,
+    &Alpha::F8RCRegClass, &Alpha::F8RCRegClass,
+    &Alpha::F8RCRegClass, &Alpha::F8RCRegClass,  0
+  };
+  return CalleeSavedRegClasses;
+}
+
+BitVector AlphaRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  Reserved.set(Alpha::R15);
+  Reserved.set(Alpha::R30);
+  Reserved.set(Alpha::R31);
+  return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register.  On Alpha this is the case only when the function has
+// variable sized allocas.
+//
+bool AlphaRegisterInfo::hasFP(const MachineFunction &MF) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  return MFI->hasVarSizedObjects();
+}
+
+void AlphaRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (hasFP(MF)) {
+    // If we have a frame pointer, turn the adjcallstackdown instruction into
+    // 'lda $30, -<amt>($30)' and the adjcallstackup instruction into
+    // 'lda $30, <amt>($30)'.
+    MachineInstr *Old = I;
+    uint64_t Amount = Old->getOperand(0).getImmedValue();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
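+      // (For example, with a 16-byte stack alignment a 40-byte argument area
+      // is rounded up to 48 bytes.)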
+      unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
+
+      MachineInstr *New;
+      if (Old->getOpcode() == Alpha::ADJUSTSTACKDOWN) {
+        New=BuildMI(TII.get(Alpha::LDA), Alpha::R30)
+          .addImm(-Amount).addReg(Alpha::R30);
+      } else {
+         assert(Old->getOpcode() == Alpha::ADJUSTSTACKUP);
+         New=BuildMI(TII.get(Alpha::LDA), Alpha::R30)
+          .addImm(Amount).addReg(Alpha::R30);
+      }
+
+      // Replace the pseudo instruction with a new instruction...
+      MBB.insert(I, New);
+    }
+  }
+
+  MBB.erase(I);
+}
+
+//Alpha has a slightly funny stack:
+//Args
+//<- incoming SP
+//fixed locals (and spills, callee saved, etc)
+//<- FP
+//variable locals
+//<- SP
+
+void AlphaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                            int SPAdj, RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  bool FP = hasFP(MF);
+
+  while (!MI.getOperand(i).isFrameIndex()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  int FrameIndex = MI.getOperand(i).getFrameIndex();
+
+  // Add the base register of R30 (SP) or R15 (FP).
+  MI.getOperand(i + 1).ChangeToRegister(FP ? Alpha::R15 : Alpha::R30, false);
+
+  // Now add the frame object offset to the offset from the virtual frame index.
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+
+  DOUT << "FI: " << FrameIndex << " Offset: " << Offset << "\n";
+
+  Offset += MF.getFrameInfo()->getStackSize();
+
+  DOUT << "Corrected Offset " << Offset
+       << " for stack size: " << MF.getFrameInfo()->getStackSize() << "\n";
+
+  if (Offset > IMM_HIGH || Offset < IMM_LOW) {
+    DOUT << "Unconditionally using R28 for evil purposes Offset: "
+         << Offset << "\n";
+    //so in this case, we need to use a temporary register, and move the
+    //original inst off the SP/FP
+    //fix up the old:
+    MI.getOperand(i + 1).ChangeToRegister(Alpha::R28, false);
+    MI.getOperand(i).ChangeToImmediate(getLower16(Offset));
+    //insert the new
+    MachineInstr* nMI=BuildMI(TII.get(Alpha::LDAH), Alpha::R28)
+      .addImm(getUpper16(Offset)).addReg(FP ? Alpha::R15 : Alpha::R30);
+    MBB.insert(II, nMI);
+  } else {
+    MI.getOperand(i).ChangeToImmediate(Offset);
+  }
+}
+
+
+void AlphaRegisterInfo::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  bool FP = hasFP(MF);
+
+  static int curgpdist = 0;
+
+  //set up the global pointer ($29) via a gpdist LDAH/LDA pair
+  BuildMI(MBB, MBBI, TII.get(Alpha::LDAHg), Alpha::R29)
+    .addGlobalAddress(const_cast<Function*>(MF.getFunction()))
+    .addReg(Alpha::R27).addImm(++curgpdist);
+  BuildMI(MBB, MBBI, TII.get(Alpha::LDAg), Alpha::R29)
+    .addGlobalAddress(const_cast<Function*>(MF.getFunction()))
+    .addReg(Alpha::R29).addImm(curgpdist);
+
+  //evil const_cast until MO stuff setup to handle const
+  BuildMI(MBB, MBBI, TII.get(Alpha::ALTENT))
+    .addGlobalAddress(const_cast<Function*>(MF.getFunction()));
+
+  // Get the number of bytes to allocate from the FrameInfo
+  long NumBytes = MFI->getStackSize();
+
+  if (FP)
+    NumBytes += 8; //reserve space for the old FP
+
+  // Do we need to allocate space on the stack?
+  if (NumBytes == 0) return;
+
+  unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+  NumBytes = (NumBytes+Align-1)/Align*Align;
+
+  // Update frame info to pretend that this is part of the stack...
+  MFI->setStackSize(NumBytes);
+
+  // adjust stack pointer: r30 -= numbytes
+  NumBytes = -NumBytes;
+  if (NumBytes >= IMM_LOW) {
+    BuildMI(MBB, MBBI, TII.get(Alpha::LDA), Alpha::R30).addImm(NumBytes)
+      .addReg(Alpha::R30);
+  } else if (getUpper16(NumBytes) >= IMM_LOW) {
+    BuildMI(MBB, MBBI, TII.get(Alpha::LDAH), Alpha::R30).addImm(getUpper16(NumBytes))
+      .addReg(Alpha::R30);
+    BuildMI(MBB, MBBI, TII.get(Alpha::LDA), Alpha::R30).addImm(getLower16(NumBytes))
+      .addReg(Alpha::R30);
+  } else {
+    cerr << "Too big a stack frame at " << NumBytes << "\n";
+    abort();
+  }
+
+  //now if we need to, save the old FP and set the new
+  if (FP)
+  {
+    BuildMI(MBB, MBBI, TII.get(Alpha::STQ))
+      .addReg(Alpha::R15).addImm(0).addReg(Alpha::R30);
+    //this must be the last instr in the prolog
+    BuildMI(MBB, MBBI, TII.get(Alpha::BISr), Alpha::R15)
+      .addReg(Alpha::R30).addReg(Alpha::R30);
+  }
+
+}
+
+void AlphaRegisterInfo::emitEpilogue(MachineFunction &MF,
+                                     MachineBasicBlock &MBB) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  assert((MBBI->getOpcode() == Alpha::RETDAG ||
+          MBBI->getOpcode() == Alpha::RETDAGp)
+         && "Can only insert epilog into returning blocks");
+
+  bool FP = hasFP(MF);
+
+  // Get the number of bytes allocated from the FrameInfo...
+  long NumBytes = MFI->getStackSize();
+
+  //now if we need to, restore the old FP
+  if (FP)
+  {
+    //copy the FP into the SP (discards allocas)
+    BuildMI(MBB, MBBI, TII.get(Alpha::BISr), Alpha::R30).addReg(Alpha::R15)
+      .addReg(Alpha::R15);
+    //restore the FP
+    BuildMI(MBB, MBBI, TII.get(Alpha::LDQ), Alpha::R15).addImm(0).addReg(Alpha::R15);
+  }
+
+   if (NumBytes != 0)
+     {
+       if (NumBytes <= IMM_HIGH) {
+         BuildMI(MBB, MBBI, TII.get(Alpha::LDA), Alpha::R30).addImm(NumBytes)
+           .addReg(Alpha::R30);
+       } else if (getUpper16(NumBytes) <= IMM_HIGH) {
+         BuildMI(MBB, MBBI, TII.get(Alpha::LDAH), Alpha::R30)
+           .addImm(getUpper16(NumBytes)).addReg(Alpha::R30);
+         BuildMI(MBB, MBBI, TII.get(Alpha::LDA), Alpha::R30)
+           .addImm(getLower16(NumBytes)).addReg(Alpha::R30);
+       } else {
+         cerr << "Too big a stack frame at " << NumBytes << "\n";
+         abort();
+       }
+     }
+}
+
+unsigned AlphaRegisterInfo::getRARegister() const {
+  assert(0 && "What is the return address register");
+  return 0;
+}
+
+unsigned AlphaRegisterInfo::getFrameRegister(MachineFunction &MF) const {
+  return hasFP(MF) ? Alpha::R15 : Alpha::R30;
+}
+
+unsigned AlphaRegisterInfo::getEHExceptionRegister() const {
+  assert(0 && "What is the exception register");
+  return 0;
+}
+
+unsigned AlphaRegisterInfo::getEHHandlerRegister() const {
+  assert(0 && "What is the exception handler register");
+  return 0;
+}
+
+#include "AlphaGenRegisterInfo.inc"
+
+std::string AlphaRegisterInfo::getPrettyName(unsigned reg)
+{
+  std::string s(RegisterDescriptors[reg].Name);
+  return s;
+}
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.h b/lib/Target/Alpha/AlphaRegisterInfo.h
new file mode 100644
index 0000000..2872e59
--- /dev/null
+++ b/lib/Target/Alpha/AlphaRegisterInfo.h
@@ -0,0 +1,85 @@
+//===- AlphaRegisterInfo.h - Alpha Register Information Impl ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Alpha implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHAREGISTERINFO_H
+#define ALPHAREGISTERINFO_H
+
+#include "llvm/Target/MRegisterInfo.h"
+#include "AlphaGenRegisterInfo.h.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+class Type;
+
+struct AlphaRegisterInfo : public AlphaGenRegisterInfo {
+  const TargetInstrInfo &TII;
+
+  AlphaRegisterInfo(const TargetInstrInfo &tii);
+
+  /// Code Generation virtual methods...
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, int FrameIndex,
+                           const TargetRegisterClass *RC) const;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC) const;
+  
+  MachineInstr* foldMemoryOperand(MachineInstr *MI, unsigned OpNum, 
+                                  int FrameIndex) const;
+
+  void copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                    unsigned DestReg, unsigned SrcReg,
+                    const TargetRegisterClass *RC) const;
+
+  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                     unsigned DestReg, const MachineInstr *Orig) const;
+
+  const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+  const TargetRegisterClass* const* getCalleeSavedRegClasses(
+                                     const MachineFunction *MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, RegScavenger *RS = NULL) const;
+
+  //void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(MachineFunction &MF) const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+
+  static std::string getPrettyName(unsigned reg);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.td b/lib/Target/Alpha/AlphaRegisterInfo.td
new file mode 100644
index 0000000..9855ce2
--- /dev/null
+++ b/lib/Target/Alpha/AlphaRegisterInfo.td
@@ -0,0 +1,171 @@
+//===- AlphaRegisterInfo.td - The Alpha Register File ------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Alpha register set.
+//
+//===----------------------------------------------------------------------===//
+
+class AlphaReg<string n> : Register<n> {
+  field bits<5> Num;
+  let Namespace = "Alpha";
+}
+
+// We identify all our registers with a 5-bit ID, for consistency's sake.
+
+// GPR - One of the 32 64-bit general-purpose registers
+class GPR<bits<5> num, string n> : AlphaReg<n> {
+  let Num = num;
+}
+
+// FPR - One of the 32 64-bit floating-point registers
+class FPR<bits<5> num, string n> : AlphaReg<n> {
+  let Num = num;
+}
+
+//#define FP    $15
+//#define RA    $26
+//#define PV    $27
+//#define GP    $29
+//#define SP    $30
+
+// General-purpose registers
+def R0  : GPR< 0,  "$0">, DwarfRegNum<0>;
+def R1  : GPR< 1,  "$1">, DwarfRegNum<1>;
+def R2  : GPR< 2,  "$2">, DwarfRegNum<2>;
+def R3  : GPR< 3,  "$3">, DwarfRegNum<3>;
+def R4  : GPR< 4,  "$4">, DwarfRegNum<4>;
+def R5  : GPR< 5,  "$5">, DwarfRegNum<5>;
+def R6  : GPR< 6,  "$6">, DwarfRegNum<6>;
+def R7  : GPR< 7,  "$7">, DwarfRegNum<7>;
+def R8  : GPR< 8,  "$8">, DwarfRegNum<8>;
+def R9  : GPR< 9,  "$9">, DwarfRegNum<9>;
+def R10 : GPR<10, "$10">, DwarfRegNum<10>;
+def R11 : GPR<11, "$11">, DwarfRegNum<11>;
+def R12 : GPR<12, "$12">, DwarfRegNum<12>;
+def R13 : GPR<13, "$13">, DwarfRegNum<13>;
+def R14 : GPR<14, "$14">, DwarfRegNum<14>;
+def R15 : GPR<15, "$15">, DwarfRegNum<15>;
+def R16 : GPR<16, "$16">, DwarfRegNum<16>;
+def R17 : GPR<17, "$17">, DwarfRegNum<17>;
+def R18 : GPR<18, "$18">, DwarfRegNum<18>;
+def R19 : GPR<19, "$19">, DwarfRegNum<19>;
+def R20 : GPR<20, "$20">, DwarfRegNum<20>;
+def R21 : GPR<21, "$21">, DwarfRegNum<21>;
+def R22 : GPR<22, "$22">, DwarfRegNum<22>;
+def R23 : GPR<23, "$23">, DwarfRegNum<23>;
+def R24 : GPR<24, "$24">, DwarfRegNum<24>;
+def R25 : GPR<25, "$25">, DwarfRegNum<25>;
+def R26 : GPR<26, "$26">, DwarfRegNum<26>;
+def R27 : GPR<27, "$27">, DwarfRegNum<27>;
+def R28 : GPR<28, "$28">, DwarfRegNum<28>;
+def R29 : GPR<29, "$29">, DwarfRegNum<29>;
+def R30 : GPR<30, "$30">, DwarfRegNum<30>;
+def R31 : GPR<31, "$31">, DwarfRegNum<31>;
+
+// Floating-point registers
+def F0  : FPR< 0,  "$f0">, DwarfRegNum<33>;
+def F1  : FPR< 1,  "$f1">, DwarfRegNum<34>;
+def F2  : FPR< 2,  "$f2">, DwarfRegNum<35>;
+def F3  : FPR< 3,  "$f3">, DwarfRegNum<36>;
+def F4  : FPR< 4,  "$f4">, DwarfRegNum<37>;
+def F5  : FPR< 5,  "$f5">, DwarfRegNum<38>;
+def F6  : FPR< 6,  "$f6">, DwarfRegNum<39>;
+def F7  : FPR< 7,  "$f7">, DwarfRegNum<40>;
+def F8  : FPR< 8,  "$f8">, DwarfRegNum<41>;
+def F9  : FPR< 9,  "$f9">, DwarfRegNum<42>;
+def F10 : FPR<10, "$f10">, DwarfRegNum<43>;
+def F11 : FPR<11, "$f11">, DwarfRegNum<44>;
+def F12 : FPR<12, "$f12">, DwarfRegNum<45>;
+def F13 : FPR<13, "$f13">, DwarfRegNum<46>;
+def F14 : FPR<14, "$f14">, DwarfRegNum<47>;
+def F15 : FPR<15, "$f15">, DwarfRegNum<48>;
+def F16 : FPR<16, "$f16">, DwarfRegNum<49>;
+def F17 : FPR<17, "$f17">, DwarfRegNum<50>;
+def F18 : FPR<18, "$f18">, DwarfRegNum<51>;
+def F19 : FPR<19, "$f19">, DwarfRegNum<52>;
+def F20 : FPR<20, "$f20">, DwarfRegNum<53>;
+def F21 : FPR<21, "$f21">, DwarfRegNum<54>;
+def F22 : FPR<22, "$f22">, DwarfRegNum<55>;
+def F23 : FPR<23, "$f23">, DwarfRegNum<56>;
+def F24 : FPR<24, "$f24">, DwarfRegNum<57>;
+def F25 : FPR<25, "$f25">, DwarfRegNum<58>;
+def F26 : FPR<26, "$f26">, DwarfRegNum<59>;
+def F27 : FPR<27, "$f27">, DwarfRegNum<60>;
+def F28 : FPR<28, "$f28">, DwarfRegNum<61>;
+def F29 : FPR<29, "$f29">, DwarfRegNum<62>;
+def F30 : FPR<30, "$f30">, DwarfRegNum<63>;
+def F31 : FPR<31, "$f31">, DwarfRegNum<64>;
+
+  // //#define FP    $15
+  // //#define RA    $26
+  // //#define PV    $27
+  // //#define GP    $29
+  // //#define SP    $30
+  // $28 is undefined after any and all calls
+
+/// Register classes
+def GPRC : RegisterClass<"Alpha", [i64], 64,
+     // Volatile
+     [R0, R1, R2, R3, R4, R5, R6, R7, R8, R16, R17, R18, R19, R20, R21, R22,
+      R23, R24, R25, R28, 
+     //Special meaning, but volatile
+     R27, //procedure address
+     R26, //return address
+     R29, //global offset table address
+     // Non-volatile
+     R9, R10, R11, R12, R13, R14,
+// Don't allocate 15, 30, 31
+     R15, R30, R31 ]> //zero
+{
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    GPRCClass::iterator
+    GPRCClass::allocation_order_end(const MachineFunction &MF) const {
+        return end()-3;
+    }
+  }];
+}
+
+def F4RC : RegisterClass<"Alpha", [f32], 64, [F0, F1, 
+        F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
+        F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30,
+        // Saved:
+        F2, F3, F4, F5, F6, F7, F8, F9,
+        F31 ]> //zero
+{
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    F4RCClass::iterator
+    F4RCClass::allocation_order_end(const MachineFunction &MF) const {
+        return end()-1;
+    }
+  }];
+}
+
+def F8RC : RegisterClass<"Alpha", [f64], 64, [F0, F1, 
+        F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
+        F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30,
+        // Saved:
+        F2, F3, F4, F5, F6, F7, F8, F9,
+        F31 ]> //zero
+{
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    F8RCClass::iterator
+    F8RCClass::allocation_order_end(const MachineFunction &MF) const {
+        return end()-1;
+    }
+  }];
+}
diff --git a/lib/Target/Alpha/AlphaRelocations.h b/lib/Target/Alpha/AlphaRelocations.h
new file mode 100644
index 0000000..c532f21
--- /dev/null
+++ b/lib/Target/Alpha/AlphaRelocations.h
@@ -0,0 +1,31 @@
+//===- AlphaRelocations.h - Alpha Code Relocations --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Alpha target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHARELOCATIONS_H
+#define ALPHARELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace Alpha {
+    enum RelocationType {
+      reloc_literal,
+      reloc_gprellow,
+      reloc_gprelhigh,
+      reloc_gpdist,
+      reloc_bsr
+    };
+  }
+}
+
+#endif
diff --git a/lib/Target/Alpha/AlphaSchedule.td b/lib/Target/Alpha/AlphaSchedule.td
new file mode 100644
index 0000000..b3aab97
--- /dev/null
+++ b/lib/Target/Alpha/AlphaSchedule.td
@@ -0,0 +1,84 @@
+//===- AlphaSchedule.td - Alpha Scheduling Definitions -----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Andrew Lenharth and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+//This is Table 2-2 from the 21264 Compiler Writer's Guide,
+//modified slightly.
+
+//Pipelines
+
+def L0   : FuncUnit;
+def L1   : FuncUnit;
+def FST0 : FuncUnit;
+def FST1 : FuncUnit;
+def U0   : FuncUnit;
+def U1   : FuncUnit;
+def FA   : FuncUnit;
+def FM   : FuncUnit;
+
+def s_ild   : InstrItinClass;
+def s_fld   : InstrItinClass;
+def s_ist   : InstrItinClass;
+def s_fst   : InstrItinClass;
+def s_lda   : InstrItinClass;
+def s_rpcc  : InstrItinClass;
+def s_rx    : InstrItinClass;
+def s_mxpr  : InstrItinClass;
+def s_icbr  : InstrItinClass;
+def s_ubr   : InstrItinClass;
+def s_jsr   : InstrItinClass;
+def s_iadd  : InstrItinClass;
+def s_ilog  : InstrItinClass;
+def s_ishf  : InstrItinClass;
+def s_cmov  : InstrItinClass;
+def s_imul  : InstrItinClass;
+def s_imisc : InstrItinClass;
+def s_fbr   : InstrItinClass;
+def s_fadd  : InstrItinClass;
+def s_fmul  : InstrItinClass;
+def s_fcmov : InstrItinClass;
+def s_fdivt : InstrItinClass;
+def s_fdivs : InstrItinClass;
+def s_fsqrts: InstrItinClass;
+def s_fsqrtt: InstrItinClass;
+def s_ftoi  : InstrItinClass;
+def s_itof  : InstrItinClass;
+def s_pseudo : InstrItinClass;
+
+//Table 2-4: Instruction Class Latency in Cycles,
+//modified slightly.
+
+def Alpha21264Itineraries : ProcessorItineraries<[
+  InstrItinData<s_ild    , [InstrStage<3, [L0, L1]>]>,
+  InstrItinData<s_fld    , [InstrStage<4, [L0, L1]>]>,
+  InstrItinData<s_ist    , [InstrStage<0, [L0, L1]>]>,
+  InstrItinData<s_fst    , [InstrStage<0, [FST0, FST1, L0, L1]>]>,
+  InstrItinData<s_lda    , [InstrStage<1, [L0, L1, U0, U1]>]>,
+  InstrItinData<s_rpcc   , [InstrStage<1, [L1]>]>,
+  InstrItinData<s_rx     , [InstrStage<1, [L1]>]>,
+  InstrItinData<s_mxpr   , [InstrStage<1, [L0, L1]>]>,
+  InstrItinData<s_icbr   , [InstrStage<0, [U0, U1]>]>,
+  InstrItinData<s_ubr    , [InstrStage<3, [U0, U1]>]>,
+  InstrItinData<s_jsr    , [InstrStage<3, [L0]>]>,
+  InstrItinData<s_iadd   , [InstrStage<1, [L0, U0, L1, U1]>]>,
+  InstrItinData<s_ilog   , [InstrStage<1, [L0, U0, L1, U1]>]>,
+  InstrItinData<s_ishf   , [InstrStage<1, [U0, U1]>]>,
+  InstrItinData<s_cmov   , [InstrStage<1, [L0, U0, L1, U1]>]>,
+  InstrItinData<s_imul   , [InstrStage<7, [U1]>]>,
+  InstrItinData<s_imisc  , [InstrStage<3, [U0]>]>,
+  InstrItinData<s_fbr    , [InstrStage<0, [FA]>]>,
+  InstrItinData<s_fadd   , [InstrStage<6, [FA]>]>,
+  InstrItinData<s_fmul   , [InstrStage<6, [FM]>]>,
+  InstrItinData<s_fcmov  , [InstrStage<6, [FA]>]>,
+  InstrItinData<s_fdivs  , [InstrStage<12, [FA]>]>,
+  InstrItinData<s_fdivt  , [InstrStage<15, [FA]>]>,
+  InstrItinData<s_fsqrts , [InstrStage<18, [FA]>]>,
+  InstrItinData<s_fsqrtt , [InstrStage<33, [FA]>]>,
+  InstrItinData<s_ftoi   , [InstrStage<3, [FST0, FST1, L0, L1]>]>,
+  InstrItinData<s_itof   , [InstrStage<4, [L0, L1]>]>
+]>;
diff --git a/lib/Target/Alpha/AlphaSubtarget.cpp b/lib/Target/Alpha/AlphaSubtarget.cpp
new file mode 100644
index 0000000..4b7d612
--- /dev/null
+++ b/lib/Target/Alpha/AlphaSubtarget.cpp
@@ -0,0 +1,25 @@
+//===- AlphaSubtarget.cpp - Alpha Subtarget Information ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Andrew Lenharth and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Alpha specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AlphaSubtarget.h"
+#include "Alpha.h"
+#include "AlphaGenSubtarget.inc"
+using namespace llvm;
+
+AlphaSubtarget::AlphaSubtarget(const Module &M, const std::string &FS)
+  : HasCT(false) {
+  std::string CPU = "generic";
+
+  // Parse features string.
+  ParseSubtargetFeatures(FS, CPU);
+}
diff --git a/lib/Target/Alpha/AlphaSubtarget.h b/lib/Target/Alpha/AlphaSubtarget.h
new file mode 100644
index 0000000..3fb95ad
--- /dev/null
+++ b/lib/Target/Alpha/AlphaSubtarget.h
@@ -0,0 +1,46 @@
+//=====-- AlphaSubtarget.h - Define Subtarget for the Alpha --*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Andrew Lenharth and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Alpha specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHASUBTARGET_H
+#define ALPHASUBTARGET_H
+
+#include "llvm/Target/TargetInstrItineraries.h"
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+namespace llvm {
+class Module;
+
+class AlphaSubtarget : public TargetSubtarget {
+protected:
+
+  bool HasCT;
+
+  InstrItineraryData InstrItins;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified module.
+  ///
+  AlphaSubtarget(const Module &M, const std::string &FS);
+  
+  /// ParseSubtargetFeatures - Parses features string setting specified 
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(const std::string &FS, const std::string &CPU);
+
+  bool hasCT() const { return HasCT; }
+};
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/Alpha/AlphaTargetAsmInfo.cpp b/lib/Target/Alpha/AlphaTargetAsmInfo.cpp
new file mode 100644
index 0000000..233d2c7
--- /dev/null
+++ b/lib/Target/Alpha/AlphaTargetAsmInfo.cpp
@@ -0,0 +1,24 @@
+//===-- AlphaTargetAsmInfo.cpp - Alpha asm properties -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the AlphaTargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AlphaTargetAsmInfo.h"
+
+using namespace llvm;
+
+AlphaTargetAsmInfo::AlphaTargetAsmInfo(const AlphaTargetMachine &TM) {
+  AlignmentIsInBytes = false;
+  PrivateGlobalPrefix = "$";
+  JumpTableDirective = ".gprel32";
+  JumpTableDataSection = "\t.section .rodata\n";
+  WeakRefDirective = "\t.weak\t";
+}
diff --git a/lib/Target/Alpha/AlphaTargetAsmInfo.h b/lib/Target/Alpha/AlphaTargetAsmInfo.h
new file mode 100644
index 0000000..c8b4fd5
--- /dev/null
+++ b/lib/Target/Alpha/AlphaTargetAsmInfo.h
@@ -0,0 +1,30 @@
+//=====-- AlphaTargetAsmInfo.h - Alpha asm properties ---------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the AlphaTargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHATARGETASMINFO_H
+#define ALPHATARGETASMINFO_H
+
+#include "llvm/Target/TargetAsmInfo.h"
+
+namespace llvm {
+
+  // Forward declaration.
+  class AlphaTargetMachine;
+
+  struct AlphaTargetAsmInfo : public TargetAsmInfo {
+    AlphaTargetAsmInfo(const AlphaTargetMachine &TM);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/Alpha/AlphaTargetMachine.cpp b/lib/Target/Alpha/AlphaTargetMachine.cpp
new file mode 100644
index 0000000..d4137a5
--- /dev/null
+++ b/lib/Target/Alpha/AlphaTargetMachine.cpp
@@ -0,0 +1,97 @@
+//===-- AlphaTargetMachine.cpp - Define TargetMachine for Alpha -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "Alpha.h"
+#include "AlphaJITInfo.h"
+#include "AlphaTargetAsmInfo.h"
+#include "AlphaTargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+
+using namespace llvm;
+
+namespace {
+  // Register the targets
+  RegisterTarget<AlphaTargetMachine> X("alpha", "  Alpha (incomplete)");
+}
+
+const TargetAsmInfo *AlphaTargetMachine::createTargetAsmInfo() const {
+  return new AlphaTargetAsmInfo(*this);
+}
+
+unsigned AlphaTargetMachine::getModuleMatchQuality(const Module &M) {
+  // We strongly match "alpha*".
+  std::string TT = M.getTargetTriple();
+  if (TT.size() >= 5 && TT[0] == 'a' && TT[1] == 'l' && TT[2] == 'p' &&
+      TT[3] == 'h' && TT[4] == 'a')
+    return 20;
+  // If the target triple is something non-alpha, we don't match.
+  if (!TT.empty()) return 0;
+
+  if (M.getEndianness()  == Module::LittleEndian &&
+      M.getPointerSize() == Module::Pointer64)
+    return 10;                                   // Weak match
+  else if (M.getEndianness() != Module::AnyEndianness ||
+           M.getPointerSize() != Module::AnyPointerSize)
+    return 0;                                    // Match for some other target
+
+  return getJITMatchQuality()/2;
+}
+
+unsigned AlphaTargetMachine::getJITMatchQuality() {
+#ifdef __alpha
+  return 10;
+#else
+  return 0;
+#endif
+}
+
+AlphaTargetMachine::AlphaTargetMachine(const Module &M, const std::string &FS)
+  : DataLayout("e"),
+    FrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0),
+    JITInfo(*this),
+    Subtarget(M, FS),
+    TLInfo(*this) {
+  setRelocationModel(Reloc::PIC_);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool AlphaTargetMachine::addInstSelector(FunctionPassManager &PM, bool Fast) {
+  PM.add(createAlphaISelDag(*this));
+  return false;
+}
+bool AlphaTargetMachine::addPreEmitPass(FunctionPassManager &PM, bool Fast) {
+  // Must run branch selection immediately preceding the asm printer
+  PM.add(createAlphaBranchSelectionPass());
+  return false;
+}
+bool AlphaTargetMachine::addAssemblyEmitter(FunctionPassManager &PM, bool Fast, 
+                                            std::ostream &Out) {
+  PM.add(createAlphaLLRPPass(*this));
+  PM.add(createAlphaCodePrinterPass(Out, *this));
+  return false;
+}
+bool AlphaTargetMachine::addCodeEmitter(FunctionPassManager &PM, bool Fast,
+                                        MachineCodeEmitter &MCE) {
+  PM.add(createAlphaCodeEmitterPass(*this, MCE));
+  return false;
+}
+bool AlphaTargetMachine::addSimpleCodeEmitter(FunctionPassManager &PM,
+                                              bool Fast,
+                                              MachineCodeEmitter &MCE) {
+  return addCodeEmitter(PM, Fast, MCE);
+}
diff --git a/lib/Target/Alpha/AlphaTargetMachine.h b/lib/Target/Alpha/AlphaTargetMachine.h
new file mode 100644
index 0000000..5a57f63
--- /dev/null
+++ b/lib/Target/Alpha/AlphaTargetMachine.h
@@ -0,0 +1,73 @@
+//===-- AlphaTargetMachine.h - Define TargetMachine for Alpha ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Alpha-specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHA_TARGETMACHINE_H
+#define ALPHA_TARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "AlphaInstrInfo.h"
+#include "AlphaJITInfo.h"
+#include "AlphaISelLowering.h"
+#include "AlphaSubtarget.h"
+
+namespace llvm {
+
+class GlobalValue;
+
+class AlphaTargetMachine : public LLVMTargetMachine {
+  const TargetData DataLayout;       // Calculates type size & alignment
+  AlphaInstrInfo InstrInfo;
+  TargetFrameInfo FrameInfo;
+  AlphaJITInfo JITInfo;
+  AlphaSubtarget Subtarget;
+  AlphaTargetLowering TLInfo;
+  
+protected:
+  virtual const TargetAsmInfo *createTargetAsmInfo() const;
+  
+public:
+  AlphaTargetMachine(const Module &M, const std::string &FS);
+
+  virtual const AlphaInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  virtual const TargetFrameInfo  *getFrameInfo() const { return &FrameInfo; }
+  virtual const TargetSubtarget  *getSubtargetImpl() const{ return &Subtarget; }
+  virtual const MRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  virtual TargetLowering* getTargetLowering() const { 
+    return const_cast<AlphaTargetLowering*>(&TLInfo);
+  }
+  virtual const TargetData       *getTargetData() const { return &DataLayout; }
+  virtual TargetJITInfo* getJITInfo() {
+    return &JITInfo;
+  }
+
+  static unsigned getJITMatchQuality();
+  static unsigned getModuleMatchQuality(const Module &M);
+  
+  // Pass Pipeline Configuration
+  virtual bool addInstSelector(FunctionPassManager &PM, bool Fast);
+  virtual bool addPreEmitPass(FunctionPassManager &PM, bool Fast);
+  virtual bool addAssemblyEmitter(FunctionPassManager &PM, bool Fast, 
+                                  std::ostream &Out);
+  virtual bool addCodeEmitter(FunctionPassManager &PM, bool Fast,
+                              MachineCodeEmitter &MCE);
+  virtual bool addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast,
+                                    MachineCodeEmitter &MCE);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Alpha/Makefile b/lib/Target/Alpha/Makefile
new file mode 100644
index 0000000..bb9895a
--- /dev/null
+++ b/lib/Target/Alpha/Makefile
@@ -0,0 +1,20 @@
+##===- lib/Target/Alpha/Makefile -------------------------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file was developed by the LLVM research group and is distributed under
+# the University of Illinois Open Source License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMAlpha
+TARGET = Alpha
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = AlphaGenRegisterInfo.h.inc AlphaGenRegisterNames.inc \
+                AlphaGenRegisterInfo.inc AlphaGenInstrNames.inc \
+                AlphaGenInstrInfo.inc AlphaGenCodeEmitter.inc \
+                AlphaGenAsmWriter.inc AlphaGenDAGISel.inc \
+                AlphaGenSubtarget.inc
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Alpha/README.txt b/lib/Target/Alpha/README.txt
new file mode 100644
index 0000000..9ae1517
--- /dev/null
+++ b/lib/Target/Alpha/README.txt
@@ -0,0 +1,42 @@
+***
+
+add gcc builtins for alpha instructions
+
+
+***
+
+custom expand byteswap into nifty 
+extract/insert/mask byte/word/longword/quadword low/high
+sequences
+
+***
+
+see if any of the extract/insert/mask operations can be added
+
+***
+
+match more interesting things for cmovlbc cmovlbs (move if low bit clear/set)
+
+***
+
+lower srem and urem
+
+remq(i,j):  i - (j * divq(i,j)) if j != 0
+remqu(i,j): i - (j * divqu(i,j)) if j != 0
+reml(i,j):  i - (j * divl(i,j)) if j != 0
+remlu(i,j): i - (j * divlu(i,j)) if j != 0
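+
+i.e. expand each rem into the matching div, a multiply, and a subtract; a
+rough sketch for remq (using whatever lowering divq itself gets):
+
+  t = divq(i, j)
+  t = mulq(j, t)
+  r = subq(i, t)    ; r = i - j * (i / j)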
+
+***
+
+add crazy vector instructions (MVI):
+
+(MIN|MAX)(U|S)(B8|W4) min and max, signed and unsigned, byte and word
+PKWB, UNPKBW pack/unpack word to byte
+PKLB UNPKBL pack/unpack long to byte
+PERR pixel error (sum across bytes of bytewise abs(i8v8 a - i8v8 b))
+
+cmpbytes bytewise cmpeq of i8v8 a and i8v8 b (not part of the MVI extensions)
+
+The page below has some good examples of other operations (such as saturating
+add) that can be synthesised well from these rather meager vector ops:
+http://www.alphalinux.org/docs/MVI-full.html
diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp
new file mode 100644
index 0000000..b0c76c8
--- /dev/null
+++ b/lib/Target/CBackend/CBackend.cpp
@@ -0,0 +1,2930 @@
+//===-- CBackend.cpp - Library for converting LLVM code to C --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This library converts LLVM code to C code, compilable by GCC and other C
+// compilers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CTargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/ParameterAttributes.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Analysis/ConstantsScanner.h"
+#include "llvm/Analysis/FindUsedTypes.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Config/config.h"
+#include <algorithm>
+#include <sstream>
+using namespace llvm;
+
+namespace {
+  // Register the target.
+  RegisterTarget<CTargetMachine> X("c", "  C backend");
+
+  /// CBackendNameAllUsedStructsAndMergeFunctions - This pass inserts names for
+  /// any unnamed structure types that are used by the program, and merges
+  /// external functions with the same name.
+  ///
+  class CBackendNameAllUsedStructsAndMergeFunctions : public ModulePass {
+  public:
+    static char ID;
+    CBackendNameAllUsedStructsAndMergeFunctions() 
+      : ModulePass((intptr_t)&ID) {}
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<FindUsedTypes>();
+    }
+
+    virtual const char *getPassName() const {
+      return "C backend type canonicalizer";
+    }
+
+    virtual bool runOnModule(Module &M);
+  };
+
+  char CBackendNameAllUsedStructsAndMergeFunctions::ID = 0;
+
+  /// CWriter - This class is the main chunk of code that converts an LLVM
+  /// module to a C translation unit.
+  class CWriter : public FunctionPass, public InstVisitor<CWriter> {
+    std::ostream &Out;
+    IntrinsicLowering *IL;
+    Mangler *Mang;
+    LoopInfo *LI;
+    const Module *TheModule;
+    const TargetAsmInfo* TAsm;
+    const TargetData* TD;
+    std::map<const Type *, std::string> TypeNames;
+    std::map<const ConstantFP *, unsigned> FPConstantMap;
+    std::set<Function*> intrinsicPrototypesAlreadyGenerated;
+
+  public:
+    static char ID;
+    CWriter(std::ostream &o) 
+      : FunctionPass((intptr_t)&ID), Out(o), IL(0), Mang(0), LI(0), 
+        TheModule(0), TAsm(0), TD(0) {}
+
+    virtual const char *getPassName() const { return "C backend"; }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<LoopInfo>();
+      AU.setPreservesAll();
+    }
+
+    virtual bool doInitialization(Module &M);
+
+    bool runOnFunction(Function &F) {
+      LI = &getAnalysis<LoopInfo>();
+
+      // Get rid of intrinsics we can't handle.
+      lowerIntrinsics(F);
+
+      // Output all floating point constants that cannot be printed accurately.
+      printFloatingPointConstants(F);
+
+      printFunction(F);
+      FPConstantMap.clear();
+      return false;
+    }
+
+    virtual bool doFinalization(Module &M) {
+      // Free memory...
+      delete Mang;
+      TypeNames.clear();
+      return false;
+    }
+
+    std::ostream &printType(std::ostream &Out, const Type *Ty, 
+                            bool isSigned = false,
+                            const std::string &VariableName = "",
+                            bool IgnoreName = false);
+    std::ostream &printSimpleType(std::ostream &Out, const Type *Ty, 
+                                     bool isSigned, 
+                                     const std::string &NameSoFar = "");
+
+    void printStructReturnPointerFunctionType(std::ostream &Out,
+                                              const PointerType *Ty);
+    
+    void writeOperand(Value *Operand);
+    void writeOperandRaw(Value *Operand);
+    void writeOperandInternal(Value *Operand);
+    void writeOperandWithCast(Value* Operand, unsigned Opcode);
+    void writeOperandWithCast(Value* Operand, ICmpInst::Predicate predicate);
+    bool writeInstructionCast(const Instruction &I);
+
+  private :
+    std::string InterpretASMConstraint(InlineAsm::ConstraintInfo& c);
+
+    void lowerIntrinsics(Function &F);
+
+    void printModule(Module *M);
+    void printModuleTypes(const TypeSymbolTable &ST);
+    void printContainedStructs(const Type *Ty, std::set<const StructType *> &);
+    void printFloatingPointConstants(Function &F);
+    void printFunctionSignature(const Function *F, bool Prototype);
+
+    void printFunction(Function &);
+    void printBasicBlock(BasicBlock *BB);
+    void printLoop(Loop *L);
+
+    void printCast(unsigned opcode, const Type *SrcTy, const Type *DstTy);
+    void printConstant(Constant *CPV);
+    void printConstantWithCast(Constant *CPV, unsigned Opcode);
+    bool printConstExprCast(const ConstantExpr *CE);
+    void printConstantArray(ConstantArray *CPA);
+    void printConstantVector(ConstantVector *CP);
+
+    // isInlinableInst - Attempt to inline instructions into their uses to build
+    // trees as much as possible.  To do this, we have to consistently decide
+    // what is acceptable to inline, so that variable declarations don't get
+    // printed and an extra copy of the expr is not emitted.
+    //
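+    // For example, an add whose single use is a store in the same basic block
+    // is typically printed inline inside that expression, rather than being
+    // assigned to a separate llvm_cbe_ local first.
+    //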
+    static bool isInlinableInst(const Instruction &I) {
+      // Always inline cmp instructions, even if they are shared by multiple
+      // expressions.  GCC generates horrible code if we don't.
+      if (isa<CmpInst>(I)) 
+        return true;
+
+      // Must be an expression, must be used exactly once.  If it is dead, we
+      // emit it inline where it would go.
+      if (I.getType() == Type::VoidTy || !I.hasOneUse() ||
+          isa<TerminatorInst>(I) || isa<CallInst>(I) || isa<PHINode>(I) ||
+          isa<LoadInst>(I) || isa<VAArgInst>(I))
+        // Don't inline a load across a store or other bad things!
+        return false;
+
+      // Must not be used in inline asm
+      if (I.hasOneUse() && isInlineAsm(*I.use_back())) return false;
+
+      // Only inline the instruction if its use is in the same BB as the inst.
+      return I.getParent() == cast<Instruction>(I.use_back())->getParent();
+    }
+
+    // isDirectAlloca - Define fixed sized allocas in the entry block as direct
+    // variables which are accessed with the & operator.  This causes GCC to
+    // generate significantly better code than to emit alloca calls directly.
+    //
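+    // For example, a fixed-size "alloca i32" in the entry block becomes an
+    // ordinary local variable in the emitted C, and the pointer it produces
+    // is printed as "&" applied to that variable.
+    //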
+    static const AllocaInst *isDirectAlloca(const Value *V) {
+      const AllocaInst *AI = dyn_cast<AllocaInst>(V);
+      if (!AI) return 0;
+      if (AI->isArrayAllocation())
+        return 0;   // FIXME: we can also inline fixed size array allocas!
+      if (AI->getParent() != &AI->getParent()->getParent()->getEntryBlock())
+        return 0;
+      return AI;
+    }
+    
+    // isInlineAsm - Check if the instruction is a call to an inline asm chunk
+    static bool isInlineAsm(const Instruction& I) {
+      if (isa<CallInst>(&I) && isa<InlineAsm>(I.getOperand(0)))
+        return true;
+      return false;
+    }
+    
+    // Instruction visitation functions
+    friend class InstVisitor<CWriter>;
+
+    void visitReturnInst(ReturnInst &I);
+    void visitBranchInst(BranchInst &I);
+    void visitSwitchInst(SwitchInst &I);
+    void visitInvokeInst(InvokeInst &I) {
+      assert(0 && "Lowerinvoke pass didn't work!");
+    }
+
+    void visitUnwindInst(UnwindInst &I) {
+      assert(0 && "Lowerinvoke pass didn't work!");
+    }
+    void visitUnreachableInst(UnreachableInst &I);
+
+    void visitPHINode(PHINode &I);
+    void visitBinaryOperator(Instruction &I);
+    void visitICmpInst(ICmpInst &I);
+    void visitFCmpInst(FCmpInst &I);
+
+    void visitCastInst (CastInst &I);
+    void visitSelectInst(SelectInst &I);
+    void visitCallInst (CallInst &I);
+    void visitInlineAsm(CallInst &I);
+
+    void visitMallocInst(MallocInst &I);
+    void visitAllocaInst(AllocaInst &I);
+    void visitFreeInst  (FreeInst   &I);
+    void visitLoadInst  (LoadInst   &I);
+    void visitStoreInst (StoreInst  &I);
+    void visitGetElementPtrInst(GetElementPtrInst &I);
+    void visitVAArgInst (VAArgInst &I);
+
+    void visitInstruction(Instruction &I) {
+      cerr << "C Writer does not know about " << I;
+      abort();
+    }
+
+    void outputLValue(Instruction *I) {
+      Out << "  " << GetValueName(I) << " = ";
+    }
+
+    bool isGotoCodeNecessary(BasicBlock *From, BasicBlock *To);
+    void printPHICopiesForSuccessor(BasicBlock *CurBlock,
+                                    BasicBlock *Successor, unsigned Indent);
+    void printBranchToBlock(BasicBlock *CurBlock, BasicBlock *SuccBlock,
+                            unsigned Indent);
+    void printIndexingExpression(Value *Ptr, gep_type_iterator I,
+                                 gep_type_iterator E);
+
+    std::string GetValueName(const Value *Operand);
+  };
+}
+
+char CWriter::ID = 0;
+
+/// This method inserts names for any unnamed structure types that are used by
+/// the program, and removes names from structure types that are not used by the
+/// program.
+///
+bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) {
+  // Get a set of types that are used by the program...
+  std::set<const Type *> UT = getAnalysis<FindUsedTypes>().getTypes();
+
+  // Loop over the module symbol table, removing types from UT that are
+  // already named, and removing names for types that are not used.
+  //
+  TypeSymbolTable &TST = M.getTypeSymbolTable();
+  for (TypeSymbolTable::iterator TI = TST.begin(), TE = TST.end();
+       TI != TE; ) {
+    TypeSymbolTable::iterator I = TI++;
+    
+    // If this isn't a struct type, remove it from our set of types to name.
+    // This simplifies emission later.
+    if (!isa<StructType>(I->second) && !isa<OpaqueType>(I->second)) {
+      TST.remove(I);
+    } else {
+      // If this is not used, remove it from the symbol table.
+      std::set<const Type *>::iterator UTI = UT.find(I->second);
+      if (UTI == UT.end())
+        TST.remove(I);
+      else
+        UT.erase(UTI);    // Only keep one name for this type.
+    }
+  }
+
+  // UT now contains types that are not named.  Loop over it, naming
+  // structure types.
+  //
+  bool Changed = false;
+  unsigned RenameCounter = 0;
+  for (std::set<const Type *>::const_iterator I = UT.begin(), E = UT.end();
+       I != E; ++I)
+    if (const StructType *ST = dyn_cast<StructType>(*I)) {
+      while (M.addTypeName("unnamed"+utostr(RenameCounter), ST))
+        ++RenameCounter;
+      Changed = true;
+    }
+
+  // Loop over all external functions and globals.  If we have two with
+  // identical names, merge them.
+  // FIXME: This code should disappear when we don't allow values with the same
+  // names when they have different types!
+  std::map<std::string, GlobalValue*> ExtSymbols;
+  for (Module::iterator I = M.begin(), E = M.end(); I != E;) {
+    Function *GV = I++;
+    if (GV->isDeclaration() && GV->hasName()) {
+      std::pair<std::map<std::string, GlobalValue*>::iterator, bool> X
+        = ExtSymbols.insert(std::make_pair(GV->getName(), GV));
+      if (!X.second) {
+        // Found a conflict, replace this global with the previous one.
+        GlobalValue *OldGV = X.first->second;
+        GV->replaceAllUsesWith(ConstantExpr::getBitCast(OldGV, GV->getType()));
+        GV->eraseFromParent();
+        Changed = true;
+      }
+    }
+  }
+  // Do the same for globals.
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E;) {
+    GlobalVariable *GV = I++;
+    if (GV->isDeclaration() && GV->hasName()) {
+      std::pair<std::map<std::string, GlobalValue*>::iterator, bool> X
+        = ExtSymbols.insert(std::make_pair(GV->getName(), GV));
+      if (!X.second) {
+        // Found a conflict, replace this global with the previous one.
+        GlobalValue *OldGV = X.first->second;
+        GV->replaceAllUsesWith(ConstantExpr::getBitCast(OldGV, GV->getType()));
+        GV->eraseFromParent();
+        Changed = true;
+      }
+    }
+  }
+  
+  return Changed;
+}
+
+/// printStructReturnPointerFunctionType - This is like printType for a struct
+/// return type, except, instead of printing the type as void (*)(Struct*, ...)
+/// print it as "Struct (*)(...)", for struct return functions.
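+/// For example, a pointer to a function taking a struct-return pointer plus an
+/// i32 would be printed roughly as "struct l_foo (*) (unsigned int)" instead
+/// of "void (*) (struct l_foo*, unsigned int)".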
+void CWriter::printStructReturnPointerFunctionType(std::ostream &Out,
+                                                   const PointerType *TheTy) {
+  const FunctionType *FTy = cast<FunctionType>(TheTy->getElementType());
+  std::stringstream FunctionInnards;
+  FunctionInnards << " (*) (";
+  bool PrintedType = false;
+
+  FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end();
+  const Type *RetTy = cast<PointerType>(I->get())->getElementType();
+  unsigned Idx = 1;
+  const ParamAttrsList *Attrs = FTy->getParamAttrs();
+  for (++I, ++Idx; I != E; ++I, ++Idx) {
+    if (PrintedType)
+      FunctionInnards << ", ";
+    printType(FunctionInnards, *I, 
+        /*isSigned=*/Attrs && Attrs->paramHasAttr(Idx, ParamAttr::SExt), "");
+    PrintedType = true;
+  }
+  if (FTy->isVarArg()) {
+    if (PrintedType)
+      FunctionInnards << ", ...";
+  } else if (!PrintedType) {
+    FunctionInnards << "void";
+  }
+  FunctionInnards << ')';
+  std::string tstr = FunctionInnards.str();
+  printType(Out, RetTy, 
+      /*isSigned=*/Attrs && Attrs->paramHasAttr(0, ParamAttr::SExt), tstr);
+}
+
+std::ostream &
+CWriter::printSimpleType(std::ostream &Out, const Type *Ty, bool isSigned,
+                            const std::string &NameSoFar) {
+  assert((Ty->isPrimitiveType() || Ty->isInteger()) && 
+         "Invalid type for printSimpleType");
+  switch (Ty->getTypeID()) {
+  case Type::VoidTyID:   return Out << "void " << NameSoFar;
+  case Type::IntegerTyID: {
+    unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+    if (NumBits == 1) 
+      return Out << "bool " << NameSoFar;
+    else if (NumBits <= 8)
+      return Out << (isSigned?"signed":"unsigned") << " char " << NameSoFar;
+    else if (NumBits <= 16)
+      return Out << (isSigned?"signed":"unsigned") << " short " << NameSoFar;
+    else if (NumBits <= 32)
+      return Out << (isSigned?"signed":"unsigned") << " int " << NameSoFar;
+    else { 
+      assert(NumBits <= 64 && "Bit widths > 64 not implemented yet");
+      return Out << (isSigned?"signed":"unsigned") << " long long "<< NameSoFar;
+    }
+  }
+  case Type::FloatTyID:  return Out << "float "   << NameSoFar;
+  case Type::DoubleTyID: return Out << "double "  << NameSoFar;
+  default :
+    cerr << "Unknown primitive type: " << *Ty << "\n";
+    abort();
+  }
+}
+
+// Pass the Type* and the variable name and this prints out the variable
+// declaration.
+//
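+// For example, given a pointer to [4 x i32] and the variable name "p", this
+// emits roughly "unsigned int (*p)[4]".
+//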
+std::ostream &CWriter::printType(std::ostream &Out, const Type *Ty,
+                                 bool isSigned, const std::string &NameSoFar,
+                                 bool IgnoreName) {
+  if (Ty->isPrimitiveType() || Ty->isInteger()) {
+    printSimpleType(Out, Ty, isSigned, NameSoFar);
+    return Out;
+  }
+
+  // Check to see if the type is named.
+  if (!IgnoreName || isa<OpaqueType>(Ty)) {
+    std::map<const Type *, std::string>::iterator I = TypeNames.find(Ty);
+    if (I != TypeNames.end()) return Out << I->second << ' ' << NameSoFar;
+  }
+
+  switch (Ty->getTypeID()) {
+  case Type::FunctionTyID: {
+    const FunctionType *FTy = cast<FunctionType>(Ty);
+    std::stringstream FunctionInnards;
+    FunctionInnards << " (" << NameSoFar << ") (";
+    const ParamAttrsList *Attrs = FTy->getParamAttrs();
+    unsigned Idx = 1;
+    for (FunctionType::param_iterator I = FTy->param_begin(),
+           E = FTy->param_end(); I != E; ++I) {
+      if (I != FTy->param_begin())
+        FunctionInnards << ", ";
+      printType(FunctionInnards, *I, 
+         /*isSigned=*/Attrs && Attrs->paramHasAttr(Idx, ParamAttr::SExt), "");
+      ++Idx;
+    }
+    if (FTy->isVarArg()) {
+      if (FTy->getNumParams())
+        FunctionInnards << ", ...";
+    } else if (!FTy->getNumParams()) {
+      FunctionInnards << "void";
+    }
+    FunctionInnards << ')';
+    std::string tstr = FunctionInnards.str();
+    printType(Out, FTy->getReturnType(), 
+        /*isSigned=*/Attrs && Attrs->paramHasAttr(0, ParamAttr::SExt), tstr);
+    return Out;
+  }
+  case Type::StructTyID: {
+    const StructType *STy = cast<StructType>(Ty);
+    Out << NameSoFar + " {\n";
+    unsigned Idx = 0;
+    for (StructType::element_iterator I = STy->element_begin(),
+           E = STy->element_end(); I != E; ++I) {
+      Out << "  ";
+      printType(Out, *I, false, "field" + utostr(Idx++));
+      Out << ";\n";
+    }
+    Out << '}';
+    if (STy->isPacked())
+      Out << " __attribute__ ((packed))";
+    return Out;
+  }
+
+  case Type::PointerTyID: {
+    const PointerType *PTy = cast<PointerType>(Ty);
+    std::string ptrName = "*" + NameSoFar;
+
+    if (isa<ArrayType>(PTy->getElementType()) ||
+        isa<VectorType>(PTy->getElementType()))
+      ptrName = "(" + ptrName + ")";
+
+    return printType(Out, PTy->getElementType(), false, ptrName);
+  }
+
+  case Type::ArrayTyID: {
+    const ArrayType *ATy = cast<ArrayType>(Ty);
+    unsigned NumElements = ATy->getNumElements();
+    if (NumElements == 0) NumElements = 1;
+    return printType(Out, ATy->getElementType(), false,
+                     NameSoFar + "[" + utostr(NumElements) + "]");
+  }
+
+  case Type::VectorTyID: {
+    const VectorType *PTy = cast<VectorType>(Ty);
+    unsigned NumElements = PTy->getNumElements();
+    if (NumElements == 0) NumElements = 1;
+    return printType(Out, PTy->getElementType(), false,
+                     NameSoFar + "[" + utostr(NumElements) + "]");
+  }
+
+  case Type::OpaqueTyID: {
+    static int Count = 0;
+    std::string TyName = "struct opaque_" + itostr(Count++);
+    assert(TypeNames.find(Ty) == TypeNames.end());
+    TypeNames[Ty] = TyName;
+    return Out << TyName << ' ' << NameSoFar;
+  }
+  default:
+    assert(0 && "Unhandled case in getTypeProps!");
+    abort();
+  }
+
+  return Out;
+}
+
+void CWriter::printConstantArray(ConstantArray *CPA) {
+
+  // As a special case, print the array as a string if it is an array of
+  // i8 values.
+  //
+  const Type *ETy = CPA->getType()->getElementType();
+  bool isString = (ETy == Type::Int8Ty);
+
+  // Make sure the last character is a null char, as automatically added by C
+  if (isString && (CPA->getNumOperands() == 0 ||
+                   !cast<Constant>(*(CPA->op_end()-1))->isNullValue()))
+    isString = false;
+
+  if (isString) {
+    Out << '\"';
+    // Keep track of whether the last number was a hexadecimal escape
+    bool LastWasHex = false;
+
+    // Do not include the last character, which we know is null
+    for (unsigned i = 0, e = CPA->getNumOperands()-1; i != e; ++i) {
+      unsigned char C = cast<ConstantInt>(CPA->getOperand(i))->getZExtValue();
+
+      // Print it out literally if it is a printable character.  The only thing
+      // to be careful about is when the last letter output was a hex escape
+      // code, in which case we have to be careful not to print out hex digits
+      // explicitly (the C compiler thinks it is a continuation of the previous
+      // character, sheesh...)
+      //
+      if (isprint(C) && (!LastWasHex || !isxdigit(C))) {
+        LastWasHex = false;
+        if (C == '"' || C == '\\')
+          Out << "\\" << C;
+        else
+          Out << C;
+      } else {
+        LastWasHex = false;
+        switch (C) {
+        case '\n': Out << "\\n"; break;
+        case '\t': Out << "\\t"; break;
+        case '\r': Out << "\\r"; break;
+        case '\v': Out << "\\v"; break;
+        case '\a': Out << "\\a"; break;
+        case '\"': Out << "\\\""; break;
+        case '\'': Out << "\\\'"; break;
+        default:
+          Out << "\\x";
+          Out << (char)(( C/16  < 10) ? ( C/16 +'0') : ( C/16 -10+'A'));
+          Out << (char)(((C&15) < 10) ? ((C&15)+'0') : ((C&15)-10+'A'));
+          LastWasHex = true;
+          break;
+        }
+      }
+    }
+    Out << '\"';
+  } else {
+    Out << '{';
+    if (CPA->getNumOperands()) {
+      Out << ' ';
+      printConstant(cast<Constant>(CPA->getOperand(0)));
+      for (unsigned i = 1, e = CPA->getNumOperands(); i != e; ++i) {
+        Out << ", ";
+        printConstant(cast<Constant>(CPA->getOperand(i)));
+      }
+    }
+    Out << " }";
+  }
+}
+
+void CWriter::printConstantVector(ConstantVector *CP) {
+  Out << '{';
+  if (CP->getNumOperands()) {
+    Out << ' ';
+    printConstant(cast<Constant>(CP->getOperand(0)));
+    for (unsigned i = 1, e = CP->getNumOperands(); i != e; ++i) {
+      Out << ", ";
+      printConstant(cast<Constant>(CP->getOperand(i)));
+    }
+  }
+  Out << " }";
+}
+
+// isFPCSafeToPrint - Returns true if we may assume that CFP may be written out
+// textually as a double (rather than as a reference to a stack-allocated
+// variable). We decide this by converting CFP to a string and back into a
+// double, and then checking whether the conversion results in a bit-equal
+// double to the original value of CFP. This depends on us and the target C
+// compiler agreeing on the conversion process (which is pretty likely since we
+// only deal in IEEE FP).
+//
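+// For example, a constant like 1.5 survives the round trip and can be printed
+// inline, while a value such as 0.1 may not when only decimal output is
+// available; such constants are emitted through FPConstantMap instead.
+//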
+static bool isFPCSafeToPrint(const ConstantFP *CFP) {
+#if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A
+  char Buffer[100];
+  sprintf(Buffer, "%a", CFP->getValue());
+
+  if (!strncmp(Buffer, "0x", 2) ||
+      !strncmp(Buffer, "-0x", 3) ||
+      !strncmp(Buffer, "+0x", 3))
+    return atof(Buffer) == CFP->getValue();
+  return false;
+#else
+  std::string StrVal = ftostr(CFP->getValue());
+
+  while (StrVal[0] == ' ')
+    StrVal.erase(StrVal.begin());
+
+  // Check to make sure that the stringized number is not some string like "Inf"
+  // or NaN.  Check that the string matches the "[-+]?[0-9]" regex.
+  if ((StrVal[0] >= '0' && StrVal[0] <= '9') ||
+      ((StrVal[0] == '-' || StrVal[0] == '+') &&
+       (StrVal[1] >= '0' && StrVal[1] <= '9')))
+    // Reparse stringized version!
+    return atof(StrVal.c_str()) == CFP->getValue();
+  return false;
+#endif
+}
+
+/// Print out the casting for a cast operation. This does the double casting
+/// necessary for conversion to the destination type, if necessary. 
+/// @brief Print a cast
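+/// For example, a sext from i8 to i32 emits roughly
+/// "(signed int)(signed char)" in front of the operand, while a zext from i8
+/// emits "(unsigned int)(unsigned char)".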
+void CWriter::printCast(unsigned opc, const Type *SrcTy, const Type *DstTy) {
+  // Print the destination type cast
+  switch (opc) {
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::IntToPtr:
+    case Instruction::Trunc:
+    case Instruction::BitCast:
+    case Instruction::FPExt:
+    case Instruction::FPTrunc: // For these the DstTy sign doesn't matter
+      Out << '(';
+      printType(Out, DstTy);
+      Out << ')';
+      break;
+    case Instruction::ZExt:
+    case Instruction::PtrToInt:
+    case Instruction::FPToUI: // For these, make sure we get an unsigned dest
+      Out << '(';
+      printSimpleType(Out, DstTy, false);
+      Out << ')';
+      break;
+    case Instruction::SExt: 
+    case Instruction::FPToSI: // For these, make sure we get a signed dest
+      Out << '(';
+      printSimpleType(Out, DstTy, true);
+      Out << ')';
+      break;
+    default:
+      assert(0 && "Invalid cast opcode");
+  }
+
+  // Print the source type cast
+  switch (opc) {
+    case Instruction::UIToFP:
+    case Instruction::ZExt:
+      Out << '(';
+      printSimpleType(Out, SrcTy, false);
+      Out << ')';
+      break;
+    case Instruction::SIToFP:
+    case Instruction::SExt:
+      Out << '(';
+      printSimpleType(Out, SrcTy, true); 
+      Out << ')';
+      break;
+    case Instruction::IntToPtr:
+    case Instruction::PtrToInt:
+      // Avoid "cast to pointer from integer of different size" warnings
+      Out << "(unsigned long)";
+      break;
+    case Instruction::Trunc:
+    case Instruction::BitCast:
+    case Instruction::FPExt:
+    case Instruction::FPTrunc:
+    case Instruction::FPToSI:
+    case Instruction::FPToUI:
+      break; // These don't need a source cast.
+    default:
+      assert(0 && "Invalid cast opcode");
+      break;
+  }
+}
+
+// printConstant - The LLVM Constant to C Constant converter.
+void CWriter::printConstant(Constant *CPV) {
+  if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CPV)) {
+    switch (CE->getOpcode()) {
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+      Out << "(";
+      printCast(CE->getOpcode(), CE->getOperand(0)->getType(), CE->getType());
+      if (CE->getOpcode() == Instruction::SExt &&
+          CE->getOperand(0)->getType() == Type::Int1Ty) {
+        // Make sure we really sext from bool here by subtracting from 0
+        Out << "0-";
+      }
+      printConstant(CE->getOperand(0));
+      if (CE->getType() == Type::Int1Ty &&
+          (CE->getOpcode() == Instruction::Trunc ||
+           CE->getOpcode() == Instruction::FPToUI ||
+           CE->getOpcode() == Instruction::FPToSI ||
+           CE->getOpcode() == Instruction::PtrToInt)) {
+        // Make sure we really truncate to bool here by anding with 1
+        Out << "&1u";
+      }
+      Out << ')';
+      return;
+
+    case Instruction::GetElementPtr:
+      Out << "(&(";
+      printIndexingExpression(CE->getOperand(0), gep_type_begin(CPV),
+                              gep_type_end(CPV));
+      Out << "))";
+      return;
+    case Instruction::Select:
+      Out << '(';
+      printConstant(CE->getOperand(0));
+      Out << '?';
+      printConstant(CE->getOperand(1));
+      Out << ':';
+      printConstant(CE->getOperand(2));
+      Out << ')';
+      return;
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+    case Instruction::SDiv:
+    case Instruction::UDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::ICmp:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    {
+      Out << '(';
+      bool NeedsClosingParens = printConstExprCast(CE); 
+      printConstantWithCast(CE->getOperand(0), CE->getOpcode());
+      switch (CE->getOpcode()) {
+      case Instruction::Add: Out << " + "; break;
+      case Instruction::Sub: Out << " - "; break;
+      case Instruction::Mul: Out << " * "; break;
+      case Instruction::URem:
+      case Instruction::SRem: 
+      case Instruction::FRem: Out << " % "; break;
+      case Instruction::UDiv: 
+      case Instruction::SDiv: 
+      case Instruction::FDiv: Out << " / "; break;
+      case Instruction::And: Out << " & "; break;
+      case Instruction::Or:  Out << " | "; break;
+      case Instruction::Xor: Out << " ^ "; break;
+      case Instruction::Shl: Out << " << "; break;
+      case Instruction::LShr:
+      case Instruction::AShr: Out << " >> "; break;
+      case Instruction::ICmp:
+        switch (CE->getPredicate()) {
+          case ICmpInst::ICMP_EQ: Out << " == "; break;
+          case ICmpInst::ICMP_NE: Out << " != "; break;
+          case ICmpInst::ICMP_SLT: 
+          case ICmpInst::ICMP_ULT: Out << " < "; break;
+          case ICmpInst::ICMP_SLE:
+          case ICmpInst::ICMP_ULE: Out << " <= "; break;
+          case ICmpInst::ICMP_SGT:
+          case ICmpInst::ICMP_UGT: Out << " > "; break;
+          case ICmpInst::ICMP_SGE:
+          case ICmpInst::ICMP_UGE: Out << " >= "; break;
+          default: assert(0 && "Illegal ICmp predicate");
+        }
+        break;
+      default: assert(0 && "Illegal opcode here!");
+      }
+      printConstantWithCast(CE->getOperand(1), CE->getOpcode());
+      if (NeedsClosingParens)
+        Out << "))";
+      Out << ')';
+      return;
+    }
+    case Instruction::FCmp: {
+      Out << '('; 
+      bool NeedsClosingParens = printConstExprCast(CE); 
+      if (CE->getPredicate() == FCmpInst::FCMP_FALSE)
+        Out << "0";
+      else if (CE->getPredicate() == FCmpInst::FCMP_TRUE)
+        Out << "1";
+      else {
+        const char* op = 0;
+        switch (CE->getPredicate()) {
+        default: assert(0 && "Illegal FCmp predicate");
+        case FCmpInst::FCMP_ORD: op = "ord"; break;
+        case FCmpInst::FCMP_UNO: op = "uno"; break;
+        case FCmpInst::FCMP_UEQ: op = "ueq"; break;
+        case FCmpInst::FCMP_UNE: op = "une"; break;
+        case FCmpInst::FCMP_ULT: op = "ult"; break;
+        case FCmpInst::FCMP_ULE: op = "ule"; break;
+        case FCmpInst::FCMP_UGT: op = "ugt"; break;
+        case FCmpInst::FCMP_UGE: op = "uge"; break;
+        case FCmpInst::FCMP_OEQ: op = "oeq"; break;
+        case FCmpInst::FCMP_ONE: op = "one"; break;
+        case FCmpInst::FCMP_OLT: op = "olt"; break;
+        case FCmpInst::FCMP_OLE: op = "ole"; break;
+        case FCmpInst::FCMP_OGT: op = "ogt"; break;
+        case FCmpInst::FCMP_OGE: op = "oge"; break;
+        }
+        Out << "llvm_fcmp_" << op << "(";
+        printConstantWithCast(CE->getOperand(0), CE->getOpcode());
+        Out << ", ";
+        printConstantWithCast(CE->getOperand(1), CE->getOpcode());
+        Out << ")";
+      }
+      if (NeedsClosingParens)
+        Out << "))";
+      Out << ')';
+      return;
+    }
+    default:
+      cerr << "CWriter Error: Unhandled constant expression: "
+           << *CE << "\n";
+      abort();
+    }
+  } else if (isa<UndefValue>(CPV) && CPV->getType()->isFirstClassType()) {
+    Out << "((";
+    printType(Out, CPV->getType()); // sign doesn't matter
+    Out << ")/*UNDEF*/0)";
+    return;
+  }
+
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
+    const Type* Ty = CI->getType();
+    if (Ty == Type::Int1Ty)
+      Out << (CI->getZExtValue() ? '1' : '0') ;
+    else {
+      Out << "((";
+      printSimpleType(Out, Ty, false) << ')';
+      if (CI->isMinValue(true)) 
+        Out << CI->getZExtValue() << 'u';
+      else
+        Out << CI->getSExtValue();
+      if (Ty->getPrimitiveSizeInBits() > 32)
+        Out << "ll";
+      Out << ')';
+    }
+    return;
+  } 
+
+  switch (CPV->getType()->getTypeID()) {
+  case Type::FloatTyID:
+  case Type::DoubleTyID: {
+    ConstantFP *FPC = cast<ConstantFP>(CPV);
+    std::map<const ConstantFP*, unsigned>::iterator I = FPConstantMap.find(FPC);
+    if (I != FPConstantMap.end()) {
+      // Because of FP precision problems we must load from a stack allocated
+      // value that holds the value in hex.
+      Out << "(*(" << (FPC->getType() == Type::FloatTy ? "float" : "double")
+          << "*)&FPConstant" << I->second << ')';
+    } else {
+      if (IsNAN(FPC->getValue())) {
+        // The value is NaN
+
+        // The prefix for a quiet NaN is 0x7FF8. For a signalling NaN,
+        // it's 0x7ff4.
+        const unsigned long QuietNaN = 0x7ff8UL;
+        //const unsigned long SignalNaN = 0x7ff4UL;
+
+        // We need to grab the first part of the FP #
+        char Buffer[100];
+
+        uint64_t ll = DoubleToBits(FPC->getValue());
+        sprintf(Buffer, "0x%llx", static_cast<long long>(ll));
+
+        std::string Num(&Buffer[0], &Buffer[6]);
+        unsigned long Val = strtoul(Num.c_str(), 0, 16);
+
+        if (FPC->getType() == Type::FloatTy)
+          Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "F(\""
+              << Buffer << "\") /*nan*/ ";
+        else
+          Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "(\""
+              << Buffer << "\") /*nan*/ ";
+      } else if (IsInf(FPC->getValue())) {
+        // The value is Inf
+        if (FPC->getValue() < 0) Out << '-';
+        Out << "LLVM_INF" << (FPC->getType() == Type::FloatTy ? "F" : "")
+            << " /*inf*/ ";
+      } else {
+        std::string Num;
+#if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A
+        // Print out the constant as a floating point number.
+        char Buffer[100];
+        sprintf(Buffer, "%a", FPC->getValue());
+        Num = Buffer;
+#else
+        Num = ftostr(FPC->getValue());
+#endif
+        Out << Num;
+      }
+    }
+    break;
+  }
+
+  case Type::ArrayTyID:
+    if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) {
+      const ArrayType *AT = cast<ArrayType>(CPV->getType());
+      Out << '{';
+      if (AT->getNumElements()) {
+        Out << ' ';
+        Constant *CZ = Constant::getNullValue(AT->getElementType());
+        printConstant(CZ);
+        for (unsigned i = 1, e = AT->getNumElements(); i != e; ++i) {
+          Out << ", ";
+          printConstant(CZ);
+        }
+      }
+      Out << " }";
+    } else {
+      printConstantArray(cast<ConstantArray>(CPV));
+    }
+    break;
+
+  case Type::VectorTyID:
+    if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) {
+      const VectorType *AT = cast<VectorType>(CPV->getType());
+      Out << '{';
+      if (AT->getNumElements()) {
+        Out << ' ';
+        Constant *CZ = Constant::getNullValue(AT->getElementType());
+        printConstant(CZ);
+        for (unsigned i = 1, e = AT->getNumElements(); i != e; ++i) {
+          Out << ", ";
+          printConstant(CZ);
+        }
+      }
+      Out << " }";
+    } else {
+      printConstantVector(cast<ConstantVector>(CPV));
+    }
+    break;
+
+  case Type::StructTyID:
+    if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) {
+      const StructType *ST = cast<StructType>(CPV->getType());
+      Out << '{';
+      if (ST->getNumElements()) {
+        Out << ' ';
+        printConstant(Constant::getNullValue(ST->getElementType(0)));
+        for (unsigned i = 1, e = ST->getNumElements(); i != e; ++i) {
+          Out << ", ";
+          printConstant(Constant::getNullValue(ST->getElementType(i)));
+        }
+      }
+      Out << " }";
+    } else {
+      Out << '{';
+      if (CPV->getNumOperands()) {
+        Out << ' ';
+        printConstant(cast<Constant>(CPV->getOperand(0)));
+        for (unsigned i = 1, e = CPV->getNumOperands(); i != e; ++i) {
+          Out << ", ";
+          printConstant(cast<Constant>(CPV->getOperand(i)));
+        }
+      }
+      Out << " }";
+    }
+    break;
+
+  case Type::PointerTyID:
+    if (isa<ConstantPointerNull>(CPV)) {
+      Out << "((";
+      printType(Out, CPV->getType()); // sign doesn't matter
+      Out << ")/*NULL*/0)";
+      break;
+    } else if (GlobalValue *GV = dyn_cast<GlobalValue>(CPV)) {
+      writeOperand(GV);
+      break;
+    }
+    // FALL THROUGH
+  default:
+    cerr << "Unknown constant type: " << *CPV << "\n";
+    abort();
+  }
+}
+
+// Some constant expressions need to be casted back to the original types
+// because their operands were casted to the expected type. This function takes
+// care of detecting that case and printing the cast for the ConstantExpr.
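+// For example, a udiv constant expression over i32 operands opens roughly
+// "((unsigned int)(" here, and printConstant closes it again with "))".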
+bool CWriter::printConstExprCast(const ConstantExpr* CE) {
+  bool NeedsExplicitCast = false;
+  const Type *Ty = CE->getOperand(0)->getType();
+  bool TypeIsSigned = false;
+  switch (CE->getOpcode()) {
+  case Instruction::LShr:
+  case Instruction::URem: 
+  case Instruction::UDiv: NeedsExplicitCast = true; break;
+  case Instruction::AShr:
+  case Instruction::SRem: 
+  case Instruction::SDiv: NeedsExplicitCast = true; TypeIsSigned = true; break;
+  case Instruction::SExt:
+    Ty = CE->getType();
+    NeedsExplicitCast = true;
+    TypeIsSigned = true;
+    break;
+  case Instruction::ZExt:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::BitCast:
+    Ty = CE->getType();
+    NeedsExplicitCast = true;
+    break;
+  default: break;
+  }
+  if (NeedsExplicitCast) {
+    Out << "((";
+    if (Ty->isInteger() && Ty != Type::Int1Ty)
+      printSimpleType(Out, Ty, TypeIsSigned);
+    else
+      printType(Out, Ty); // not integer, sign doesn't matter
+    Out << ")(";
+  }
+  return NeedsExplicitCast;
+}
+
+//  Print a constant assuming that it is the operand for a given Opcode. The
+//  opcodes that care about sign need to cast their operands to the expected
+//  type before the operation proceeds. This function does the casting.
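+//  For example, an i32 operand of an lshr is printed roughly as
+//  "((unsigned int)C)" so the C-level shift is performed on an unsigned value.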
+void CWriter::printConstantWithCast(Constant* CPV, unsigned Opcode) {
+
+  // Extract the operand's type, we'll need it.
+  const Type* OpTy = CPV->getType();
+
+  // Indicate whether to do the cast or not.
+  bool shouldCast = false;
+  bool typeIsSigned = false;
+
+  // Based on the Opcode for which this Constant is being written, decide
+  // whether the operand needs an explicit cast and, if so, whether the cast
+  // should be to a signed or unsigned type.
+  switch (Opcode) {
+    default:
+      // for most instructions, it doesn't matter
+      break; 
+    case Instruction::LShr:
+    case Instruction::UDiv:
+    case Instruction::URem:
+      shouldCast = true;
+      break;
+    case Instruction::AShr:
+    case Instruction::SDiv:
+    case Instruction::SRem:
+      shouldCast = true;
+      typeIsSigned = true;
+      break;
+  }
+
+  // Write out the casted constant if we should, otherwise just write the
+  // operand.
+  if (shouldCast) {
+    Out << "((";
+    printSimpleType(Out, OpTy, typeIsSigned);
+    Out << ")";
+    printConstant(CPV);
+    Out << ")";
+  } else 
+    printConstant(CPV);
+}
+
+std::string CWriter::GetValueName(const Value *Operand) {
+  std::string Name;
+
+  if (!isa<GlobalValue>(Operand) && Operand->getName() != "") {
+    std::string VarName;
+
+    Name = Operand->getName();
+    VarName.reserve(Name.capacity());
+
+    for (std::string::iterator I = Name.begin(), E = Name.end();
+         I != E; ++I) {
+      char ch = *I;
+
+      if (!((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
+            (ch >= '0' && ch <= '9') || ch == '_'))
+        VarName += '_';
+      else
+        VarName += ch;
+    }
+
+    Name = "llvm_cbe_" + VarName;
+  } else {
+    Name = Mang->getValueName(Operand);
+  }
+
+  return Name;
+}
+
+void CWriter::writeOperandInternal(Value *Operand) {
+  if (Instruction *I = dyn_cast<Instruction>(Operand))
+    if (isInlinableInst(*I) && !isDirectAlloca(I)) {
+      // Should we inline this instruction to build a tree?
+      Out << '(';
+      visit(*I);
+      Out << ')';
+      return;
+    }
+
+  Constant* CPV = dyn_cast<Constant>(Operand);
+
+  if (CPV && !isa<GlobalValue>(CPV))
+    printConstant(CPV);
+  else
+    Out << GetValueName(Operand);
+}
+
+void CWriter::writeOperandRaw(Value *Operand) {
+  Constant* CPV = dyn_cast<Constant>(Operand);
+  if (CPV && !isa<GlobalValue>(CPV)) {
+    printConstant(CPV);
+  } else {
+    Out << GetValueName(Operand);
+  }
+}
+
+void CWriter::writeOperand(Value *Operand) {
+  if (isa<GlobalVariable>(Operand) || isDirectAlloca(Operand))
+    Out << "(&";  // Global variables are referenced as their addresses by llvm
+
+  writeOperandInternal(Operand);
+
+  if (isa<GlobalVariable>(Operand) || isDirectAlloca(Operand))
+    Out << ')';
+}
+
+// Some instructions need to have their result value casted back to the 
+// original types because their operands were casted to the expected type. 
+// This function takes care of detecting that case and printing the cast 
+// for the Instruction.
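+// For example, an ashr on i16 operands opens roughly "((signed short)(" here,
+// and the caller closes it with "))" so the result is reinterpreted with the
+// operands' original signedness.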
+bool CWriter::writeInstructionCast(const Instruction &I) {
+  const Type *Ty = I.getOperand(0)->getType();
+  switch (I.getOpcode()) {
+  case Instruction::LShr:
+  case Instruction::URem: 
+  case Instruction::UDiv: 
+    Out << "((";
+    printSimpleType(Out, Ty, false);
+    Out << ")(";
+    return true;
+  case Instruction::AShr:
+  case Instruction::SRem: 
+  case Instruction::SDiv: 
+    Out << "((";
+    printSimpleType(Out, Ty, true);
+    Out << ")(";
+    return true;
+  default: break;
+  }
+  return false;
+}
+
+// Write the operand with a cast to another type based on the Opcode being used.
+// This will be used in cases where an instruction has specific type
+// requirements (usually signedness) for its operands. 
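+// For example, both operands of a udiv on i32 are printed roughly as
+// "((unsigned int)X)" so C performs an unsigned division, while sdiv operands
+// get a signed cast instead.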
+void CWriter::writeOperandWithCast(Value* Operand, unsigned Opcode) {
+
+  // Extract the operand's type, we'll need it.
+  const Type* OpTy = Operand->getType();
+
+  // Indicate whether to do the cast or not.
+  bool shouldCast = false;
+
+  // Indicate whether the cast should be to a signed type or not.
+  bool castIsSigned = false;
+
+  // Based on the Opcode for which this Operand is being written, decide
+  // whether a cast is needed and whether it should be to a signed or
+  // unsigned type.
+  switch (Opcode) {
+    default:
+      // for most instructions, it doesn't matter
+      break; 
+    case Instruction::LShr:
+    case Instruction::UDiv:
+    case Instruction::URem: // Cast to unsigned first
+      shouldCast = true;
+      castIsSigned = false;
+      break;
+    case Instruction::AShr:
+    case Instruction::SDiv:
+    case Instruction::SRem: // Cast to signed first
+      shouldCast = true;
+      castIsSigned = true;
+      break;
+  }
+
+  // Write out the casted operand if we should, otherwise just write the
+  // operand.
+  if (shouldCast) {
+    Out << "((";
+    printSimpleType(Out, OpTy, castIsSigned);
+    Out << ")";
+    writeOperand(Operand);
+    Out << ")";
+  } else 
+    writeOperand(Operand);
+}
+
+// Write the operand with a cast to another type based on the icmp predicate 
+// being used. 
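+// For example, the operands of an "icmp ult" get an unsigned cast and those of
+// "icmp slt" a signed cast, so the emitted "<" matches the LLVM predicate;
+// eq and ne need no cast.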
+void CWriter::writeOperandWithCast(Value* Operand, ICmpInst::Predicate predicate) {
+
+  // Extract the operand's type, we'll need it.
+  const Type* OpTy = Operand->getType();
+
+  // Indicate whether to do the cast or not.
+  bool shouldCast = false;
+
+  // Indicate whether the cast should be to a signed type or not.
+  bool castIsSigned = false;
+
+  // Based on the predicate for which this Operand is being written, decide
+  // whether a cast is needed and whether it should be to a signed or
+  // unsigned type.
+  switch (predicate) {
+    default:
+      // for eq and ne, it doesn't matter
+      break; 
+    case ICmpInst::ICMP_UGT:
+    case ICmpInst::ICMP_UGE:
+    case ICmpInst::ICMP_ULT:
+    case ICmpInst::ICMP_ULE:
+      shouldCast = true;
+      break;
+    case ICmpInst::ICMP_SGT:
+    case ICmpInst::ICMP_SGE:
+    case ICmpInst::ICMP_SLT:
+    case ICmpInst::ICMP_SLE:
+      shouldCast = true;
+      castIsSigned = true;
+      break;
+  }
+
+  // Write out the casted operand if we should, otherwise just write the
+  // operand.
+  if (shouldCast) {
+    Out << "((";
+    if (OpTy->isInteger() && OpTy != Type::Int1Ty)
+      printSimpleType(Out, OpTy, castIsSigned);
+    else
+      printType(Out, OpTy); // not integer, sign doesn't matter
+    Out << ")";
+    writeOperand(Operand);
+    Out << ")";
+  } else 
+    writeOperand(Operand);
+}
+
+// generateCompilerSpecificCode - This is where we add conditional compilation
+// directives to cater to specific compilers as need be.
+//
+static void generateCompilerSpecificCode(std::ostream& Out) {
+  // Alloca is hard to get, and we don't want to include stdlib.h here.
+  Out << "/* get a declaration for alloca */\n"
+      << "#if defined(__CYGWIN__) || defined(__MINGW32__)\n"
+      << "#define  alloca(x) __builtin_alloca((x))\n"
+      << "#define _alloca(x) __builtin_alloca((x))\n"    
+      << "#elif defined(__APPLE__)\n"
+      << "extern void *__builtin_alloca(unsigned long);\n"
+      << "#define alloca(x) __builtin_alloca(x)\n"
+      << "#define longjmp _longjmp\n"
+      << "#define setjmp _setjmp\n"
+      << "#elif defined(__sun__)\n"
+      << "#if defined(__sparcv9)\n"
+      << "extern void *__builtin_alloca(unsigned long);\n"
+      << "#else\n"
+      << "extern void *__builtin_alloca(unsigned int);\n"
+      << "#endif\n"
+      << "#define alloca(x) __builtin_alloca(x)\n"
+      << "#elif defined(__FreeBSD__) || defined(__OpenBSD__)\n"
+      << "#define alloca(x) __builtin_alloca(x)\n"
+      << "#elif defined(_MSC_VER)\n"
+      << "#define inline _inline\n"
+      << "#define alloca(x) _alloca(x)\n"
+      << "#else\n"
+      << "#include <alloca.h>\n"
+      << "#endif\n\n";
+
+  // We output GCC specific attributes to preserve 'linkonce'ness on globals.
+  // If we aren't being compiled with GCC, just drop these attributes.
+  Out << "#ifndef __GNUC__  /* Can only support \"linkonce\" vars with GCC */\n"
+      << "#define __attribute__(X)\n"
+      << "#endif\n\n";
+
+  // On Mac OS X, "external weak" is spelled "__attribute__((weak_import))".
+  Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n"
+      << "#define __EXTERNAL_WEAK__ __attribute__((weak_import))\n"
+      << "#elif defined(__GNUC__)\n"
+      << "#define __EXTERNAL_WEAK__ __attribute__((weak))\n"
+      << "#else\n"
+      << "#define __EXTERNAL_WEAK__\n"
+      << "#endif\n\n";
+
+  // For now, turn off the weak linkage attribute on Mac OS X. (See above.)
+  Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n"
+      << "#define __ATTRIBUTE_WEAK__\n"
+      << "#elif defined(__GNUC__)\n"
+      << "#define __ATTRIBUTE_WEAK__ __attribute__((weak))\n"
+      << "#else\n"
+      << "#define __ATTRIBUTE_WEAK__\n"
+      << "#endif\n\n";
+
+  // Add hidden visibility support. FIXME: APPLE_CC?
+  Out << "#if defined(__GNUC__)\n"
+      << "#define __HIDDEN__ __attribute__((visibility(\"hidden\")))\n"
+      << "#endif\n\n";
+    
+  // Define NaN and Inf as GCC builtins if using GCC, as 0 otherwise
+  // From the GCC documentation:
+  //
+  //   double __builtin_nan (const char *str)
+  //
+  // This is an implementation of the ISO C99 function nan.
+  //
+  // Since ISO C99 defines this function in terms of strtod, which we do
+  // not implement, a description of the parsing is in order. The string is
+  // parsed as by strtol; that is, the base is recognized by leading 0 or
+  // 0x prefixes. The number parsed is placed in the significand such that
+  // the least significant bit of the number is at the least significant
+  // bit of the significand. The number is truncated to fit the significand
+  // field provided. The significand is forced to be a quiet NaN.
+  //
+  // This function, if given a string literal, is evaluated early enough
+  // that it is considered a compile-time constant.
+  //
+  //   float __builtin_nanf (const char *str)
+  //
+  // Similar to __builtin_nan, except the return type is float.
+  //
+  //   double __builtin_inf (void)
+  //
+  // Similar to __builtin_huge_val, except a warning is generated if the
+  // target floating-point format does not support infinities. This
+  // function is suitable for implementing the ISO C99 macro INFINITY.
+  //
+  //   float __builtin_inff (void)
+  //
+  // Similar to __builtin_inf, except the return type is float.
+  Out << "#ifdef __GNUC__\n"
+      << "#define LLVM_NAN(NanStr)   __builtin_nan(NanStr)   /* Double */\n"
+      << "#define LLVM_NANF(NanStr)  __builtin_nanf(NanStr)  /* Float */\n"
+      << "#define LLVM_NANS(NanStr)  __builtin_nans(NanStr)  /* Double */\n"
+      << "#define LLVM_NANSF(NanStr) __builtin_nansf(NanStr) /* Float */\n"
+      << "#define LLVM_INF           __builtin_inf()         /* Double */\n"
+      << "#define LLVM_INFF          __builtin_inff()        /* Float */\n"
+      << "#define LLVM_PREFETCH(addr,rw,locality) "
+                              "__builtin_prefetch(addr,rw,locality)\n"
+      << "#define __ATTRIBUTE_CTOR__ __attribute__((constructor))\n"
+      << "#define __ATTRIBUTE_DTOR__ __attribute__((destructor))\n"
+      << "#define LLVM_ASM           __asm__\n"
+      << "#else\n"
+      << "#define LLVM_NAN(NanStr)   ((double)0.0)           /* Double */\n"
+      << "#define LLVM_NANF(NanStr)  0.0F                    /* Float */\n"
+      << "#define LLVM_NANS(NanStr)  ((double)0.0)           /* Double */\n"
+      << "#define LLVM_NANSF(NanStr) 0.0F                    /* Float */\n"
+      << "#define LLVM_INF           ((double)0.0)           /* Double */\n"
+      << "#define LLVM_INFF          0.0F                    /* Float */\n"
+      << "#define LLVM_PREFETCH(addr,rw,locality)            /* PREFETCH */\n"
+      << "#define __ATTRIBUTE_CTOR__\n"
+      << "#define __ATTRIBUTE_DTOR__\n"
+      << "#define LLVM_ASM(X)\n"
+      << "#endif\n\n";
+  
+  Out << "#if __GNUC__ < 4 /* Old GCC's, or compilers not GCC */ \n"
+      << "#define __builtin_stack_save() 0   /* not implemented */\n"
+      << "#define __builtin_stack_restore(X) /* noop */\n"
+      << "#endif\n\n";
+
+  // Output target-specific code that should be inserted into main.
+  Out << "#define CODE_FOR_MAIN() /* Any target-specific code for main()*/\n";
+  // On X86, set the FP control word to 64-bits of precision instead of 80 bits.
+  Out << "#if defined(__GNUC__) && !defined(__llvm__)\n"
+      << "#if defined(i386) || defined(__i386__) || defined(__i386) || "
+      << "defined(__x86_64__)\n"
+      << "#undef CODE_FOR_MAIN\n"
+      << "#define CODE_FOR_MAIN() \\\n"
+      << "  {short F;__asm__ (\"fnstcw %0\" : \"=m\" (*&F)); \\\n"
+      << "  F=(F&~0x300)|0x200;__asm__(\"fldcw %0\"::\"m\"(*&F));}\n"
+      << "#endif\n#endif\n";
+
+}
+
+/// FindStaticTors - Given a static ctor/dtor list, unpack its contents into
+/// the StaticTors set.
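+/// For example, an entry such as "{ i32 65535, void ()* @ctor }" in
+/// llvm.global_ctors contributes @ctor to the set, and a null function pointer
+/// entry terminates the list.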
+static void FindStaticTors(GlobalVariable *GV, std::set<Function*> &StaticTors){
+  ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+  if (!InitList) return;
+  
+  for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
+    if (ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i))){
+      if (CS->getNumOperands() != 2) return;  // Not array of 2-element structs.
+      
+      if (CS->getOperand(1)->isNullValue())
+        return;  // Found a null terminator, exit printing.
+      Constant *FP = CS->getOperand(1);
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(FP))
+        if (CE->isCast())
+          FP = CE->getOperand(0);
+      if (Function *F = dyn_cast<Function>(FP))
+        StaticTors.insert(F);
+    }
+}
+
+enum SpecialGlobalClass {
+  NotSpecial = 0,
+  GlobalCtors, GlobalDtors,
+  NotPrinted
+};
+
+/// getGlobalVariableClass - If this is a global that is specially recognized
+/// by LLVM, return a code that indicates how we should handle it.
+static SpecialGlobalClass getGlobalVariableClass(const GlobalVariable *GV) {
+  // If this is a global ctors/dtors list, handle it now.
+  if (GV->hasAppendingLinkage() && GV->use_empty()) {
+    if (GV->getName() == "llvm.global_ctors")
+      return GlobalCtors;
+    else if (GV->getName() == "llvm.global_dtors")
+      return GlobalDtors;
+  }
+  
+  // Otherwise, if it is other metadata, don't print it.  This catches things
+  // like debug information.
+  if (GV->getSection() == "llvm.metadata")
+    return NotPrinted;
+  
+  return NotSpecial;
+}
+
+
+bool CWriter::doInitialization(Module &M) {
+  // Initialize
+  TheModule = &M;
+
+  TD = new TargetData(&M);
+  IL = new IntrinsicLowering(*TD);
+  IL->AddPrototypes(M);
+
+  // Ensure that all structure types have names...
+  Mang = new Mangler(M);
+  Mang->markCharUnacceptable('.');
+
+  // Keep track of which functions are static ctors/dtors so they can have
+  // an attribute added to their prototypes.
+  std::set<Function*> StaticCtors, StaticDtors;
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+    switch (getGlobalVariableClass(I)) {
+    default: break;
+    case GlobalCtors:
+      FindStaticTors(I, StaticCtors);
+      break;
+    case GlobalDtors:
+      FindStaticTors(I, StaticDtors);
+      break;
+    }
+  }
+  
+  // get declaration for alloca
+  Out << "/* Provide Declarations */\n";
+  Out << "#include <stdarg.h>\n";      // Varargs support
+  Out << "#include <setjmp.h>\n";      // Unwind support
+  generateCompilerSpecificCode(Out);
+
+  // Provide a definition for `bool' if not compiling with a C++ compiler.
+  Out << "\n"
+      << "#ifndef __cplusplus\ntypedef unsigned char bool;\n#endif\n"
+
+      << "\n\n/* Support for floating point constants */\n"
+      << "typedef unsigned long long ConstantDoubleTy;\n"
+      << "typedef unsigned int        ConstantFloatTy;\n"
+
+      << "\n\n/* Global Declarations */\n";
+
+  // First output all the declarations for the program, because C requires
+  // Functions & globals to be declared before they are used.
+  //
+
+  // Loop over the symbol table, emitting all named constants...
+  printModuleTypes(M.getTypeSymbolTable());
+
+  // Global variable declarations...
+  if (!M.global_empty()) {
+    Out << "\n/* External Global Variable Declarations */\n";
+    for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+         I != E; ++I) {
+
+      if (I->hasExternalLinkage() || I->hasExternalWeakLinkage())
+        Out << "extern ";
+      else if (I->hasDLLImportLinkage())
+        Out << "__declspec(dllimport) ";
+      else
+        continue; // Internal Global
+
+      // Thread Local Storage
+      if (I->isThreadLocal())
+        Out << "__thread ";
+
+      printType(Out, I->getType()->getElementType(), false, GetValueName(I));
+
+      if (I->hasExternalWeakLinkage())
+         Out << " __EXTERNAL_WEAK__";
+      Out << ";\n";
+    }
+  }
+
+  // Function declarations
+  Out << "\n/* Function Declarations */\n";
+  Out << "double fmod(double, double);\n";   // Support for FP rem
+  Out << "float fmodf(float, float);\n";
+  
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    // Don't print declarations for intrinsic functions.
+    if (!I->getIntrinsicID() && I->getName() != "setjmp" && 
+        I->getName() != "longjmp" && I->getName() != "_setjmp") {
+      if (I->hasExternalWeakLinkage())
+        Out << "extern ";
+      printFunctionSignature(I, true);
+      if (I->hasWeakLinkage() || I->hasLinkOnceLinkage()) 
+        Out << " __ATTRIBUTE_WEAK__";
+      if (I->hasExternalWeakLinkage())
+        Out << " __EXTERNAL_WEAK__";
+      if (StaticCtors.count(I))
+        Out << " __ATTRIBUTE_CTOR__";
+      if (StaticDtors.count(I))
+        Out << " __ATTRIBUTE_DTOR__";
+      if (I->hasHiddenVisibility())
+        Out << " __HIDDEN__";
+      
+      if (I->hasName() && I->getName()[0] == 1)
+        Out << " LLVM_ASM(\"" << I->getName().c_str()+1 << "\")";
+          
+      Out << ";\n";
+    }
+  }
+
+  // Output the global variable declarations
+  if (!M.global_empty()) {
+    Out << "\n\n/* Global Variable Declarations */\n";
+    for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+         I != E; ++I)
+      if (!I->isDeclaration()) {
+        // Ignore special globals, such as debug info.
+        if (getGlobalVariableClass(I))
+          continue;
+
+        if (I->hasInternalLinkage())
+          Out << "static ";
+        else
+          Out << "extern ";
+
+        // Thread Local Storage
+        if (I->isThreadLocal())
+          Out << "__thread ";
+
+        printType(Out, I->getType()->getElementType(), false, 
+                  GetValueName(I));
+
+        if (I->hasLinkOnceLinkage())
+          Out << " __attribute__((common))";
+        else if (I->hasWeakLinkage())
+          Out << " __ATTRIBUTE_WEAK__";
+        else if (I->hasExternalWeakLinkage())
+          Out << " __EXTERNAL_WEAK__";
+        if (I->hasHiddenVisibility())
+          Out << " __HIDDEN__";
+        Out << ";\n";
+      }
+  }
+
+  // Output the global variable definitions and contents...
+  if (!M.global_empty()) {
+    Out << "\n\n/* Global Variable Definitions and Initialization */\n";
+    for (Module::global_iterator I = M.global_begin(), E = M.global_end(); 
+         I != E; ++I)
+      if (!I->isDeclaration()) {
+        // Ignore special globals, such as debug info.
+        if (getGlobalVariableClass(I))
+          continue;
+
+        if (I->hasInternalLinkage())
+          Out << "static ";
+        else if (I->hasDLLImportLinkage())
+          Out << "__declspec(dllimport) ";
+        else if (I->hasDLLExportLinkage())
+          Out << "__declspec(dllexport) ";
+
+        // Thread Local Storage
+        if (I->isThreadLocal())
+          Out << "__thread ";
+
+        printType(Out, I->getType()->getElementType(), false, 
+                  GetValueName(I));
+        if (I->hasLinkOnceLinkage())
+          Out << " __attribute__((common))";
+        else if (I->hasWeakLinkage())
+          Out << " __ATTRIBUTE_WEAK__";
+
+        if (I->hasHiddenVisibility())
+          Out << " __HIDDEN__";
+        
+        // If the initializer is not null, emit the initializer.  If it is null,
+        // we try to avoid emitting large amounts of zeros.  The problem with
+        // this, however, occurs when the variable has weak linkage.  In this
+        // case, the assembler will complain about the variable being both weak
+        // and common, so we disable this optimization.
+        if (!I->getInitializer()->isNullValue()) {
+          Out << " = " ;
+          writeOperand(I->getInitializer());
+        } else if (I->hasWeakLinkage()) {
+          // We have to specify an initializer, but it doesn't have to be
+          // complete.  If the value is an aggregate, print out { 0 }, and let
+          // the compiler figure out the rest of the zeros.
+          Out << " = " ;
+          if (isa<StructType>(I->getInitializer()->getType()) ||
+              isa<ArrayType>(I->getInitializer()->getType()) ||
+              isa<VectorType>(I->getInitializer()->getType())) {
+            Out << "{ 0 }";
+          } else {
+            // Just print it out normally.
+            writeOperand(I->getInitializer());
+          }
+        }
+        Out << ";\n";
+      }
+  }
+
+  if (!M.empty())
+    Out << "\n\n/* Function Bodies */\n";
+
+  // Emit some helper functions for dealing with FCMP instruction's 
+  // predicates
+  Out << "static inline int llvm_fcmp_ord(double X, double Y) { ";
+  Out << "return X == X && Y == Y; }\n";
+  Out << "static inline int llvm_fcmp_uno(double X, double Y) { ";
+  Out << "return X != X || Y != Y; }\n";
+  Out << "static inline int llvm_fcmp_ueq(double X, double Y) { ";
+  Out << "return X == Y || llvm_fcmp_uno(X, Y); }\n";
+  Out << "static inline int llvm_fcmp_une(double X, double Y) { ";
+  Out << "return X != Y; }\n";
+  Out << "static inline int llvm_fcmp_ult(double X, double Y) { ";
+  Out << "return X <  Y || llvm_fcmp_uno(X, Y); }\n";
+  Out << "static inline int llvm_fcmp_ugt(double X, double Y) { ";
+  Out << "return X >  Y || llvm_fcmp_uno(X, Y); }\n";
+  Out << "static inline int llvm_fcmp_ule(double X, double Y) { ";
+  Out << "return X <= Y || llvm_fcmp_uno(X, Y); }\n";
+  Out << "static inline int llvm_fcmp_uge(double X, double Y) { ";
+  Out << "return X >= Y || llvm_fcmp_uno(X, Y); }\n";
+  Out << "static inline int llvm_fcmp_oeq(double X, double Y) { ";
+  Out << "return X == Y ; }\n";
+  Out << "static inline int llvm_fcmp_one(double X, double Y) { ";
+  Out << "return X != Y && llvm_fcmp_ord(X, Y); }\n";
+  Out << "static inline int llvm_fcmp_olt(double X, double Y) { ";
+  Out << "return X <  Y ; }\n";
+  Out << "static inline int llvm_fcmp_ogt(double X, double Y) { ";
+  Out << "return X >  Y ; }\n";
+  Out << "static inline int llvm_fcmp_ole(double X, double Y) { ";
+  Out << "return X <= Y ; }\n";
+  Out << "static inline int llvm_fcmp_oge(double X, double Y) { ";
+  Out << "return X >= Y ; }\n";
+  return false;
+}
+
+
+/// Output all floating point constants that cannot be printed accurately...
+void CWriter::printFloatingPointConstants(Function &F) {
+  // Scan the module for floating point constants.  If any FP constant is used
+  // in the function, we want to redirect it here so that we do not depend on
+  // the precision of the printed form, unless the printed form preserves
+  // precision.
+  //
+  static unsigned FPCounter = 0;
+  for (constant_iterator I = constant_begin(&F), E = constant_end(&F);
+       I != E; ++I)
+    if (const ConstantFP *FPC = dyn_cast<ConstantFP>(*I))
+      if (!isFPCSafeToPrint(FPC) && // Do not put in FPConstantMap if safe.
+          !FPConstantMap.count(FPC)) {
+        double Val = FPC->getValue();
+
+        FPConstantMap[FPC] = FPCounter;  // Number the FP constants
+
+        if (FPC->getType() == Type::DoubleTy) {
+          Out << "static const ConstantDoubleTy FPConstant" << FPCounter++
+              << " = 0x" << std::hex << DoubleToBits(Val) << std::dec
+              << "ULL;    /* " << Val << " */\n";
+        } else if (FPC->getType() == Type::FloatTy) {
+          Out << "static const ConstantFloatTy FPConstant" << FPCounter++
+              << " = 0x" << std::hex << FloatToBits(Val) << std::dec
+              << "U;    /* " << Val << " */\n";
+        } else
+          assert(0 && "Unknown float type!");
+      }
+
+  Out << '\n';
+}
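+
+// Illustrative example (not from the original patch): if a function used a
+// double constant whose decimal form would lose precision, say one with bit
+// pattern 0x3ff8000000000000 (the value 1.5, chosen only for illustration),
+// the loop above would emit roughly:
+//   static const ConstantDoubleTy FPConstant0 = 0x3ff8000000000000ULL;    /* 1.5 */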
+
+
+/// printModuleTypes - Run through the type symbol table looking for type
+/// names.  If a type name is found, emit its declaration...
+///
+void CWriter::printModuleTypes(const TypeSymbolTable &TST) {
+  Out << "/* Helper union for bitcasts */\n";
+  Out << "typedef union {\n";
+  Out << "  unsigned int Int32;\n";
+  Out << "  unsigned long long Int64;\n";
+  Out << "  float Float;\n";
+  Out << "  double Double;\n";
+  Out << "} llvmBitCastUnion;\n";
+
+  // We are only interested in the type plane of the symbol table.
+  TypeSymbolTable::const_iterator I   = TST.begin();
+  TypeSymbolTable::const_iterator End = TST.end();
+
+  // If there are no type names, exit early.
+  if (I == End) return;
+
+  // Print out forward declarations for structure types before anything else!
+  Out << "/* Structure forward decls */\n";
+  for (; I != End; ++I) {
+    std::string Name = "struct l_" + Mang->makeNameProper(I->first);
+    Out << Name << ";\n";
+    TypeNames.insert(std::make_pair(I->second, Name));
+  }
+
+  Out << '\n';
+
+  // Now we can print out typedefs.  Above, we guaranteed that this can only be
+  // for struct or opaque types.
+  Out << "/* Typedefs */\n";
+  for (I = TST.begin(); I != End; ++I) {
+    std::string Name = "l_" + Mang->makeNameProper(I->first);
+    Out << "typedef ";
+    printType(Out, I->second, false, Name);
+    Out << ";\n";
+  }
+
+  Out << '\n';
+
+  // Keep track of which structures have been printed so far...
+  std::set<const StructType *> StructPrinted;
+
+  // Loop over all structures, then push them onto the stack so they are
+  // printed in the correct order.
+  //
+  Out << "/* Structure contents */\n";
+  for (I = TST.begin(); I != End; ++I)
+    if (const StructType *STy = dyn_cast<StructType>(I->second))
+      // Only print out used types!
+      printContainedStructs(STy, StructPrinted);
+}
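+
+// Illustrative example (not from the original patch): for a module containing
+// a struct type named "foo" (a hypothetical name), the code above would emit
+// roughly:
+//   /* Structure forward decls */
+//   struct l_foo;
+//   /* Typedefs */
+//   typedef struct l_foo l_foo;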
+
+// Push the struct onto the stack and recursively push all structs
+// this one depends on.
+//
+// TODO:  Make this work properly with vector types
+//
+void CWriter::printContainedStructs(const Type *Ty,
+                                    std::set<const StructType*> &StructPrinted){
+  // Don't walk through pointers.
+  if (isa<PointerType>(Ty) || Ty->isPrimitiveType() || Ty->isInteger()) return;
+  
+  // Print all contained types first.
+  for (Type::subtype_iterator I = Ty->subtype_begin(),
+       E = Ty->subtype_end(); I != E; ++I)
+    printContainedStructs(*I, StructPrinted);
+  
+  if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+    // Check to see if we have already printed this struct.
+    if (StructPrinted.insert(STy).second) {
+      // Print structure type out.
+      std::string Name = TypeNames[STy];
+      printType(Out, STy, false, Name, true);
+      Out << ";\n\n";
+    }
+  }
+}
+
+void CWriter::printFunctionSignature(const Function *F, bool Prototype) {
+  /// isStructReturn - Should this function actually return a struct by-value?
+  bool isStructReturn = F->getFunctionType()->isStructReturn();
+  
+  if (F->hasInternalLinkage()) Out << "static ";
+  if (F->hasDLLImportLinkage()) Out << "__declspec(dllimport) ";
+  if (F->hasDLLExportLinkage()) Out << "__declspec(dllexport) ";  
+  switch (F->getCallingConv()) {
+   case CallingConv::X86_StdCall:
+    Out << "__stdcall ";
+    break;
+   case CallingConv::X86_FastCall:
+    Out << "__fastcall ";
+    break;
+  }
+  
+  // Loop over the arguments, printing them...
+  const FunctionType *FT = cast<FunctionType>(F->getFunctionType());
+  const ParamAttrsList *Attrs = FT->getParamAttrs();
+
+  std::stringstream FunctionInnards;
+
+  // Print out the name...
+  FunctionInnards << GetValueName(F) << '(';
+
+  bool PrintedArg = false;
+  if (!F->isDeclaration()) {
+    if (!F->arg_empty()) {
+      Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+      
+      // If this is a struct-return function, don't print the hidden
+      // struct-return argument.
+      if (isStructReturn) {
+        assert(I != E && "Invalid struct return function!");
+        ++I;
+      }
+      
+      std::string ArgName;
+      unsigned Idx = 1;
+      for (; I != E; ++I) {
+        if (PrintedArg) FunctionInnards << ", ";
+        if (I->hasName() || !Prototype)
+          ArgName = GetValueName(I);
+        else
+          ArgName = "";
+        printType(FunctionInnards, I->getType(), 
+            /*isSigned=*/Attrs && Attrs->paramHasAttr(Idx, ParamAttr::SExt), 
+            ArgName);
+        PrintedArg = true;
+        ++Idx;
+      }
+    }
+  } else {
+    // Loop over the arguments, printing them.
+    FunctionType::param_iterator I = FT->param_begin(), E = FT->param_end();
+    
+    // If this is a struct-return function, don't print the hidden
+    // struct-return argument.
+    if (isStructReturn) {
+      assert(I != E && "Invalid struct return function!");
+      ++I;
+    }
+    
+    unsigned Idx = 1;
+    for (; I != E; ++I) {
+      if (PrintedArg) FunctionInnards << ", ";
+      printType(FunctionInnards, *I,
+             /*isSigned=*/Attrs && Attrs->paramHasAttr(Idx, ParamAttr::SExt));
+      PrintedArg = true;
+      ++Idx;
+    }
+  }
+
+  // Finish printing arguments... if this is a vararg function, print the ...,
+  // unless there are no known types, in which case, we just emit ().
+  //
+  if (FT->isVarArg() && PrintedArg) {
+    FunctionInnards << ", ";
+    FunctionInnards << "...";  // Output varargs portion of signature!
+  } else if (!FT->isVarArg() && !PrintedArg) {
+    FunctionInnards << "void"; // ret() -> ret(void) in C.
+  }
+  FunctionInnards << ')';
+  
+  // Get the return type for the function.
+  const Type *RetTy;
+  if (!isStructReturn)
+    RetTy = F->getReturnType();
+  else {
+    // If this is a struct-return function, print the struct-return type.
+    RetTy = cast<PointerType>(FT->getParamType(0))->getElementType();
+  }
+    
+  // Print out the return type and the signature built above.
+  printType(Out, RetTy, 
+            /*isSigned=*/ Attrs && Attrs->paramHasAttr(0, ParamAttr::SExt), 
+            FunctionInnards.str());
+}
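+
+// Illustrative example (not from the original patch): for a hypothetical
+// internal function "i32 @add(i32 %a, i32 %b)" with no signext attributes,
+// this would print roughly:
+//   static unsigned int add(unsigned int a, unsigned int b)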
+
+static inline bool isFPIntBitCast(const Instruction &I) {
+  if (!isa<BitCastInst>(I))
+    return false;
+  const Type *SrcTy = I.getOperand(0)->getType();
+  const Type *DstTy = I.getType();
+  return (SrcTy->isFloatingPoint() && DstTy->isInteger()) ||
+         (DstTy->isFloatingPoint() && SrcTy->isInteger());
+}
+
+void CWriter::printFunction(Function &F) {
+  /// isStructReturn - Should this function actually return a struct by-value?
+  bool isStructReturn = F.getFunctionType()->isStructReturn();
+
+  printFunctionSignature(&F, false);
+  Out << " {\n";
+  
+  // If this is a struct return function, handle the result with magic.
+  if (isStructReturn) {
+    const Type *StructTy =
+      cast<PointerType>(F.arg_begin()->getType())->getElementType();
+    Out << "  ";
+    printType(Out, StructTy, false, "StructReturn");
+    Out << ";  /* Struct return temporary */\n";
+
+    Out << "  ";
+    printType(Out, F.arg_begin()->getType(), false, 
+              GetValueName(F.arg_begin()));
+    Out << " = &StructReturn;\n";
+  }
+
+  bool PrintedVar = false;
+  
+  // print local variable information for the function
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
+    if (const AllocaInst *AI = isDirectAlloca(&*I)) {
+      Out << "  ";
+      printType(Out, AI->getAllocatedType(), false, GetValueName(AI));
+      Out << ";    /* Address-exposed local */\n";
+      PrintedVar = true;
+    } else if (I->getType() != Type::VoidTy && !isInlinableInst(*I)) {
+      Out << "  ";
+      printType(Out, I->getType(), false, GetValueName(&*I));
+      Out << ";\n";
+
+      if (isa<PHINode>(*I)) {  // Print out PHI node temporaries as well...
+        Out << "  ";
+        printType(Out, I->getType(), false,
+                  GetValueName(&*I)+"__PHI_TEMPORARY");
+        Out << ";\n";
+      }
+      PrintedVar = true;
+    }
+    // We need a temporary for the BitCast to use so it can pluck a value out
+    // of a union to do the BitCast. This is separate from the need for a
+    // variable to hold the result of the BitCast. 
+    if (isFPIntBitCast(*I)) {
+      Out << "  llvmBitCastUnion " << GetValueName(&*I)
+          << "__BITCAST_TEMPORARY;\n";
+      PrintedVar = true;
+    }
+  }
+
+  if (PrintedVar)
+    Out << '\n';
+
+  if (F.hasExternalLinkage() && F.getName() == "main")
+    Out << "  CODE_FOR_MAIN();\n";
+
+  // print the basic blocks
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    if (Loop *L = LI->getLoopFor(BB)) {
+      if (L->getHeader() == BB && L->getParentLoop() == 0)
+        printLoop(L);
+    } else {
+      printBasicBlock(BB);
+    }
+  }
+
+  Out << "}\n\n";
+}
+
+void CWriter::printLoop(Loop *L) {
+  Out << "  do {     /* Syntactic loop '" << L->getHeader()->getName()
+      << "' to make GCC happy */\n";
+  for (unsigned i = 0, e = L->getBlocks().size(); i != e; ++i) {
+    BasicBlock *BB = L->getBlocks()[i];
+    Loop *BBLoop = LI->getLoopFor(BB);
+    if (BBLoop == L)
+      printBasicBlock(BB);
+    else if (BB == BBLoop->getHeader() && BBLoop->getParentLoop() == L)
+      printLoop(BBLoop);
+  }
+  Out << "  } while (1); /* end of syntactic loop '"
+      << L->getHeader()->getName() << "' */\n";
+}
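+
+// Illustrative example (not from the original patch): a natural loop whose
+// header block is named "bb" (hypothetical) is wrapped as:
+//   do {     /* Syntactic loop 'bb' to make GCC happy */
+//     ... loop body blocks ...
+//   } while (1); /* end of syntactic loop 'bb' */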
+
+void CWriter::printBasicBlock(BasicBlock *BB) {
+
+  // Don't print the label for the basic block if there are no uses, or if
+  // the only terminator use is the predecessor basic block's terminator.
+  // We have to scan the use list because PHI nodes use basic blocks too but
+  // do not require a label to be generated.
+  //
+  bool NeedsLabel = false;
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+    if (isGotoCodeNecessary(*PI, BB)) {
+      NeedsLabel = true;
+      break;
+    }
+
+  if (NeedsLabel) Out << GetValueName(BB) << ":\n";
+
+  // Output all of the instructions in the basic block...
+  for (BasicBlock::iterator II = BB->begin(), E = --BB->end(); II != E;
+       ++II) {
+    if (!isInlinableInst(*II) && !isDirectAlloca(II)) {
+      if (II->getType() != Type::VoidTy && !isInlineAsm(*II))
+        outputLValue(II);
+      else
+        Out << "  ";
+      visit(*II);
+      Out << ";\n";
+    }
+  }
+
+  // Don't emit prefix or suffix for the terminator...
+  visit(*BB->getTerminator());
+}
+
+
+// Specific Instruction type classes... note that all of the casts are
+// necessary because we use the instruction classes as opaque types...
+//
+void CWriter::visitReturnInst(ReturnInst &I) {
+  // If this is a struct return function, return the temporary struct.
+  bool isStructReturn = I.getParent()->getParent()->
+    getFunctionType()->isStructReturn();
+
+  if (isStructReturn) {
+    Out << "  return StructReturn;\n";
+    return;
+  }
+  
+  // Don't output a void return if this is the last basic block in the function
+  if (I.getNumOperands() == 0 &&
+      &*--I.getParent()->getParent()->end() == I.getParent() &&
+      I.getParent()->size() != 1) {
+    return;
+  }
+
+  Out << "  return";
+  if (I.getNumOperands()) {
+    Out << ' ';
+    writeOperand(I.getOperand(0));
+  }
+  Out << ";\n";
+}
+
+void CWriter::visitSwitchInst(SwitchInst &SI) {
+
+  Out << "  switch (";
+  writeOperand(SI.getOperand(0));
+  Out << ") {\n  default:\n";
+  printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2);
+  printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
+  Out << ";\n";
+  for (unsigned i = 2, e = SI.getNumOperands(); i != e; i += 2) {
+    Out << "  case ";
+    writeOperand(SI.getOperand(i));
+    Out << ":\n";
+    BasicBlock *Succ = cast<BasicBlock>(SI.getOperand(i+1));
+    printPHICopiesForSuccessor (SI.getParent(), Succ, 2);
+    printBranchToBlock(SI.getParent(), Succ, 2);
+    if (Function::iterator(Succ) == next(Function::iterator(SI.getParent())))
+      Out << "    break;\n";
+  }
+  Out << "  }\n";
+}
+
+void CWriter::visitUnreachableInst(UnreachableInst &I) {
+  Out << "  /*UNREACHABLE*/;\n";
+}
+
+bool CWriter::isGotoCodeNecessary(BasicBlock *From, BasicBlock *To) {
+  /// FIXME: This should be re-enabled, but only when it is safe with respect
+  /// to loop reordering!!
+  return true;
+
+  if (next(Function::iterator(From)) != Function::iterator(To))
+    return true;  // Not the direct successor, we need a goto.
+
+  //isa<SwitchInst>(From->getTerminator())
+
+  if (LI->getLoopFor(From) != LI->getLoopFor(To))
+    return true;
+  return false;
+}
+
+void CWriter::printPHICopiesForSuccessor (BasicBlock *CurBlock,
+                                          BasicBlock *Successor,
+                                          unsigned Indent) {
+  for (BasicBlock::iterator I = Successor->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+    // Now we have to do the printing.
+    Value *IV = PN->getIncomingValueForBlock(CurBlock);
+    if (!isa<UndefValue>(IV)) {
+      Out << std::string(Indent, ' ');
+      Out << "  " << GetValueName(I) << "__PHI_TEMPORARY = ";
+      writeOperand(IV);
+      Out << ";   /* for PHI node */\n";
+    }
+  }
+}
+
+void CWriter::printBranchToBlock(BasicBlock *CurBB, BasicBlock *Succ,
+                                 unsigned Indent) {
+  if (isGotoCodeNecessary(CurBB, Succ)) {
+    Out << std::string(Indent, ' ') << "  goto ";
+    writeOperand(Succ);
+    Out << ";\n";
+  }
+}
+
+// Branch instruction printing - Avoid printing out a branch to a basic block
+// that immediately succeeds the current one.
+//
+void CWriter::visitBranchInst(BranchInst &I) {
+
+  if (I.isConditional()) {
+    if (isGotoCodeNecessary(I.getParent(), I.getSuccessor(0))) {
+      Out << "  if (";
+      writeOperand(I.getCondition());
+      Out << ") {\n";
+
+      printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(0), 2);
+      printBranchToBlock(I.getParent(), I.getSuccessor(0), 2);
+
+      if (isGotoCodeNecessary(I.getParent(), I.getSuccessor(1))) {
+        Out << "  } else {\n";
+        printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(1), 2);
+        printBranchToBlock(I.getParent(), I.getSuccessor(1), 2);
+      }
+    } else {
+      // First goto not necessary, assume second one is...
+      Out << "  if (!";
+      writeOperand(I.getCondition());
+      Out << ") {\n";
+
+      printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(1), 2);
+      printBranchToBlock(I.getParent(), I.getSuccessor(1), 2);
+    }
+
+    Out << "  }\n";
+  } else {
+    printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(0), 0);
+    printBranchToBlock(I.getParent(), I.getSuccessor(0), 0);
+  }
+  Out << "\n";
+}
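+
+// Illustrative example (not from the original patch): since isGotoCodeNecessary
+// currently always returns true, a conditional branch prints roughly:
+//   if (cond) {
+//     goto bb_true;
+//   } else {
+//     goto bb_false;
+//   }
+// where cond, bb_true and bb_false stand for the printed condition operand and
+// successor block names.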
+
+// PHI nodes get copied into temporary values at the end of predecessor basic
+// blocks.  We now need to copy these temporary values into the REAL value for
+// the PHI.
+void CWriter::visitPHINode(PHINode &I) {
+  writeOperand(&I);
+  Out << "__PHI_TEMPORARY";
+}
+
+
+void CWriter::visitBinaryOperator(Instruction &I) {
+  // binary instructions, shift instructions, setCond instructions.
+  assert(!isa<PointerType>(I.getType()));
+
+  // We must cast the results of binary operations which might be promoted.
+  bool needsCast = false;
+  if ((I.getType() == Type::Int8Ty) || (I.getType() == Type::Int16Ty) 
+      || (I.getType() == Type::FloatTy)) {
+    needsCast = true;
+    Out << "((";
+    printType(Out, I.getType(), false);
+    Out << ")(";
+  }
+
+  // If this is a negation operation, print it out as such.  For FP, we don't
+  // want to print "-0.0 - X".
+  if (BinaryOperator::isNeg(&I)) {
+    Out << "-(";
+    writeOperand(BinaryOperator::getNegArgument(cast<BinaryOperator>(&I)));
+    Out << ")";
+  } else if (I.getOpcode() == Instruction::FRem) {
+    // Output a call to fmod/fmodf instead of emitting a%b
+    if (I.getType() == Type::FloatTy)
+      Out << "fmodf(";
+    else
+      Out << "fmod(";
+    writeOperand(I.getOperand(0));
+    Out << ", ";
+    writeOperand(I.getOperand(1));
+    Out << ")";
+  } else {
+
+    // Write out the cast of the instruction's value back to the proper type
+    // if necessary.
+    bool NeedsClosingParens = writeInstructionCast(I);
+
+    // Certain instructions require the operand to be forced to a specific type
+    // so we use writeOperandWithCast here instead of writeOperand. Similarly
+    // below for operand 1
+    writeOperandWithCast(I.getOperand(0), I.getOpcode());
+
+    switch (I.getOpcode()) {
+    case Instruction::Add:  Out << " + "; break;
+    case Instruction::Sub:  Out << " - "; break;
+    case Instruction::Mul:  Out << " * "; break;
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem: Out << " % "; break;
+    case Instruction::UDiv:
+    case Instruction::SDiv: 
+    case Instruction::FDiv: Out << " / "; break;
+    case Instruction::And:  Out << " & "; break;
+    case Instruction::Or:   Out << " | "; break;
+    case Instruction::Xor:  Out << " ^ "; break;
+    case Instruction::Shl : Out << " << "; break;
+    case Instruction::LShr:
+    case Instruction::AShr: Out << " >> "; break;
+    default: cerr << "Invalid operator type!" << I; abort();
+    }
+
+    writeOperandWithCast(I.getOperand(1), I.getOpcode());
+    if (NeedsClosingParens)
+      Out << "))";
+  }
+
+  if (needsCast) {
+    Out << "))";
+  }
+}
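+
+// Illustrative example (not from the original patch): because i8/i16/float
+// results may be promoted by C, an 8-bit add is wrapped in a cast back to the
+// narrow type, printing roughly:
+//   ((unsigned char)(X + Y))
+// where X and Y stand for the printed operands.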
+
+void CWriter::visitICmpInst(ICmpInst &I) {
+  // We must cast the results of icmp which might be promoted.
+  bool needsCast = false;
+
+  // Write out the cast of the instruction's value back to the proper type
+  // if necessary.
+  bool NeedsClosingParens = writeInstructionCast(I);
+
+  // Certain icmp predicates require the operand to be forced to a specific type
+  // so we use writeOperandWithCast here instead of writeOperand. Similarly
+  // below for operand 1
+  writeOperandWithCast(I.getOperand(0), I.getPredicate());
+
+  switch (I.getPredicate()) {
+  case ICmpInst::ICMP_EQ:  Out << " == "; break;
+  case ICmpInst::ICMP_NE:  Out << " != "; break;
+  case ICmpInst::ICMP_ULE:
+  case ICmpInst::ICMP_SLE: Out << " <= "; break;
+  case ICmpInst::ICMP_UGE:
+  case ICmpInst::ICMP_SGE: Out << " >= "; break;
+  case ICmpInst::ICMP_ULT:
+  case ICmpInst::ICMP_SLT: Out << " < "; break;
+  case ICmpInst::ICMP_UGT:
+  case ICmpInst::ICMP_SGT: Out << " > "; break;
+  default: cerr << "Invalid icmp predicate!" << I; abort();
+  }
+
+  writeOperandWithCast(I.getOperand(1), I.getPredicate());
+  if (NeedsClosingParens)
+    Out << "))";
+
+  if (needsCast) {
+    Out << "))";
+  }
+}
+
+void CWriter::visitFCmpInst(FCmpInst &I) {
+  if (I.getPredicate() == FCmpInst::FCMP_FALSE) {
+    Out << "0";
+    return;
+  }
+  if (I.getPredicate() == FCmpInst::FCMP_TRUE) {
+    Out << "1";
+    return;
+  }
+
+  const char* op = 0;
+  switch (I.getPredicate()) {
+  default: assert(0 && "Illegal FCmp predicate");
+  case FCmpInst::FCMP_ORD: op = "ord"; break;
+  case FCmpInst::FCMP_UNO: op = "uno"; break;
+  case FCmpInst::FCMP_UEQ: op = "ueq"; break;
+  case FCmpInst::FCMP_UNE: op = "une"; break;
+  case FCmpInst::FCMP_ULT: op = "ult"; break;
+  case FCmpInst::FCMP_ULE: op = "ule"; break;
+  case FCmpInst::FCMP_UGT: op = "ugt"; break;
+  case FCmpInst::FCMP_UGE: op = "uge"; break;
+  case FCmpInst::FCMP_OEQ: op = "oeq"; break;
+  case FCmpInst::FCMP_ONE: op = "one"; break;
+  case FCmpInst::FCMP_OLT: op = "olt"; break;
+  case FCmpInst::FCMP_OLE: op = "ole"; break;
+  case FCmpInst::FCMP_OGT: op = "ogt"; break;
+  case FCmpInst::FCMP_OGE: op = "oge"; break;
+  }
+
+  Out << "llvm_fcmp_" << op << "(";
+  // Write the first operand
+  writeOperand(I.getOperand(0));
+  Out << ", ";
+  // Write the second operand
+  writeOperand(I.getOperand(1));
+  Out << ")";
+}
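+
+// Illustrative example (not from the original patch): "fcmp ult double %a, %b"
+// prints as a call to one of the helpers emitted earlier, roughly:
+//   llvm_fcmp_ult(a, b)
+// which expands to "a < b || llvm_fcmp_uno(a, b)", i.e. true if a < b or if
+// either operand is NaN.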
+
+static const char * getFloatBitCastField(const Type *Ty) {
+  switch (Ty->getTypeID()) {
+    default: assert(0 && "Invalid Type");
+    case Type::FloatTyID:  return "Float";
+    case Type::DoubleTyID: return "Double";
+    case Type::IntegerTyID: {
+      unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+      if (NumBits <= 32)
+        return "Int32";
+      else
+        return "Int64";
+    }
+  }
+}
+
+void CWriter::visitCastInst(CastInst &I) {
+  const Type *DstTy = I.getType();
+  const Type *SrcTy = I.getOperand(0)->getType();
+  Out << '(';
+  if (isFPIntBitCast(I)) {
+    // These int<->float and long<->double casts need to be handled specially
+    Out << GetValueName(&I) << "__BITCAST_TEMPORARY." 
+        << getFloatBitCastField(I.getOperand(0)->getType()) << " = ";
+    writeOperand(I.getOperand(0));
+    Out << ", " << GetValueName(&I) << "__BITCAST_TEMPORARY."
+        << getFloatBitCastField(I.getType());
+  } else {
+    printCast(I.getOpcode(), SrcTy, DstTy);
+    if (I.getOpcode() == Instruction::SExt && SrcTy == Type::Int1Ty) {
+      // Make sure we really get a sext from bool by subtracting the bool from 0
+      Out << "0-";
+    }
+    writeOperand(I.getOperand(0));
+    if (DstTy == Type::Int1Ty && 
+        (I.getOpcode() == Instruction::Trunc ||
+         I.getOpcode() == Instruction::FPToUI ||
+         I.getOpcode() == Instruction::FPToSI ||
+         I.getOpcode() == Instruction::PtrToInt)) {
+      // Make sure we really get a trunc to bool by anding the operand with 1 
+      Out << "&1u";
+    }
+  }
+  Out << ')';
+}
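+
+// Illustrative example (not from the original patch): a "bitcast float %x to
+// i32" whose result is named %t goes through the llvmBitCastUnion temporary,
+// printing roughly:
+//   (t__BITCAST_TEMPORARY.Float = x, t__BITCAST_TEMPORARY.Int32)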
+
+void CWriter::visitSelectInst(SelectInst &I) {
+  Out << "((";
+  writeOperand(I.getCondition());
+  Out << ") ? (";
+  writeOperand(I.getTrueValue());
+  Out << ") : (";
+  writeOperand(I.getFalseValue());
+  Out << "))";
+}
+
+
+void CWriter::lowerIntrinsics(Function &F) {
+  // This is used to keep track of intrinsics that get generated to a lowered
+  // function. We must generate the prototypes before the function body which
+  // will only be expanded on first use (by the loop below).
+  std::vector<Function*> prototypesToGen;
+
+  // Examine all the instructions in this function to find the intrinsics that
+  // need to be lowered.
+  for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB)
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; )
+      if (CallInst *CI = dyn_cast<CallInst>(I++))
+        if (Function *F = CI->getCalledFunction())
+          switch (F->getIntrinsicID()) {
+          case Intrinsic::not_intrinsic:
+          case Intrinsic::vastart:
+          case Intrinsic::vacopy:
+          case Intrinsic::vaend:
+          case Intrinsic::returnaddress:
+          case Intrinsic::frameaddress:
+          case Intrinsic::setjmp:
+          case Intrinsic::longjmp:
+          case Intrinsic::prefetch:
+          case Intrinsic::dbg_stoppoint:
+          case Intrinsic::powi_f32:
+          case Intrinsic::powi_f64:
+            // We directly implement these intrinsics
+            break;
+          default:
+            // If this is an intrinsic that directly corresponds to a GCC
+            // builtin, we handle it.
+            const char *BuiltinName = "";
+#define GET_GCC_BUILTIN_NAME
+#include "llvm/Intrinsics.gen"
+#undef GET_GCC_BUILTIN_NAME
+            // If we handle it, don't lower it.
+            if (BuiltinName[0]) break;
+            
+            // All other intrinsic calls we must lower.
+            Instruction *Before = 0;
+            if (CI != &BB->front())
+              Before = prior(BasicBlock::iterator(CI));
+
+            IL->LowerIntrinsicCall(CI);
+            if (Before) {        // Move iterator to instruction after call
+              I = Before; ++I;
+            } else {
+              I = BB->begin();
+            }
+            // If the intrinsic got lowered to another call, and that call has
+            // a definition then we need to make sure its prototype is emitted
+            // before any calls to it.
+            if (CallInst *Call = dyn_cast<CallInst>(I))
+              if (Function *NewF = Call->getCalledFunction())
+                if (!NewF->isDeclaration())
+                  prototypesToGen.push_back(NewF);
+
+            break;
+          }
+
+  // We may have collected some prototypes to emit in the loop above. 
+  // Emit them now, before the function that uses them is emitted. But,
+  // be careful not to emit them twice.
+  std::vector<Function*>::iterator I = prototypesToGen.begin();
+  std::vector<Function*>::iterator E = prototypesToGen.end();
+  for ( ; I != E; ++I) {
+    if (intrinsicPrototypesAlreadyGenerated.insert(*I).second) {
+      Out << '\n';
+      printFunctionSignature(*I, true);
+      Out << ";\n";
+    }
+  }
+}
+
+
+void CWriter::visitCallInst(CallInst &I) {
+  //check if we have inline asm
+  if (isInlineAsm(I)) {
+    visitInlineAsm(I);
+    return;
+  }
+
+  bool WroteCallee = false;
+
+  // Handle intrinsic function calls first...
+  if (Function *F = I.getCalledFunction())
+    if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID()) {
+      switch (ID) {
+      default: {
+        // If this is an intrinsic that directly corresponds to a GCC
+        // builtin, we emit it here.
+        const char *BuiltinName = "";
+#define GET_GCC_BUILTIN_NAME
+#include "llvm/Intrinsics.gen"
+#undef GET_GCC_BUILTIN_NAME
+        assert(BuiltinName[0] && "Unknown LLVM intrinsic!");
+
+        Out << BuiltinName;
+        WroteCallee = true;
+        break;
+      }
+      case Intrinsic::vastart:
+        Out << "0; ";
+
+        Out << "va_start(*(va_list*)";
+        writeOperand(I.getOperand(1));
+        Out << ", ";
+        // Output the last argument to the enclosing function...
+        if (I.getParent()->getParent()->arg_empty()) {
+          cerr << "The C backend does not currently support zero "
+               << "argument varargs functions, such as '"
+               << I.getParent()->getParent()->getName() << "'!\n";
+          abort();
+        }
+        writeOperand(--I.getParent()->getParent()->arg_end());
+        Out << ')';
+        return;
+      case Intrinsic::vaend:
+        if (!isa<ConstantPointerNull>(I.getOperand(1))) {
+          Out << "0; va_end(*(va_list*)";
+          writeOperand(I.getOperand(1));
+          Out << ')';
+        } else {
+          Out << "va_end(*(va_list*)0)";
+        }
+        return;
+      case Intrinsic::vacopy:
+        Out << "0; ";
+        Out << "va_copy(*(va_list*)";
+        writeOperand(I.getOperand(1));
+        Out << ", *(va_list*)";
+        writeOperand(I.getOperand(2));
+        Out << ')';
+        return;
+      case Intrinsic::returnaddress:
+        Out << "__builtin_return_address(";
+        writeOperand(I.getOperand(1));
+        Out << ')';
+        return;
+      case Intrinsic::frameaddress:
+        Out << "__builtin_frame_address(";
+        writeOperand(I.getOperand(1));
+        Out << ')';
+        return;
+      case Intrinsic::powi_f32:
+      case Intrinsic::powi_f64:
+        Out << "__builtin_powi(";
+        writeOperand(I.getOperand(1));
+        Out << ", ";
+        writeOperand(I.getOperand(2));
+        Out << ')';
+        return;
+      case Intrinsic::setjmp:
+        Out << "setjmp(*(jmp_buf*)";
+        writeOperand(I.getOperand(1));
+        Out << ')';
+        return;
+      case Intrinsic::longjmp:
+        Out << "longjmp(*(jmp_buf*)";
+        writeOperand(I.getOperand(1));
+        Out << ", ";
+        writeOperand(I.getOperand(2));
+        Out << ')';
+        return;
+      case Intrinsic::prefetch:
+        Out << "LLVM_PREFETCH((const void *)";
+        writeOperand(I.getOperand(1));
+        Out << ", ";
+        writeOperand(I.getOperand(2));
+        Out << ", ";
+        writeOperand(I.getOperand(3));
+        Out << ")";
+        return;
+      case Intrinsic::dbg_stoppoint: {
+        // If we use writeOperand directly we get a "u" suffix which is rejected
+        // by gcc.
+        DbgStopPointInst &SPI = cast<DbgStopPointInst>(I);
+
+        Out << "\n#line "
+            << SPI.getLine()
+            << " \"" << SPI.getDirectory()
+            << SPI.getFileName() << "\"\n";
+        return;
+      }
+      }
+    }
+
+  Value *Callee = I.getCalledValue();
+
+  const PointerType  *PTy   = cast<PointerType>(Callee->getType());
+  const FunctionType *FTy   = cast<FunctionType>(PTy->getElementType());
+
+  // If this is a call to a struct-return function, assign to the first
+  // parameter instead of passing it to the call.
+  bool isStructRet = FTy->isStructReturn();
+  if (isStructRet) {
+    Out << "*(";
+    writeOperand(I.getOperand(1));
+    Out << ") = ";
+  }
+  
+  if (I.isTailCall()) Out << " /*tail*/ ";
+  
+  if (!WroteCallee) {
+    // If this is an indirect call to a struct return function, we need to cast
+    // the pointer.
+    bool NeedsCast = isStructRet && !isa<Function>(Callee);
+
+    // GCC is a real PITA.  It does not permit codegening casts of functions to
+    // function pointers if they are in a call (it generates a trap instruction
+    // instead!).  We work around this by inserting a cast to void* in between
+    // the function and the function pointer cast.  Unfortunately, we can't just
+    // form the constant expression here, because the folder will immediately
+    // nuke it.
+    //
+    // Note finally, that this is completely unsafe.  ANSI C does not guarantee
+    // that void* and function pointers have the same size. :( To deal with this
+    // in the common case, we handle casts where the number of arguments passed
+    // match exactly.
+    //
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Callee))
+      if (CE->isCast())
+        if (Function *RF = dyn_cast<Function>(CE->getOperand(0))) {
+          NeedsCast = true;
+          Callee = RF;
+        }
+  
+    if (NeedsCast) {
+      // Ok, just cast the pointer type.
+      Out << "((";
+      if (!isStructRet)
+        printType(Out, I.getCalledValue()->getType());
+      else
+        printStructReturnPointerFunctionType(Out, 
+                             cast<PointerType>(I.getCalledValue()->getType()));
+      Out << ")(void*)";
+    }
+    writeOperand(Callee);
+    if (NeedsCast) Out << ')';
+  }
+
+  Out << '(';
+
+  unsigned NumDeclaredParams = FTy->getNumParams();
+
+  CallSite::arg_iterator AI = I.op_begin()+1, AE = I.op_end();
+  unsigned ArgNo = 0;
+  if (isStructRet) {   // Skip struct return argument.
+    ++AI;
+    ++ArgNo;
+  }
+      
+  const ParamAttrsList *Attrs = FTy->getParamAttrs();
+  bool PrintedArg = false;
+  unsigned Idx = 1;
+  for (; AI != AE; ++AI, ++ArgNo, ++Idx) {
+    if (PrintedArg) Out << ", ";
+    if (ArgNo < NumDeclaredParams &&
+        (*AI)->getType() != FTy->getParamType(ArgNo)) {
+      Out << '(';
+      printType(Out, FTy->getParamType(ArgNo), 
+            /*isSigned=*/Attrs && Attrs->paramHasAttr(Idx, ParamAttr::SExt));
+      Out << ')';
+    }
+    writeOperand(*AI);
+    PrintedArg = true;
+  }
+  Out << ')';
+}
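+
+// Illustrative example (not from the original patch): a call to a struct-return
+// function assigns through the hidden first argument instead of passing it,
+// printing roughly:
+//   *(sret_ptr) = f(a, b);
+// where sret_ptr, f, a and b stand for the printed sret pointer, callee and
+// remaining arguments.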
+
+
+// This converts the LLVM constraint string to something gcc is expecting.
+// TODO: work out platform-independent constraints and factor those out
+//       of the per-target tables
+//       handle multiple constraint codes
+std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) {
+
+  assert(c.Codes.size() == 1 && "Too many asm constraint codes to handle");
+
+  const char** table = 0;
+  
+  //Grab the translation table from TargetAsmInfo if it exists
+  if (!TAsm) {
+    std::string E;
+    const TargetMachineRegistry::Entry* Match = 
+      TargetMachineRegistry::getClosestStaticTargetForModule(*TheModule, E);
+    if (Match) {
+      // Per-platform TargetMachines don't exist, so create one;
+      // this must be done only once.
+      const TargetMachine* TM = Match->CtorFn(*TheModule, "");
+      TAsm = TM->getTargetAsmInfo();
+    }
+  }
+  if (TAsm)
+    table = TAsm->getAsmCBE();
+
+  //Search the translation table if it exists
+  for (int i = 0; table && table[i]; i += 2)
+    if (c.Codes[0] == table[i])
+      return table[i+1];
+
+  //default is identity
+  return c.Codes[0];
+}
+
+//TODO: import logic from AsmPrinter.cpp
+static std::string gccifyAsm(std::string asmstr) {
+  for (std::string::size_type i = 0; i != asmstr.size(); ++i)
+    if (asmstr[i] == '\n')
+      asmstr.replace(i, 1, "\\n");
+    else if (asmstr[i] == '\t')
+      asmstr.replace(i, 1, "\\t");
+    else if (asmstr[i] == '$') {
+      if (asmstr[i + 1] == '{') {
+        std::string::size_type a = asmstr.find_first_of(':', i + 1);
+        std::string::size_type b = asmstr.find_first_of('}', i + 1);
+        std::string n = "%" + 
+          asmstr.substr(a + 1, b - a - 1) +
+          asmstr.substr(i + 2, a - i - 2);
+        asmstr.replace(i, b - i + 1, n);
+        i += n.size() - 1;
+      } else
+        asmstr.replace(i, 1, "%");
+    }
+    else if (asmstr[i] == '%')//grr
+      { asmstr.replace(i, 1, "%%"); ++i;}
+  
+  return asmstr;
+}
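+
+// Illustrative example (not from the original patch): gccifyAsm rewrites the
+// LLVM asm-string operand syntax into gcc's, e.g. for a hypothetical string:
+//   "mov $1, $0"   becomes   "mov %1, %0"
+// and a literal '%' is doubled to "%%".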
+
+// TODO: the assumptions about which constraints consume arguments from the
+//       call are likely wrong; handle commutativity
+void CWriter::visitInlineAsm(CallInst &CI) {
+  InlineAsm* as = cast<InlineAsm>(CI.getOperand(0));
+  std::vector<InlineAsm::ConstraintInfo> Constraints = as->ParseConstraints();
+  std::vector<std::pair<std::string, Value*> > Input;
+  std::vector<std::pair<std::string, Value*> > Output;
+  std::string Clobber;
+  int count = CI.getType() == Type::VoidTy ? 1 : 0;
+  for (std::vector<InlineAsm::ConstraintInfo>::iterator I = Constraints.begin(),
+         E = Constraints.end(); I != E; ++I) {
+    assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
+    std::string c = 
+      InterpretASMConstraint(*I);
+    switch(I->Type) {
+    default:
+      assert(0 && "Unknown asm constraint");
+      break;
+    case InlineAsm::isInput: {
+      if (c.size()) {
+        Input.push_back(std::make_pair(c, count ? CI.getOperand(count) : &CI));
+        ++count; //consume arg
+      }
+      break;
+    }
+    case InlineAsm::isOutput: {
+      if (c.size()) {
+        Output.push_back(std::make_pair("="+((I->isEarlyClobber ? "&" : "")+c),
+                                        count ? CI.getOperand(count) : &CI));
+        ++count; //consume arg
+      }
+      break;
+    }
+    case InlineAsm::isClobber: {
+      if (c.size()) 
+        Clobber += ",\"" + c + "\"";
+      break;
+    }
+    }
+  }
+  
+  //fix up the asm string for gcc
+  std::string asmstr = gccifyAsm(as->getAsmString());
+  
+  Out << "__asm__ volatile (\"" << asmstr << "\"\n";
+  Out << "        :";
+  for (std::vector<std::pair<std::string, Value*> >::iterator I = Output.begin(),
+         E = Output.end(); I != E; ++I) {
+    Out << "\"" << I->first << "\"(";
+    writeOperandRaw(I->second);
+    Out << ")";
+    if (I + 1 != E)
+      Out << ",";
+  }
+  Out << "\n        :";
+  for (std::vector<std::pair<std::string, Value*> >::iterator I = Input.begin(),
+         E = Input.end(); I != E; ++I) {
+    Out << "\"" << I->first << "\"(";
+    writeOperandRaw(I->second);
+    Out << ")";
+    if (I + 1 != E)
+      Out << ",";
+  }
+  if (Clobber.size())
+    Out << "\n        :" << Clobber.substr(1);
+  Out << ")";
+}
+
+void CWriter::visitMallocInst(MallocInst &I) {
+  assert(0 && "lowerallocations pass didn't work!");
+}
+
+void CWriter::visitAllocaInst(AllocaInst &I) {
+  Out << '(';
+  printType(Out, I.getType());
+  Out << ") alloca(sizeof(";
+  printType(Out, I.getType()->getElementType());
+  Out << ')';
+  if (I.isArrayAllocation()) {
+    Out << " * " ;
+    writeOperand(I.getOperand(0));
+  }
+  Out << ')';
+}
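+
+// Illustrative example (not from the original patch): a hypothetical array
+// allocation "alloca i32, i32 %n" prints roughly:
+//   (unsigned int*) alloca(sizeof(unsigned int) * n)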
+
+void CWriter::visitFreeInst(FreeInst &I) {
+  assert(0 && "lowerallocations pass didn't work!");
+}
+
+void CWriter::printIndexingExpression(Value *Ptr, gep_type_iterator I,
+                                      gep_type_iterator E) {
+  bool HasImplicitAddress = false;
+  // If accessing a global value with no indexing, avoid *(&GV) syndrome
+  if (isa<GlobalValue>(Ptr)) {
+    HasImplicitAddress = true;
+  } else if (isDirectAlloca(Ptr)) {
+    HasImplicitAddress = true;
+  }
+
+  if (I == E) {
+    if (!HasImplicitAddress)
+      Out << '*';  // Implicit zero first argument: '*x' is equivalent to 'x[0]'
+
+    writeOperandInternal(Ptr);
+    return;
+  }
+
+  const Constant *CI = dyn_cast<Constant>(I.getOperand());
+  if (HasImplicitAddress && (!CI || !CI->isNullValue()))
+    Out << "(&";
+
+  writeOperandInternal(Ptr);
+
+  if (HasImplicitAddress && (!CI || !CI->isNullValue())) {
+    Out << ')';
+    HasImplicitAddress = false;  // HIA is only true if we haven't addressed yet
+  }
+
+  assert((!HasImplicitAddress || (CI && CI->isNullValue())) &&
+         "Can only have implicit address with direct accessing");
+
+  if (HasImplicitAddress) {
+    ++I;
+  } else if (CI && CI->isNullValue()) {
+    gep_type_iterator TmpI = I; ++TmpI;
+
+    // Print out the -> operator if possible...
+    if (TmpI != E && isa<StructType>(*TmpI)) {
+      Out << (HasImplicitAddress ? "." : "->");
+      Out << "field" << cast<ConstantInt>(TmpI.getOperand())->getZExtValue();
+      I = ++TmpI;
+    }
+  }
+
+  for (; I != E; ++I)
+    if (isa<StructType>(*I)) {
+      Out << ".field" << cast<ConstantInt>(I.getOperand())->getZExtValue();
+    } else {
+      Out << '[';
+      writeOperand(I.getOperand());
+      Out << ']';
+    }
+}
+
+void CWriter::visitLoadInst(LoadInst &I) {
+  Out << '*';
+  if (I.isVolatile()) {
+    Out << "((";
+    printType(Out, I.getType(), false, "volatile*");
+    Out << ")";
+  }
+
+  writeOperand(I.getOperand(0));
+
+  if (I.isVolatile())
+    Out << ')';
+}
+
+void CWriter::visitStoreInst(StoreInst &I) {
+  Out << '*';
+  if (I.isVolatile()) {
+    Out << "((";
+    printType(Out, I.getOperand(0)->getType(), false, " volatile*");
+    Out << ")";
+  }
+  writeOperand(I.getPointerOperand());
+  if (I.isVolatile()) Out << ')';
+  Out << " = ";
+  Value *Operand = I.getOperand(0);
+  Constant *BitMask = 0;
+  if (const IntegerType* ITy = dyn_cast<IntegerType>(Operand->getType()))
+    if (!ITy->isPowerOf2ByteWidth())
+      // We have a bit width that doesn't match an even power-of-2 byte
+      // size. Consequently we must & the value with the type's bit mask
+      BitMask = ConstantInt::get(ITy, ITy->getBitMask());
+  if (BitMask)
+    Out << "((";
+  writeOperand(Operand);
+  if (BitMask) {
+    Out << ") & ";
+    printConstant(BitMask);
+    Out << ")"; 
+  }
+}
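+
+// Illustrative example (not from the original patch): storing a value of an
+// odd bit width such as i17 (hypothetical) masks it down to the type's bit
+// mask, printing roughly:
+//   *p = ((v) & MASK);
+// where MASK is the printed 17-bit all-ones constant; a plain i32 store is
+// just "*p = v;".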
+
+void CWriter::visitGetElementPtrInst(GetElementPtrInst &I) {
+  Out << '&';
+  printIndexingExpression(I.getPointerOperand(), gep_type_begin(I),
+                          gep_type_end(I));
+}
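+
+// Illustrative example (not from the original patch): a GEP selecting field 1
+// of a hypothetical global struct G, "getelementptr @G, i32 0, i32 1", prints
+// roughly:
+//   &G.field1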
+
+void CWriter::visitVAArgInst(VAArgInst &I) {
+  Out << "va_arg(*(va_list*)";
+  writeOperand(I.getOperand(0));
+  Out << ", ";
+  printType(Out, I.getType());
+  Out << ");\n ";
+}
+
+//===----------------------------------------------------------------------===//
+//                       External Interface declaration
+//===----------------------------------------------------------------------===//
+
+bool CTargetMachine::addPassesToEmitWholeFile(PassManager &PM,
+                                              std::ostream &o,
+                                              CodeGenFileType FileType,
+                                              bool Fast) {
+  if (FileType != TargetMachine::AssemblyFile) return true;
+
+  PM.add(createLowerGCPass());
+  PM.add(createLowerAllocationsPass(true));
+  PM.add(createLowerInvokePass());
+  PM.add(createCFGSimplificationPass());   // clean up after lower invoke.
+  PM.add(new CBackendNameAllUsedStructsAndMergeFunctions());
+  PM.add(new CWriter(o));
+  return false;
+}
diff --git a/lib/Target/CBackend/CTargetMachine.h b/lib/Target/CBackend/CTargetMachine.h
new file mode 100644
index 0000000..38c738e
--- /dev/null
+++ b/lib/Target/CBackend/CTargetMachine.h
@@ -0,0 +1,41 @@
+//===-- CTargetMachine.h - TargetMachine for the C backend ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the TargetMachine that is used by the C backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CTARGETMACHINE_H
+#define CTARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+
+namespace llvm {
+
+struct CTargetMachine : public TargetMachine {
+  const TargetData DataLayout;       // Calculates type size & alignment
+
+  CTargetMachine(const Module &M, const std::string &FS)
+    : DataLayout(&M) {}
+
+  virtual bool WantsWholeFile() const { return true; }
+  virtual bool addPassesToEmitWholeFile(PassManager &PM, std::ostream &Out,
+                                        CodeGenFileType FileType, bool Fast);
+
+  // This class always works, but shouldn't be the default in most cases.
+  static unsigned getModuleMatchQuality(const Module &M) { return 1; }
+  
+  virtual const TargetData *getTargetData() const { return &DataLayout; }
+};
+
+} // End llvm namespace
+
+
+#endif
diff --git a/lib/Target/CBackend/Makefile b/lib/Target/CBackend/Makefile
new file mode 100644
index 0000000..fea2494
--- /dev/null
+++ b/lib/Target/CBackend/Makefile
@@ -0,0 +1,14 @@
+##===- lib/Target/CBackend/Makefile ------------------------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file was developed by the LLVM research group and is distributed under
+# the University of Illinois Open Source License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMCBackend
+include $(LEVEL)/Makefile.common
+
+CompileCommonOpts += -Wno-format
diff --git a/lib/Target/CellSPU/README.txt b/lib/Target/CellSPU/README.txt
new file mode 100644
index 0000000..cf2a974
--- /dev/null
+++ b/lib/Target/CellSPU/README.txt
@@ -0,0 +1,10 @@
+//===- README.txt - Notes for improving CellSPU-specific code gen ---------===//
+
+TODO:
+* Check in the actual code.
+
+===-------------------------------------------------------------------------===
+
+Note: The CellSPU work is in progress and is "alpha"-quality code. No code has
+been officially checked into the llvm repo, but this will happen Real Soon Now.
diff --git a/lib/Target/IA64/IA64.h b/lib/Target/IA64/IA64.h
new file mode 100644
index 0000000..e5b84e6
--- /dev/null
+++ b/lib/Target/IA64/IA64.h
@@ -0,0 +1,54 @@
+//===-- IA64.h - Top-level interface for IA64 representation ------*- C++ -*-===//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the IA64
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_IA64_H
+#define TARGET_IA64_H
+
+#include <iosfwd>
+
+namespace llvm {
+
+class IA64TargetMachine;
+class FunctionPass;
+
+/// createIA64DAGToDAGInstructionSelector - This pass converts an LLVM
+/// function into IA64 machine code in a sane, DAG->DAG transform.
+///
+FunctionPass *createIA64DAGToDAGInstructionSelector(IA64TargetMachine &TM);
+
+/// createIA64BundlingPass - This pass adds stop bits and bundles
+/// instructions.
+///
+FunctionPass *createIA64BundlingPass(IA64TargetMachine &TM);
+
+/// createIA64CodePrinterPass - Returns a pass that prints the IA64
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description.  This should work
+/// regardless of whether the function is in SSA form.
+///
+FunctionPass *createIA64CodePrinterPass(std::ostream &o, IA64TargetMachine &tm);
+
+} // End llvm namespace
+
+// Defines symbolic names for IA64 registers.  This defines a mapping from
+// register name to register number.
+//
+#include "IA64GenRegisterNames.inc"
+
+// Defines symbolic names for the IA64 instructions.
+//
+#include "IA64GenInstrNames.inc"
+
+#endif
+
+
diff --git a/lib/Target/IA64/IA64.td b/lib/Target/IA64/IA64.td
new file mode 100644
index 0000000..2e231d4
--- /dev/null
+++ b/lib/Target/IA64/IA64.td
@@ -0,0 +1,39 @@
+//===-- IA64.td - Target definition file for Intel IA64 -------------------===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel IA64 architecture,
+// also known variously as ia64, IA-64, IPF, "the Itanium architecture" etc.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "../Target.td"
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "IA64RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "IA64InstrInfo.td"
+
+def IA64InstrInfo : InstrInfo { }
+
+def IA64 : Target {
+  // Our instruction set
+  let InstructionSet = IA64InstrInfo;
+
+}
+
+
diff --git a/lib/Target/IA64/IA64AsmPrinter.cpp b/lib/Target/IA64/IA64AsmPrinter.cpp
new file mode 100644
index 0000000..d576c4c
--- /dev/null
+++ b/lib/Target/IA64/IA64AsmPrinter.cpp
@@ -0,0 +1,360 @@
+//===-- IA64AsmPrinter.cpp - Print out IA64 LLVM as assembly --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to assembly accepted by the GNU binutils 'gas'
+// assembler. The Intel 'ias' and HP-UX 'as' assemblers *may* choke on this
+// output, but if so that's a bug I'd like to hear about: please file a bug
+// report in bugzilla. FYI, the not too bad 'ias' assembler is bundled with
+// the Intel C/C++ compiler for Itanium Linux.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "IA64.h"
+#include "IA64TargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+namespace {
+  struct IA64AsmPrinter : public AsmPrinter {
+    std::set<std::string> ExternalFunctionNames, ExternalObjectNames;
+
+    IA64AsmPrinter(std::ostream &O, TargetMachine &TM, const TargetAsmInfo *T)
+      : AsmPrinter(O, TM, T) {
+    }
+
+    virtual const char *getPassName() const {
+      return "IA64 Assembly Printer";
+    }
+
+    /// printInstruction - This method is automatically generated by tablegen
+    /// from the instruction set description.  This method returns true if the
+    /// machine instruction was sufficiently described to print it, otherwise it
+    /// returns false.
+    bool printInstruction(const MachineInstr *MI);
+
+    // This method is used by the tablegen'erated instruction printer.
+    void printOperand(const MachineInstr *MI, unsigned OpNo){
+      const MachineOperand &MO = MI->getOperand(OpNo);
+      if (MO.getType() == MachineOperand::MO_Register) {
+        assert(MRegisterInfo::isPhysicalRegister(MO.getReg())&&"Not physref??");
+        //XXX Bug Workaround: See note in Printer::doInitialization about %.
+        O << TM.getRegisterInfo()->get(MO.getReg()).Name;
+      } else {
+        printOp(MO);
+      }
+    }
+
+    void printS8ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+      int val=(unsigned int)MI->getOperand(OpNo).getImmedValue();
+      if(val>=128) val=val-256; // sign-extend from 8 bits
+      O << val;
+    }
+    void printS14ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+      int val=(unsigned int)MI->getOperand(OpNo).getImmedValue();
+      if(val>=8192) val=val-16384; // sign-extend from 14 bits
+      O << val;
+    }
+    void printS22ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+      int val=(unsigned int)MI->getOperand(OpNo).getImmedValue();
+      if(val>=2097152) val=val-4194304; // sign-extend from 22 bits
+      O << val;
+    }
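+    // Illustrative note (not from the original code): these printers
+    // sign-extend the raw 8/14/22-bit immediates; e.g. for the 8-bit case, a
+    // raw value of 255 prints as 255 - 256 = -1.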
+    void printU64ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+      O << (uint64_t)MI->getOperand(OpNo).getImmedValue();
+    }
+    void printS64ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+// XXX : nasty hack to avoid GPREL22 "relocation truncated to fit" linker
+// errors - instead of add rX = @gprel(CPI<whatever>), r1;; we now
+// emit movl rX = @gprel(CPI<whatever>);;
+//      add  rX = rX, r1; 
+// this gives us 64 bits instead of 22 (for the add long imm) to play
+// with, which shuts up the linker. The problem is that the constant
+// pool entries aren't immediates at this stage, so we check here. 
+// If it's an immediate, print it the old fashioned way. If it's
+// not, we print it as a constant pool index. 
+      if(MI->getOperand(OpNo).isImmediate()) {
+        O << (int64_t)MI->getOperand(OpNo).getImmedValue();
+      } else { // this is a constant pool reference: FIXME: assert this
+        printOp(MI->getOperand(OpNo));
+      }
+    }
+
+    void printGlobalOperand(const MachineInstr *MI, unsigned OpNo) {
+      printOp(MI->getOperand(OpNo), false); // this is NOT a br.call instruction
+    }
+
+    void printCallOperand(const MachineInstr *MI, unsigned OpNo) {
+      printOp(MI->getOperand(OpNo), true); // this is a br.call instruction
+    }
+
+    std::string getSectionForFunction(const Function &F) const;
+
+    void printMachineInstruction(const MachineInstr *MI);
+    void printOp(const MachineOperand &MO, bool isBRCALLinsn= false);
+    bool runOnMachineFunction(MachineFunction &F);
+    bool doInitialization(Module &M);
+    bool doFinalization(Module &M);
+  };
+} // end of anonymous namespace
+
+
+// Include the auto-generated portion of the assembly writer.
+#include "IA64GenAsmWriter.inc"
+
+
+std::string IA64AsmPrinter::getSectionForFunction(const Function &F) const {
+  // "ax", "progbits" means: allocatable, executable instructions in memory, initialized.
+  return "\n\t.section .text, \"ax\", \"progbits\"\n";
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool IA64AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  SetupMachineFunction(MF);
+  O << "\n\n";
+
+  // Print out constants referenced by the function
+  EmitConstantPool(MF.getConstantPool());
+
+  const Function *F = MF.getFunction();
+  SwitchToTextSection(getSectionForFunction(*F).c_str(), F);
+
+  // Print out labels for the function.
+  EmitAlignment(5);
+  O << "\t.global\t" << CurrentFnName << "\n";
+  O << "\t.type\t" << CurrentFnName << ", @function\n";
+  O << CurrentFnName << ":\n";
+
+  // Print out code for the function.
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    // Print a label for the basic block if there are any predecessors.
+    if (I->pred_begin() != I->pred_end()) {
+      printBasicBlockLabel(I, true);
+      O << '\n';
+    }
+    for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+         II != E; ++II) {
+      // Print the assembly for the instruction.
+      O << "\t";
+      printMachineInstruction(II);
+    }
+  }
+
+  // We didn't modify anything.
+  return false;
+}
+
+void IA64AsmPrinter::printOp(const MachineOperand &MO,
+                             bool isBRCALLinsn /* = false */) {
+  const MRegisterInfo &RI = *TM.getRegisterInfo();
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    O << RI.get(MO.getReg()).Name;
+    return;
+
+  case MachineOperand::MO_Immediate:
+    O << MO.getImmedValue();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    printBasicBlockLabel(MO.getMachineBasicBlock());
+    return;
+  case MachineOperand::MO_ConstantPoolIndex: {
+    O << "@gprel(" << TAI->getPrivateGlobalPrefix()
+      << "CPI" << getFunctionNumber() << "_"
+      << MO.getConstantPoolIndex() << ")";
+    return;
+  }
+
+  case MachineOperand::MO_GlobalAddress: {
+
+    // functions need @ltoff(@fptr(fn_name)) form
+    GlobalValue *GV = MO.getGlobal();
+    Function *F = dyn_cast<Function>(GV);
+
+    bool Needfptr=false; // if we're computing an address @ltoff(X), do
+                         // we need to decorate it so it becomes
+                         // @ltoff(@fptr(X)) ?
+    if (F && !isBRCALLinsn /*&& F->isDeclaration()*/)
+      Needfptr=true;
+
+    // if this is the target of a call instruction, we should define
+    // the function somewhere (GNU gas has no problem without this, but
+    // Intel ias rightly complains of an 'undefined symbol')
+
+    if (F /*&& isBRCALLinsn*/ && F->isDeclaration())
+      ExternalFunctionNames.insert(Mang->getValueName(MO.getGlobal()));
+    else
+      if (GV->isDeclaration()) // e.g. stuff like 'stdin'
+        ExternalObjectNames.insert(Mang->getValueName(MO.getGlobal()));
+
+    if (!isBRCALLinsn)
+      O << "@ltoff(";
+    if (Needfptr)
+      O << "@fptr(";
+    O << Mang->getValueName(MO.getGlobal());
+    
+    if (Needfptr && !isBRCALLinsn)
+      O << "#))"; // close both fptr( and ltoff(
+    else {
+      if (Needfptr)
+        O << "#)"; // close only fptr(
+      if (!isBRCALLinsn)
+        O << "#)"; // close only ltoff(
+    }
+    
+    int Offset = MO.getOffset();
+    if (Offset > 0)
+      O << " + " << Offset;
+    else if (Offset < 0)
+      O << " - " << -Offset;
+    return;
+  }
+  case MachineOperand::MO_ExternalSymbol:
+    O << MO.getSymbolName();
+    ExternalFunctionNames.insert(MO.getSymbolName());
+    return;
+  default:
+    O << "<AsmPrinter: unknown operand type: " << MO.getType() << " >"; return;
+  }
+}
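+
+// Illustrative example (not from the original code): taking the address of a
+// hypothetical function "foo" outside of a br.call prints roughly:
+//   @ltoff(@fptr(foo#))
+// while the callee operand of a br.call prints as just "foo".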
+
+/// printMachineInstruction -- Print out a single IA64 LLVM instruction
+/// MI to the current output stream.
+///
+void IA64AsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+  ++EmittedInsts;
+
+  // Call the autogenerated instruction printer routines.
+  printInstruction(MI);
+}
+
+bool IA64AsmPrinter::doInitialization(Module &M) {
+  AsmPrinter::doInitialization(M);
+
+  O << "\n.ident \"LLVM-ia64\"\n\n"
+    << "\t.psr    lsb\n"  // should be "msb" on HP-UX, for starters
+    << "\t.radix  C\n"
+    << "\t.psr    abi64\n"; // we only support 64 bits for now
+  return false;
+}
+
+bool IA64AsmPrinter::doFinalization(Module &M) {
+  const TargetData *TD = TM.getTargetData();
+  
+  // Print out module-level global variables here.
+  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I)
+    if (I->hasInitializer()) {   // External globals require no code
+      // Check to see if this is a special global used by LLVM, if so, emit it.
+      if (EmitSpecialLLVMGlobal(I))
+        continue;
+      
+      O << "\n\n";
+      std::string name = Mang->getValueName(I);
+      Constant *C = I->getInitializer();
+      unsigned Size = TD->getTypeSize(C->getType());
+      unsigned Align = TD->getPreferredTypeAlignmentShift(C->getType());
+      
+      if (C->isNullValue() &&
+          (I->hasLinkOnceLinkage() || I->hasInternalLinkage() ||
+           I->hasWeakLinkage() /* FIXME: Verify correct */)) {
+        SwitchToDataSection(".data", I);
+        if (I->hasInternalLinkage()) {
+          O << "\t.lcomm " << name << "#," << TD->getTypeSize(C->getType())
+          << "," << (1 << Align);
+          O << "\n";
+        } else {
+          O << "\t.common " << name << "#," << TD->getTypeSize(C->getType())
+          << "," << (1 << Align);
+          O << "\n";
+        }
+      } else {
+        switch (I->getLinkage()) {
+          case GlobalValue::LinkOnceLinkage:
+          case GlobalValue::WeakLinkage:   // FIXME: Verify correct for weak.
+                                           // Nonnull linkonce -> weak
+            O << "\t.weak " << name << "\n";
+            O << "\t.section\t.llvm.linkonce.d." << name
+              << ", \"aw\", \"progbits\"\n";
+            SwitchToDataSection("", I);
+            break;
+          case GlobalValue::AppendingLinkage:
+            // FIXME: appending linkage variables should go into a section of
+            // their own, or something.  For now, just emit them as external.
+          case GlobalValue::ExternalLinkage:
+            // If external or appending, declare as a global symbol
+            O << "\t.global " << name << "\n";
+            // FALL THROUGH
+          case GlobalValue::InternalLinkage:
+            SwitchToDataSection(C->isNullValue() ? ".bss" : ".data", I);
+            break;
+          case GlobalValue::GhostLinkage:
+            cerr << "GhostLinkage cannot appear in IA64AsmPrinter!\n";
+            abort();
+          case GlobalValue::DLLImportLinkage:
+            cerr << "DLLImport linkage is not supported by this target!\n";
+            abort();
+          case GlobalValue::DLLExportLinkage:
+            cerr << "DLLExport linkage is not supported by this target!\n";
+            abort();
+          default:
+            assert(0 && "Unknown linkage type!");            
+        }
+        
+        EmitAlignment(Align);
+        O << "\t.type " << name << ",@object\n";
+        O << "\t.size " << name << "," << Size << "\n";
+        O << name << ":\t\t\t\t// " << *C << "\n";
+        EmitGlobalConstant(C);
+      }
+    }
+
+  // we print out ".global X \n .type X, @function" for each external function
+  O << "\n\n// br.call targets referenced (and not defined) above: \n";
+  for (std::set<std::string>::iterator i = ExternalFunctionNames.begin(),
+       e = ExternalFunctionNames.end(); i!=e; ++i) {
+    O << "\t.global " << *i << "\n\t.type " << *i << ", @function\n";
+  }
+  O << "\n\n";
+  
+  // we print out ".global X \n .type X, @object" for each external object
+  O << "\n\n// (external) symbols referenced (and not defined) above: \n";
+  for (std::set<std::string>::iterator i = ExternalObjectNames.begin(),
+       e = ExternalObjectNames.end(); i!=e; ++i) {
+    O << "\t.global " << *i << "\n\t.type " << *i << ", @object\n";
+  }
+  O << "\n\n";
+  
+  AsmPrinter::doFinalization(M);
+  return false; // success
+}
+
+/// createIA64CodePrinterPass - Returns a pass that prints the IA64
+/// assembly code for a MachineFunction to the given output stream, using
+/// the given target machine description.
+///
+FunctionPass *llvm::createIA64CodePrinterPass(std::ostream &o,
+                                              IA64TargetMachine &tm) {
+  return new IA64AsmPrinter(o, tm, tm.getTargetAsmInfo());
+}
+
+
diff --git a/lib/Target/IA64/IA64Bundling.cpp b/lib/Target/IA64/IA64Bundling.cpp
new file mode 100644
index 0000000..6c9fa29
--- /dev/null
+++ b/lib/Target/IA64/IA64Bundling.cpp
@@ -0,0 +1,118 @@
+//===-- IA64Bundling.cpp - IA-64 instruction bundling pass ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Add stops where required to prevent read-after-write and write-after-write
+// dependencies, for both registers and memory addresses. There are exceptions:
+//
+//    - Compare instructions (cmp*, tbit, tnat, fcmp, frcpa) are OK with
+//      WAW dependencies so long as they all target p0, or are of parallel
+//      type (.and*/.or*)
+//
+// FIXME: bundling, for now, is left to the assembler.
+// FIXME: this might be an appropriate place to translate between different
+//        instructions that do the same thing, if this helps bundling.
+// 
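+// A minimal illustrative sketch (assembly shown for exposition only): without
+// a stop, the second add below would read r2 in the same instruction group
+// that writes it, so a stop (;;) must separate the two instructions:
+//
+//    add r2 = r3, r4
+//    ;;
+//    add r5 = r2, r6
+//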
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ia64-codegen"
+#include "IA64.h"
+#include "IA64InstrInfo.h"
+#include "IA64TargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(StopBitsAdded, "Number of stop bits added");
+
+namespace {
+  struct IA64BundlingPass : public MachineFunctionPass {
+    static char ID;
+    /// Target machine description which we query for reg. names, data
+    /// layout, etc.
+    ///
+    IA64TargetMachine &TM;
+
+    IA64BundlingPass(IA64TargetMachine &tm) 
+      : MachineFunctionPass((intptr_t)&ID), TM(tm) { }
+
+    virtual const char *getPassName() const {
+      return "IA64 (Itanium) Bundling Pass";
+    }
+
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+    bool runOnMachineFunction(MachineFunction &F) {
+      bool Changed = false;
+      for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+           FI != FE; ++FI)
+        Changed |= runOnMachineBasicBlock(*FI);
+      return Changed;
+    }
+
+    // XXX: ugly global, but pending writes can cross basic blocks. Note that
+    // taken branches end instruction groups. So we only need to worry about
+    // 'fallthrough' code
+    std::set<unsigned> PendingRegWrites;
+  };
+  char IA64BundlingPass::ID = 0;
+} // end of anonymous namespace
+
+/// createIA64BundlingPass - Returns a pass that adds STOP (;;) instructions
+/// where needed to break dependences; actual bundling is currently left to
+/// the assembler (see the FIXME above).
+///
+FunctionPass *llvm::createIA64BundlingPass(IA64TargetMachine &tm) {
+  return new IA64BundlingPass(tm);
+}
+
+/// runOnMachineBasicBlock - add stops and bundle this MBB.
+///
+bool IA64BundlingPass::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+    MachineInstr *CurrentInsn = I++;
+    std::set<unsigned> CurrentReads, CurrentWrites, OrigWrites;
+
+    for(unsigned i=0; i < CurrentInsn->getNumOperands(); i++) {
+      MachineOperand &MO=CurrentInsn->getOperand(i);
+      if(MO.isRegister()) {
+        if(MO.isUse()) { // TODO: exclude p0
+          CurrentReads.insert(MO.getReg());
+        }
+        if(MO.isDef()) { // TODO: exclude p0
+          CurrentWrites.insert(MO.getReg());
+          OrigWrites.insert(MO.getReg()); // FIXME: use a nondestructive
+                                          // set_intersect instead?
+        }
+      }
+    }
+    
+    // CurrentReads/CurrentWrites contain info for the current instruction.
+    // Does it read or write any registers that are pending a write?
+    // (i.e. not separated by a stop)
+    set_intersect(CurrentReads, PendingRegWrites);
+    set_intersect(CurrentWrites, PendingRegWrites);
+    
+    if(! (CurrentReads.empty() && CurrentWrites.empty()) ) {
+      // there is a conflict, insert a stop and reset PendingRegWrites
+      CurrentInsn = BuildMI(MBB, CurrentInsn,
+                            TM.getInstrInfo()->get(IA64::STOP), 0);
+      PendingRegWrites=OrigWrites; // carry over current writes to next insn
+      Changed=true; StopBitsAdded++; // update stats      
+    } else { // otherwise, track additional pending writes
+      set_union(PendingRegWrites, OrigWrites);
+    }
+  } // onto the next insn in the MBB
+
+  return Changed;
+}
+
diff --git a/lib/Target/IA64/IA64ISelDAGToDAG.cpp b/lib/Target/IA64/IA64ISelDAGToDAG.cpp
new file mode 100644
index 0000000..53b704e
--- /dev/null
+++ b/lib/Target/IA64/IA64ISelDAGToDAG.cpp
@@ -0,0 +1,587 @@
+//===---- IA64ISelDAGToDAG.cpp - IA64 pattern matching inst selector ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for IA64,
+// converting a legalized dag to an IA64 dag.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ia64-codegen"
+#include "IA64.h"
+#include "IA64TargetMachine.h"
+#include "IA64ISelLowering.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Constants.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include <queue>
+#include <set>
+using namespace llvm;
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  /// IA64DAGToDAGISel - IA64 specific code to select IA64 machine
+  /// instructions for SelectionDAG operations.
+  ///
+  class IA64DAGToDAGISel : public SelectionDAGISel {
+    IA64TargetLowering IA64Lowering;
+    unsigned GlobalBaseReg;
+  public:
+    IA64DAGToDAGISel(IA64TargetMachine &TM)
+      : SelectionDAGISel(IA64Lowering), IA64Lowering(*TM.getTargetLowering()) {}
+    
+    virtual bool runOnFunction(Function &Fn) {
+      // Make sure we re-emit a set of the global base reg if necessary
+      GlobalBaseReg = 0;
+      return SelectionDAGISel::runOnFunction(Fn);
+    }
+ 
+    /// getI64Imm - Return a target constant with the specified value, of type
+    /// i64.
+    inline SDOperand getI64Imm(uint64_t Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i64);
+    }
+
+    /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC
+    /// base register.  Return the virtual register that holds this value.
+    // SDOperand getGlobalBaseReg(); TODO: hmm
+    
+    // Select - Convert the specified operand from a target-independent to a
+    // target-specific node if it hasn't already been changed.
+    SDNode *Select(SDOperand N);
+    
+    SDNode *SelectIntImmediateExpr(SDOperand LHS, SDOperand RHS,
+                                   unsigned OCHi, unsigned OCLo,
+                                   bool IsArithmetic = false,
+                                   bool Negate = false);
+    SDNode *SelectBitfieldInsert(SDNode *N);
+
+    /// SelectCC - Select a comparison of the specified values with the
+    /// specified condition code, returning the CR# of the expression.
+    SDOperand SelectCC(SDOperand LHS, SDOperand RHS, ISD::CondCode CC);
+
+    /// SelectAddr - Given the specified address, return the two operands for a
+    /// load/store instruction, and return true if it should be an indexed [r+r]
+    /// operation.
+    bool SelectAddr(SDOperand Addr, SDOperand &Op1, SDOperand &Op2);
+
+    /// InstructionSelectBasicBlock - This callback is invoked by
+    /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+    virtual void InstructionSelectBasicBlock(SelectionDAG &DAG);
+    
+    virtual const char *getPassName() const {
+      return "IA64 (Itanium) DAG->DAG Instruction Selector";
+    } 
+
+// Include the pieces autogenerated from the target description.
+#include "IA64GenDAGISel.inc"
+    
+private:
+    SDNode *SelectDIV(SDOperand Op);
+  };
+}
+
+/// InstructionSelectBasicBlock - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void IA64DAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
+  DEBUG(BB->dump());
+
+  // Select target instructions for the DAG.
+  DAG.setRoot(SelectRoot(DAG.getRoot()));
+  DAG.RemoveDeadNodes();
+  
+  // Emit machine code to BB. 
+  ScheduleAndEmitDAG(DAG);
+}
+
+SDNode *IA64DAGToDAGISel::SelectDIV(SDOperand Op) {
+  SDNode *N = Op.Val;
+  SDOperand Chain = N->getOperand(0);
+  SDOperand Tmp1 = N->getOperand(0);
+  SDOperand Tmp2 = N->getOperand(1);
+  AddToISelQueue(Chain);
+
+  AddToISelQueue(Tmp1);
+  AddToISelQueue(Tmp2);
+
+  bool isFP=false;
+
+  if(MVT::isFloatingPoint(Tmp1.getValueType()))
+    isFP=true;
+    
+  bool isModulus=false; // is it a division or a modulus?
+  bool isSigned=false;
+
+  switch(N->getOpcode()) {
+    case ISD::FDIV:
+    case ISD::SDIV:  isModulus=false; isSigned=true;  break;
+    case ISD::UDIV:  isModulus=false; isSigned=false; break;
+    case ISD::FREM:
+    case ISD::SREM:  isModulus=true;  isSigned=true;  break;
+    case ISD::UREM:  isModulus=true;  isSigned=false; break;
+  }
+
+  // TODO: check for integer divides by powers of 2 (or other simple patterns?)
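+  // (for example, an unsigned divide by 2^k could presumably become a simple
+  //  right shift by k instead of the full frcpa sequence below)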
+
+    SDOperand TmpPR, TmpPR2;
+    SDOperand TmpF1, TmpF2, TmpF3, TmpF4, TmpF5, TmpF6, TmpF7, TmpF8;
+    SDOperand TmpF9, TmpF10,TmpF11,TmpF12,TmpF13,TmpF14,TmpF15;
+    SDNode *Result;
+
+    // we'll need copies of F0 and F1
+    SDOperand F0 = CurDAG->getRegister(IA64::F0, MVT::f64);
+    SDOperand F1 = CurDAG->getRegister(IA64::F1, MVT::f64);
+    
+    // OK, emit some code:
+
+    if(!isFP) {
+      // first, load the inputs into FP regs.
+      TmpF1 =
+        SDOperand(CurDAG->getTargetNode(IA64::SETFSIG, MVT::f64, Tmp1), 0);
+      Chain = TmpF1.getValue(1);
+      TmpF2 =
+        SDOperand(CurDAG->getTargetNode(IA64::SETFSIG, MVT::f64, Tmp2), 0);
+      Chain = TmpF2.getValue(1);
+      
+      // next, convert the inputs to FP
+      if(isSigned) {
+        TmpF3 =
+          SDOperand(CurDAG->getTargetNode(IA64::FCVTXF, MVT::f64, TmpF1), 0);
+        Chain = TmpF3.getValue(1);
+        TmpF4 =
+          SDOperand(CurDAG->getTargetNode(IA64::FCVTXF, MVT::f64, TmpF2), 0);
+        Chain = TmpF4.getValue(1);
+      } else { // is unsigned
+        TmpF3 =
+          SDOperand(CurDAG->getTargetNode(IA64::FCVTXUFS1, MVT::f64, TmpF1), 0);
+        Chain = TmpF3.getValue(1);
+        TmpF4 =
+          SDOperand(CurDAG->getTargetNode(IA64::FCVTXUFS1, MVT::f64, TmpF2), 0);
+        Chain = TmpF4.getValue(1);
+      }
+
+    } else { // this is an FP divide/remainder, so we 'leak' some temp
+             // regs and assign TmpF3=Tmp1, TmpF4=Tmp2
+      TmpF3=Tmp1;
+      TmpF4=Tmp2;
+    }
+
+    // we start by computing an approximate reciprocal (good to 9 bits?)
+    // note, this instruction writes _both_ TmpF5 (answer) and TmpPR (predicate)
+    if(isFP)
+      TmpF5 = SDOperand(CurDAG->getTargetNode(IA64::FRCPAS0, MVT::f64, MVT::i1,
+                                              TmpF3, TmpF4), 0);
+    else
+      TmpF5 = SDOperand(CurDAG->getTargetNode(IA64::FRCPAS1, MVT::f64, MVT::i1,
+                                              TmpF3, TmpF4), 0);
+                                  
+    TmpPR = TmpF5.getValue(1);
+    Chain = TmpF5.getValue(2);
+
+    SDOperand minusB;
+    if(isModulus) { // for remainders, it'll be handy to have
+                             // copies of -input_b
+      minusB = SDOperand(CurDAG->getTargetNode(IA64::SUB, MVT::i64,
+                  CurDAG->getRegister(IA64::r0, MVT::i64), Tmp2), 0);
+      Chain = minusB.getValue(1);
+    }
+    
+    SDOperand TmpE0, TmpY1, TmpE1, TmpY2;
+
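+    // The next four fma-style ops refine the initial estimate y0 = TmpF5
+    // toward 1/b (with b = TmpF4), roughly:  e0 = 1 - b*y0,  y1 = y0 + e0*y0,
+    // e1 = e0*e0,  y2 = y1 + e1*y1  (a sketch of the standard frcpa-based
+    // Newton-Raphson scheme; see Intel application note #245415).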
+    SDOperand OpsE0[] = { TmpF4, TmpF5, F1, TmpPR };
+    TmpE0 = SDOperand(CurDAG->getTargetNode(IA64::CFNMAS1, MVT::f64,
+                                            OpsE0, 4), 0);
+    Chain = TmpE0.getValue(1);
+    SDOperand OpsY1[] = { TmpF5, TmpE0, TmpF5, TmpPR };
+    TmpY1 = SDOperand(CurDAG->getTargetNode(IA64::CFMAS1, MVT::f64,
+                                            OpsY1, 4), 0);
+    Chain = TmpY1.getValue(1);
+    SDOperand OpsE1[] = { TmpE0, TmpE0, F0, TmpPR };
+    TmpE1 = SDOperand(CurDAG->getTargetNode(IA64::CFMAS1, MVT::f64,
+                                            OpsE1, 4), 0);
+    Chain = TmpE1.getValue(1);
+    SDOperand OpsY2[] = { TmpY1, TmpE1, TmpY1, TmpPR };
+    TmpY2 = SDOperand(CurDAG->getTargetNode(IA64::CFMAS1, MVT::f64,
+                                            OpsY2, 4), 0);
+    Chain = TmpY2.getValue(1);
+    
+    if(isFP) { // if this is an FP divide, we finish up here and exit early
+      if(isModulus)
+        assert(0 && "Sorry, try another FORTRAN compiler.");
+ 
+      SDOperand TmpE2, TmpY3, TmpQ0, TmpR0;
+
+      SDOperand OpsE2[] = { TmpE1, TmpE1, F0, TmpPR };
+      TmpE2 = SDOperand(CurDAG->getTargetNode(IA64::CFMAS1, MVT::f64,
+                                              OpsE2, 4), 0);
+      Chain = TmpE2.getValue(1);
+      SDOperand OpsY3[] = { TmpY2, TmpE2, TmpY2, TmpPR };
+      TmpY3 = SDOperand(CurDAG->getTargetNode(IA64::CFMAS1, MVT::f64,
+                                              OpsY3, 4), 0);
+      Chain = TmpY3.getValue(1);
+      SDOperand OpsQ0[] = { Tmp1, TmpY3, F0, TmpPR };
+      TmpQ0 =
+        SDOperand(CurDAG->getTargetNode(IA64::CFMADS1, MVT::f64, // double prec!
+                                        OpsQ0, 4), 0);
+      Chain = TmpQ0.getValue(1);
+      SDOperand OpsR0[] = { Tmp2, TmpQ0, Tmp1, TmpPR };
+      TmpR0 =
+        SDOperand(CurDAG->getTargetNode(IA64::CFNMADS1, MVT::f64, // double prec!
+                                        OpsR0, 4), 0);
+      Chain = TmpR0.getValue(1);
+
+      // we want Result to have the same target register as the frcpa, so
+      // we two-address hack it. See the comment "for this to work..." on
+      // page 48 of Intel application note #245415
+      SDOperand Ops[] = { TmpF5, TmpY3, TmpR0, TmpQ0, TmpPR };
+      Result = CurDAG->getTargetNode(IA64::TCFMADS0, MVT::f64, // d.p. s0 rndg!
+                                     Ops, 5);
+      Chain = SDOperand(Result, 1);
+      return Result; // XXX: early exit!
+    } else { // this is *not* an FP divide, so there's a bit left to do:
+    
+      SDOperand TmpQ2, TmpR2, TmpQ3, TmpQ;
+
+      SDOperand OpsQ2[] = { TmpF3, TmpY2, F0, TmpPR };
+      TmpQ2 = SDOperand(CurDAG->getTargetNode(IA64::CFMAS1, MVT::f64,
+                                              OpsQ2, 4), 0);
+      Chain = TmpQ2.getValue(1);
+      SDOperand OpsR2[] = { TmpF4, TmpQ2, TmpF3, TmpPR };
+      TmpR2 = SDOperand(CurDAG->getTargetNode(IA64::CFNMAS1, MVT::f64,
+                                              OpsR2, 4), 0);
+      Chain = TmpR2.getValue(1);
+      
+      // we want TmpQ3 to have the same target register as the frcpa? maybe we
+      // should two-address hack it. See the comment "for this to work..." on
+      // page 48 of Intel application note #245415
+      SDOperand OpsQ3[] = { TmpF5, TmpR2, TmpY2, TmpQ2, TmpPR };
+      TmpQ3 = SDOperand(CurDAG->getTargetNode(IA64::TCFMAS1, MVT::f64,
+                                         OpsQ3, 5), 0);
+      Chain = TmpQ3.getValue(1);
+
+      // STORY: without these two-address instructions (TCFMAS1 and TCFMADS0)
+      // the FPSWA won't be able to help out in the case of large/tiny
+      // arguments. Other fun bugs may also appear, e.g. 0/x = x, not 0.
+      
+      if(isSigned)
+        TmpQ = SDOperand(CurDAG->getTargetNode(IA64::FCVTFXTRUNCS1,
+                                               MVT::f64, TmpQ3), 0);
+      else
+        TmpQ = SDOperand(CurDAG->getTargetNode(IA64::FCVTFXUTRUNCS1,
+                                               MVT::f64, TmpQ3), 0);
+      
+      Chain = TmpQ.getValue(1);
+
+      if(isModulus) {
+        SDOperand FPminusB =
+          SDOperand(CurDAG->getTargetNode(IA64::SETFSIG, MVT::f64, minusB), 0);
+        Chain = FPminusB.getValue(1);
+        SDOperand Remainder =
+          SDOperand(CurDAG->getTargetNode(IA64::XMAL, MVT::f64,
+                                          TmpQ, FPminusB, TmpF1), 0);
+        Chain = Remainder.getValue(1);
+        Result = CurDAG->getTargetNode(IA64::GETFSIG, MVT::i64, Remainder);
+        Chain = SDOperand(Result, 1);
+      } else { // just an integer divide
+        Result = CurDAG->getTargetNode(IA64::GETFSIG, MVT::i64, TmpQ);
+        Chain = SDOperand(Result, 1);
+      }
+
+      return Result;
+    } // wasn't an FP divide
+}
+
+// Select - Convert the specified operand from a target-independent to a
+// target-specific node if it hasn't already been changed.
+SDNode *IA64DAGToDAGISel::Select(SDOperand Op) {
+  SDNode *N = Op.Val;
+  if (N->getOpcode() >= ISD::BUILTIN_OP_END &&
+      N->getOpcode() < IA64ISD::FIRST_NUMBER)
+    return NULL;   // Already selected.
+
+  switch (N->getOpcode()) {
+  default: break;
+
+  case IA64ISD::BRCALL: { // XXX: this is also a hack!
+    SDOperand Chain = N->getOperand(0);
+    SDOperand InFlag;  // Null incoming flag value.
+
+    AddToISelQueue(Chain);
+    if(N->getNumOperands()==3) { // we have an incoming chain, callee and flag
+      InFlag = N->getOperand(2);
+      AddToISelQueue(InFlag);
+    }
+
+    unsigned CallOpcode;
+    SDOperand CallOperand;
+    
+    // if we can call directly, do so
+    if (GlobalAddressSDNode *GASD =
+      dyn_cast<GlobalAddressSDNode>(N->getOperand(1))) {
+      CallOpcode = IA64::BRCALL_IPREL_GA;
+      CallOperand = CurDAG->getTargetGlobalAddress(GASD->getGlobal(), MVT::i64);
+    } else if (isa<ExternalSymbolSDNode>(N->getOperand(1))) {
+      // FIXME: we currently NEED this case for correctness, to avoid
+      // "non-pic code with imm reloc.n against dynamic symbol" errors
+    CallOpcode = IA64::BRCALL_IPREL_ES;
+    CallOperand = N->getOperand(1);
+  } else {
+    // otherwise we need to load the function descriptor,
+    // load the branch target (function)'s entry point and GP,
+    // branch (call) then restore the GP
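+    // (conceptually, and only as a sketch:  entry = *(fd); gp' = *(fd + 8);
+    //  r1 = gp'; b6 = entry; then an indirect br.call through b6)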
+    SDOperand FnDescriptor = N->getOperand(1);
+    AddToISelQueue(FnDescriptor);
+   
+    // load the branch target's entry point [mem] and 
+    // GP value [mem+8]
+    SDOperand targetEntryPoint=
+      SDOperand(CurDAG->getTargetNode(IA64::LD8, MVT::i64, FnDescriptor), 0);
+    Chain = targetEntryPoint.getValue(1);
+    SDOperand targetGPAddr=
+      SDOperand(CurDAG->getTargetNode(IA64::ADDS, MVT::i64, 
+                                      FnDescriptor,
+                                      CurDAG->getConstant(8, MVT::i64)), 0);
+    Chain = targetGPAddr.getValue(1);
+    SDOperand targetGP =
+      SDOperand(CurDAG->getTargetNode(IA64::LD8, MVT::i64, targetGPAddr), 0);
+    Chain = targetGP.getValue(1);
+
+    Chain = CurDAG->getCopyToReg(Chain, IA64::r1, targetGP, InFlag);
+    InFlag = Chain.getValue(1);
+    Chain = CurDAG->getCopyToReg(Chain, IA64::B6, targetEntryPoint, InFlag); // FLAG these?
+    InFlag = Chain.getValue(1);
+    
+    CallOperand = CurDAG->getRegister(IA64::B6, MVT::i64);
+    CallOpcode = IA64::BRCALL_INDIRECT;
+  }
+ 
+   // Finally, once everything is setup, emit the call itself
+   if(InFlag.Val)
+     Chain = SDOperand(CurDAG->getTargetNode(CallOpcode, MVT::Other, MVT::Flag,
+                                             CallOperand, InFlag), 0);
+   else // there might be no arguments
+     Chain = SDOperand(CurDAG->getTargetNode(CallOpcode, MVT::Other, MVT::Flag,
+                                             CallOperand, Chain), 0);
+   InFlag = Chain.getValue(1);
+
+   std::vector<SDOperand> CallResults;
+
+   CallResults.push_back(Chain);
+   CallResults.push_back(InFlag);
+
+   for (unsigned i = 0, e = CallResults.size(); i != e; ++i)
+     ReplaceUses(Op.getValue(i), CallResults[i]);
+   return NULL;
+  }
+  
+  case IA64ISD::GETFD: {
+    SDOperand Input = N->getOperand(0);
+    AddToISelQueue(Input);
+    return CurDAG->getTargetNode(IA64::GETFD, MVT::i64, Input);
+  } 
+  
+  case ISD::FDIV:
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::SREM:
+  case ISD::UREM:
+    return SelectDIV(Op);
+ 
+  case ISD::TargetConstantFP: {
+    SDOperand Chain = CurDAG->getEntryNode(); // this is a constant, so..
+
+    SDOperand V;
+    if (cast<ConstantFPSDNode>(N)->isExactlyValue(+0.0)) {
+      V = CurDAG->getCopyFromReg(Chain, IA64::F0, MVT::f64);
+    } else if (cast<ConstantFPSDNode>(N)->isExactlyValue(+1.0)) {
+      V = CurDAG->getCopyFromReg(Chain, IA64::F1, MVT::f64);
+    } else
+      assert(0 && "Unexpected FP constant!");
+    
+    ReplaceUses(SDOperand(N, 0), V);
+    return 0;
+  }
+
+  case ISD::FrameIndex: { // TODO: reduce creepiness
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    if (N->hasOneUse())
+      return CurDAG->SelectNodeTo(N, IA64::MOV, MVT::i64,
+                                  CurDAG->getTargetFrameIndex(FI, MVT::i64));
+    else
+      return CurDAG->getTargetNode(IA64::MOV, MVT::i64,
+                                   CurDAG->getTargetFrameIndex(FI, MVT::i64));
+  }
+
+  case ISD::ConstantPool: { // TODO: nuke the constant pool
+    // (ia64 doesn't need one)
+    ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N);
+    Constant *C = CP->getConstVal();
+    SDOperand CPI = CurDAG->getTargetConstantPool(C, MVT::i64,
+                                                  CP->getAlignment());
+    return CurDAG->getTargetNode(IA64::ADDL_GA, MVT::i64, // ?
+                                 CurDAG->getRegister(IA64::r1, MVT::i64), CPI);
+  }
+
+  case ISD::GlobalAddress: {
+    GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
+    SDOperand GA = CurDAG->getTargetGlobalAddress(GV, MVT::i64);
+    SDOperand Tmp =
+      SDOperand(CurDAG->getTargetNode(IA64::ADDL_GA, MVT::i64, 
+                                      CurDAG->getRegister(IA64::r1,
+                                                          MVT::i64), GA), 0);
+    return CurDAG->getTargetNode(IA64::LD8, MVT::i64, Tmp);
+  }
+  
+/* XXX
+   case ISD::ExternalSymbol: {
+     SDOperand EA = CurDAG->getTargetExternalSymbol(
+       cast<ExternalSymbolSDNode>(N)->getSymbol(),
+       MVT::i64);
+     SDOperand Tmp = CurDAG->getTargetNode(IA64::ADDL_EA, MVT::i64, 
+                                           CurDAG->getRegister(IA64::r1,
+                                                               MVT::i64),
+                                           EA);
+     return CurDAG->getTargetNode(IA64::LD8, MVT::i64, Tmp);
+   }
+*/
+
+  case ISD::LOAD: { // FIXME: load -1, not 1, for bools?
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    SDOperand Chain = LD->getChain();
+    SDOperand Address = LD->getBasePtr();
+    AddToISelQueue(Chain);
+    AddToISelQueue(Address);
+
+    MVT::ValueType TypeBeingLoaded = LD->getLoadedVT();
+    unsigned Opc;
+    switch (TypeBeingLoaded) {
+    default:
+#ifndef NDEBUG
+      N->dump(CurDAG);
+#endif
+      assert(0 && "Cannot load this type!");
+    case MVT::i1: { // this is a bool
+      Opc = IA64::LD1; // first we load a byte, then compare for != 0
+      if(N->getValueType(0) == MVT::i1) { // XXX: early exit!
+        return CurDAG->SelectNodeTo(N, IA64::CMPNE, MVT::i1, MVT::Other, 
+                    SDOperand(CurDAG->getTargetNode(Opc, MVT::i64, Address), 0),
+                                    CurDAG->getRegister(IA64::r0, MVT::i64), 
+                                    Chain);
+      }
+      /* otherwise, we want to load a bool into something bigger: LD1
+         will do that for us, so we just fall through */
+    }
+    case MVT::i8:  Opc = IA64::LD1; break;
+    case MVT::i16: Opc = IA64::LD2; break;
+    case MVT::i32: Opc = IA64::LD4; break;
+    case MVT::i64: Opc = IA64::LD8; break;
+    
+    case MVT::f32: Opc = IA64::LDF4; break;
+    case MVT::f64: Opc = IA64::LDF8; break;
+    }
+
+    // TODO: comment this
+    return CurDAG->SelectNodeTo(N, Opc, N->getValueType(0), MVT::Other,
+                                Address, Chain);
+  }
+  
+  case ISD::STORE: {
+    StoreSDNode *ST = cast<StoreSDNode>(N);
+    SDOperand Address = ST->getBasePtr();
+    SDOperand Chain = ST->getChain();
+    AddToISelQueue(Address);
+    AddToISelQueue(Chain);
+   
+    unsigned Opc;
+    if (ISD::isNON_TRUNCStore(N)) {
+      switch (N->getOperand(1).getValueType()) {
+      default: assert(0 && "unknown type in store");
+      case MVT::i1: { // this is a bool
+        Opc = IA64::ST1; // we store either 0 or 1 as a byte 
+        // first load zero!
+        SDOperand Initial = CurDAG->getCopyFromReg(Chain, IA64::r0, MVT::i64);
+        Chain = Initial.getValue(1);
+        // then load 1 into the same reg iff the predicate to store is 1
+        SDOperand Tmp = ST->getValue();
+        AddToISelQueue(Tmp);
+        Tmp =
+          SDOperand(CurDAG->getTargetNode(IA64::TPCADDS, MVT::i64, Initial,
+                                          CurDAG->getTargetConstant(1, MVT::i64),
+                                          Tmp), 0);
+        return CurDAG->SelectNodeTo(N, Opc, MVT::Other, Address, Tmp, Chain);
+      }
+      case MVT::i64: Opc = IA64::ST8;  break;
+      case MVT::f64: Opc = IA64::STF8; break;
+      }
+    } else { // Truncating store
+      switch(ST->getStoredVT()) {
+      default: assert(0 && "unknown type in truncstore");
+      case MVT::i8:  Opc = IA64::ST1;  break;
+      case MVT::i16: Opc = IA64::ST2;  break;
+      case MVT::i32: Opc = IA64::ST4;  break;
+      case MVT::f32: Opc = IA64::STF4; break;
+      }
+    }
+    
+    SDOperand N1 = N->getOperand(1);
+    SDOperand N2 = N->getOperand(2);
+    AddToISelQueue(N1);
+    AddToISelQueue(N2);
+    return CurDAG->SelectNodeTo(N, Opc, MVT::Other, N2, N1, Chain);
+  }
+
+  case ISD::BRCOND: {
+    SDOperand Chain = N->getOperand(0);
+    SDOperand CC = N->getOperand(1);
+    AddToISelQueue(Chain);
+    AddToISelQueue(CC);
+    MachineBasicBlock *Dest =
+      cast<BasicBlockSDNode>(N->getOperand(2))->getBasicBlock();
+    //FIXME - we do NOT need long branches all the time
+    return CurDAG->SelectNodeTo(N, IA64::BRLCOND_NOTCALL, MVT::Other, CC, 
+                                CurDAG->getBasicBlock(Dest), Chain);
+  }
+
+  case ISD::CALLSEQ_START:
+  case ISD::CALLSEQ_END: {
+    int64_t Amt = cast<ConstantSDNode>(N->getOperand(1))->getValue();
+    unsigned Opc = N->getOpcode() == ISD::CALLSEQ_START ?
+      IA64::ADJUSTCALLSTACKDOWN : IA64::ADJUSTCALLSTACKUP;
+    SDOperand N0 = N->getOperand(0);
+    AddToISelQueue(N0);
+    return CurDAG->SelectNodeTo(N, Opc, MVT::Other, getI64Imm(Amt), N0);
+  }
+
+  case ISD::BR:
+    // FIXME: we don't need long branches all the time!
+    SDOperand N0 = N->getOperand(0);
+    AddToISelQueue(N0);
+    return CurDAG->SelectNodeTo(N, IA64::BRL_NOTCALL, MVT::Other, 
+                                N->getOperand(1), N0);
+  }
+  
+  return SelectCode(Op);
+}
+
+
+/// createIA64DAGToDAGInstructionSelector - This pass converts a legalized DAG
+/// into an IA64-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *
+llvm::createIA64DAGToDAGInstructionSelector(IA64TargetMachine &TM) {
+  return new IA64DAGToDAGISel(TM);
+}
+
diff --git a/lib/Target/IA64/IA64ISelLowering.cpp b/lib/Target/IA64/IA64ISelLowering.cpp
new file mode 100644
index 0000000..0237a9a
--- /dev/null
+++ b/lib/Target/IA64/IA64ISelLowering.cpp
@@ -0,0 +1,602 @@
+//===-- IA64ISelLowering.cpp - IA64 DAG Lowering Implementation -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IA64ISelLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64ISelLowering.h"
+#include "IA64MachineFunctionInfo.h"
+#include "IA64TargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+using namespace llvm;
+
+IA64TargetLowering::IA64TargetLowering(TargetMachine &TM)
+  : TargetLowering(TM) {
+ 
+      // register class for general registers
+      addRegisterClass(MVT::i64, IA64::GRRegisterClass);
+
+      // register class for FP registers
+      addRegisterClass(MVT::f64, IA64::FPRegisterClass);
+
+      // register class for predicate registers
+      addRegisterClass(MVT::i1, IA64::PRRegisterClass);
+
+      setLoadXAction(ISD::EXTLOAD          , MVT::i1   , Promote);
+
+      setLoadXAction(ISD::ZEXTLOAD         , MVT::i1   , Expand);
+
+      setLoadXAction(ISD::SEXTLOAD         , MVT::i1   , Expand);
+      setLoadXAction(ISD::SEXTLOAD         , MVT::i8   , Expand);
+      setLoadXAction(ISD::SEXTLOAD         , MVT::i16  , Expand);
+      setLoadXAction(ISD::SEXTLOAD         , MVT::i32  , Expand);
+
+      setOperationAction(ISD::BRIND            , MVT::Other, Expand);
+      setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
+      setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
+      setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
+
+      // ia64 uses SELECT not SELECT_CC
+      setOperationAction(ISD::SELECT_CC        , MVT::Other,  Expand);
+      
+      // We need to handle ISD::RET for void functions ourselves,
+      // so we get a chance to restore ar.pfs before adding a
+      // br.ret insn
+      setOperationAction(ISD::RET, MVT::Other, Custom);
+
+      setSetCCResultType(MVT::i1);
+      setShiftAmountType(MVT::i64);
+
+      setOperationAction(ISD::FREM             , MVT::f32  , Expand);
+      setOperationAction(ISD::FREM             , MVT::f64  , Expand);
+
+      setOperationAction(ISD::UREM             , MVT::f32  , Expand);
+      setOperationAction(ISD::UREM             , MVT::f64  , Expand);
+
+      setOperationAction(ISD::MEMMOVE          , MVT::Other, Expand);
+      setOperationAction(ISD::MEMSET           , MVT::Other, Expand);
+      setOperationAction(ISD::MEMCPY           , MVT::Other, Expand);
+      
+      setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
+      setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
+
+      // We don't support sin/cos/sqrt
+      setOperationAction(ISD::FSIN , MVT::f64, Expand);
+      setOperationAction(ISD::FCOS , MVT::f64, Expand);
+      setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+      setOperationAction(ISD::FSIN , MVT::f32, Expand);
+      setOperationAction(ISD::FCOS , MVT::f32, Expand);
+      setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+
+      // FIXME: IA64 supports fcopysign natively!
+      setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+      setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+      
+      // We don't have line number support yet.
+      setOperationAction(ISD::LOCATION, MVT::Other, Expand);
+      setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+      setOperationAction(ISD::LABEL, MVT::Other, Expand);
+
+      //IA64 has these, but they are not implemented
+      setOperationAction(ISD::CTTZ , MVT::i64  , Expand);
+      setOperationAction(ISD::CTLZ , MVT::i64  , Expand);
+      setOperationAction(ISD::ROTL , MVT::i64  , Expand);
+      setOperationAction(ISD::ROTR , MVT::i64  , Expand);
+      setOperationAction(ISD::BSWAP, MVT::i64  , Expand);  // mux @rev
+
+      // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+      setOperationAction(ISD::VAARG             , MVT::Other, Custom);
+      setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+      
+      // Use the default implementation.
+      setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+      setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+      setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+      setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+      setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+      // Thread Local Storage
+      setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+
+      setStackPointerRegisterToSaveRestore(IA64::r12);
+
+      setJumpBufSize(704); // on ia64-linux, jmp_bufs are 704 bytes..
+      setJumpBufAlignment(16); // ...and must be 16-byte aligned
+      
+      computeRegisterProperties();
+
+      setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+      addLegalFPImmediate(+0.0);
+      addLegalFPImmediate(+1.0);
+}
+
+const char *IA64TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return 0;
+  case IA64ISD::GETFD:  return "IA64ISD::GETFD";
+  case IA64ISD::BRCALL: return "IA64ISD::BRCALL";  
+  case IA64ISD::RET_FLAG: return "IA64ISD::RET_FLAG";
+  }
+}
+  
+
+std::vector<SDOperand>
+IA64TargetLowering::LowerArguments(Function &F, SelectionDAG &DAG) {
+  std::vector<SDOperand> ArgValues;
+  //
+  // add beautiful description of IA64 stack frame format
+  // here (from intel 24535803.pdf most likely)
+  //
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  
+  GP = MF.getSSARegMap()->createVirtualRegister(getRegClassFor(MVT::i64));
+  SP = MF.getSSARegMap()->createVirtualRegister(getRegClassFor(MVT::i64));
+  RP = MF.getSSARegMap()->createVirtualRegister(getRegClassFor(MVT::i64));
+  
+  MachineBasicBlock& BB = MF.front();
+
+  unsigned args_int[] = {IA64::r32, IA64::r33, IA64::r34, IA64::r35,
+                         IA64::r36, IA64::r37, IA64::r38, IA64::r39};
+
+  unsigned args_FP[] = {IA64::F8, IA64::F9, IA64::F10, IA64::F11,
+                        IA64::F12,IA64::F13,IA64::F14, IA64::F15};
+
+  unsigned argVreg[8];
+  unsigned argPreg[8];
+  unsigned argOpc[8];
+
+  unsigned used_FPArgs = 0; // how many FP args have been used so far?
+
+  unsigned ArgOffset = 0;
+  int count = 0;
+
+  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I)
+    {
+      SDOperand newroot, argt;
+      if(count < 8) { // need to fix this logic? maybe.
+
+        switch (getValueType(I->getType())) {
+          default:
+            assert(0 && "ERROR in LowerArgs: can't lower this type of arg.\n"); 
+          case MVT::f32:
+            // fixme? (well, will need to for weird FP structy stuff,
+            // see intel ABI docs)
+          case MVT::f64:
+//XXX            BuildMI(&BB, IA64::IDEF, 0, args_FP[used_FPArgs]);
+            MF.addLiveIn(args_FP[used_FPArgs]); // mark this reg as liveIn
+            // floating point args go into f8..f15 as needed
+            // (hence the used_FPArgs++ below)
+            argVreg[count] =
+              MF.getSSARegMap()->createVirtualRegister(getRegClassFor(MVT::f64));
+            argPreg[count] = args_FP[used_FPArgs++];
+            argOpc[count] = IA64::FMOV;
+            argt = newroot = DAG.getCopyFromReg(DAG.getRoot(), argVreg[count],
+                                                MVT::f64);
+            if (I->getType() == Type::FloatTy)
+              argt = DAG.getNode(ISD::FP_ROUND, MVT::f32, argt);
+            break;
+          case MVT::i1: // NOTE: as far as C abi stuff goes,
+                        // bools are just boring old ints
+          case MVT::i8:
+          case MVT::i16:
+          case MVT::i32:
+          case MVT::i64:
+//XXX            BuildMI(&BB, IA64::IDEF, 0, args_int[count]);
+            MF.addLiveIn(args_int[count]); // mark this register as liveIn
+            argVreg[count] =
+            MF.getSSARegMap()->createVirtualRegister(getRegClassFor(MVT::i64));
+            argPreg[count] = args_int[count];
+            argOpc[count] = IA64::MOV;
+            argt = newroot =
+              DAG.getCopyFromReg(DAG.getRoot(), argVreg[count], MVT::i64);
+            if ( getValueType(I->getType()) != MVT::i64)
+              argt = DAG.getNode(ISD::TRUNCATE, getValueType(I->getType()),
+                  newroot);
+            break;
+        }
+      } else { // more than 8 args go into the frame
+        // Create the frame index object for this incoming parameter...
+        ArgOffset = 16 + 8 * (count - 8);
+        int FI = MFI->CreateFixedObject(8, ArgOffset);
+
+        // Create the SelectionDAG nodes corresponding to a load
+        // from this parameter.
+        SDOperand FIN = DAG.getFrameIndex(FI, MVT::i64);
+        argt = newroot = DAG.getLoad(getValueType(I->getType()),
+                                     DAG.getEntryNode(), FIN, NULL, 0);
+      }
+      ++count;
+      DAG.setRoot(newroot.getValue(1));
+      ArgValues.push_back(argt);
+    }
+
+
+  // Create a vreg to hold the output of (what will become)
+  // the "alloc" instruction
+  VirtGPR = MF.getSSARegMap()->createVirtualRegister(getRegClassFor(MVT::i64));
+  BuildMI(&BB, TII->get(IA64::PSEUDO_ALLOC), VirtGPR);
+  // we create a PSEUDO_ALLOC (pseudo)instruction for now
+/*
+  BuildMI(&BB, IA64::IDEF, 0, IA64::r1);
+
+  // hmm:
+  BuildMI(&BB, IA64::IDEF, 0, IA64::r12);
+  BuildMI(&BB, IA64::IDEF, 0, IA64::rp);
+  // ..hmm.
+  
+  BuildMI(&BB, IA64::MOV, 1, GP).addReg(IA64::r1);
+
+  // hmm:
+  BuildMI(&BB, IA64::MOV, 1, SP).addReg(IA64::r12);
+  BuildMI(&BB, IA64::MOV, 1, RP).addReg(IA64::rp);
+  // ..hmm.
+*/
+
+  unsigned tempOffset=0;
+
+  // if this is a varargs function, we simply lower llvm.va_start by
+  // pointing to the first entry
+  if(F.isVarArg()) {
+    tempOffset=0;
+    VarArgsFrameIndex = MFI->CreateFixedObject(8, tempOffset);
+  }
+
+  // here we actually do the moving of args, and store them to the stack
+  // too if this is a varargs function:
+  for (int i = 0; i < count && i < 8; ++i) {
+    BuildMI(&BB, TII->get(argOpc[i]), argVreg[i]).addReg(argPreg[i]);
+    if(F.isVarArg()) {
+      // if this is a varargs function, we copy the input registers to the stack
+      int FI = MFI->CreateFixedObject(8, tempOffset);
+      tempOffset+=8;   //XXX: is it safe to use r22 like this?
+      BuildMI(&BB, TII->get(IA64::MOV), IA64::r22).addFrameIndex(FI);
+      // FIXME: we should use st8.spill here, one day
+      BuildMI(&BB, TII->get(IA64::ST8), IA64::r22).addReg(argPreg[i]);
+    }
+  }
+
+  // Finally, inform the code generator which regs we return values in.
+  // (see the ISD::RET: case in the instruction selector)
+  switch (getValueType(F.getReturnType())) {
+  default: assert(0 && "i have no idea where to return this type!");
+  case MVT::isVoid: break;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+  case MVT::i64:
+    MF.addLiveOut(IA64::r8);
+    break;
+  case MVT::f32:
+  case MVT::f64:
+    MF.addLiveOut(IA64::F8);
+    break;
+  }
+
+  return ArgValues;
+}
+
+std::pair<SDOperand, SDOperand>
+IA64TargetLowering::LowerCallTo(SDOperand Chain,
+                                const Type *RetTy, bool RetTyIsSigned, 
+                                bool isVarArg, unsigned CallingConv, 
+                                bool isTailCall, SDOperand Callee, 
+                                ArgListTy &Args, SelectionDAG &DAG) {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  unsigned NumBytes = 16;
+  unsigned outRegsUsed = 0;
+
+  if (Args.size() > 8) {
+    NumBytes += (Args.size() - 8) * 8;
+    outRegsUsed = 8;
+  } else {
+    outRegsUsed = Args.size();
+  }
+
+  // FIXME? this WILL fail if we ever try to pass around an arg that
+  // consumes more than a single output slot (a 'real' double, an int128,
+  // some sort of aggregate, etc.), as we'll underestimate how many 'outX'
+  // registers we use. Hopefully, the assembler will notice.
+  MF.getInfo<IA64FunctionInfo>()->outRegsUsed=
+    std::max(outRegsUsed, MF.getInfo<IA64FunctionInfo>()->outRegsUsed);
+
+  // keep stack frame 16-byte aligned
+  // assert(NumBytes==((NumBytes+15) & ~15) && 
+  //        "stack frame not 16-byte aligned!");
+  NumBytes = (NumBytes+15) & ~15;
+  
+  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
+
+  SDOperand StackPtr;
+  std::vector<SDOperand> Stores;
+  std::vector<SDOperand> Converts;
+  std::vector<SDOperand> RegValuesToPass;
+  unsigned ArgOffset = 16;
+  
+  for (unsigned i = 0, e = Args.size(); i != e; ++i)
+    {
+      SDOperand Val = Args[i].Node;
+      MVT::ValueType ObjectVT = Val.getValueType();
+      SDOperand ValToStore(0, 0), ValToConvert(0, 0);
+      unsigned ObjSize=8;
+      switch (ObjectVT) {
+      default: assert(0 && "unexpected argument type!");
+      case MVT::i1:
+      case MVT::i8:
+      case MVT::i16:
+      case MVT::i32: {
+        //promote to 64-bits, sign/zero extending based on type
+        //of the argument
+        ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+        if (Args[i].isSExt)
+          ExtendKind = ISD::SIGN_EXTEND;
+        else if (Args[i].isZExt)
+          ExtendKind = ISD::ZERO_EXTEND;
+        Val = DAG.getNode(ExtendKind, MVT::i64, Val);
+        // XXX: fall through
+      }
+      case MVT::i64:
+        //ObjSize = 8;
+        if(RegValuesToPass.size() >= 8) {
+          ValToStore = Val;
+        } else {
+          RegValuesToPass.push_back(Val);
+        }
+        break;
+      case MVT::f32:
+        //promote to 64-bits
+        Val = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Val);
+        // XXX: fall through
+      case MVT::f64:
+        if(RegValuesToPass.size() >= 8) {
+          ValToStore = Val;
+        } else {
+          RegValuesToPass.push_back(Val);
+          if(1 /* TODO: if(calling external or variadic function) */ ) {
+            ValToConvert = Val; // additionally pass this FP value as an int
+          }
+        }
+        break;
+      }
+      
+      if(ValToStore.Val) {
+        if(!StackPtr.Val) {
+          StackPtr = DAG.getRegister(IA64::r12, MVT::i64);
+        }
+        SDOperand PtrOff = DAG.getConstant(ArgOffset, getPointerTy());
+        PtrOff = DAG.getNode(ISD::ADD, MVT::i64, StackPtr, PtrOff);
+        Stores.push_back(DAG.getStore(Chain, ValToStore, PtrOff, NULL, 0));
+        ArgOffset += ObjSize;
+      }
+
+      if(ValToConvert.Val) {
+        Converts.push_back(DAG.getNode(IA64ISD::GETFD, MVT::i64, ValToConvert)); 
+      }
+    }
+
+  // Emit all stores, make sure they occur before any copies into physregs.
+  if (!Stores.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &Stores[0],Stores.size());
+
+  static const unsigned IntArgRegs[] = {
+    IA64::out0, IA64::out1, IA64::out2, IA64::out3, 
+    IA64::out4, IA64::out5, IA64::out6, IA64::out7
+  };
+
+  static const unsigned FPArgRegs[] = {
+    IA64::F8,  IA64::F9,  IA64::F10, IA64::F11, 
+    IA64::F12, IA64::F13, IA64::F14, IA64::F15
+  };
+
+  SDOperand InFlag;
+  
+  // save the current GP, SP and RP. FIXME: do we need to do all 3 always?
+  SDOperand GPBeforeCall = DAG.getCopyFromReg(Chain, IA64::r1, MVT::i64, InFlag);
+  Chain = GPBeforeCall.getValue(1);
+  InFlag = Chain.getValue(2);
+  SDOperand SPBeforeCall = DAG.getCopyFromReg(Chain, IA64::r12, MVT::i64, InFlag);
+  Chain = SPBeforeCall.getValue(1);
+  InFlag = Chain.getValue(2);
+  SDOperand RPBeforeCall = DAG.getCopyFromReg(Chain, IA64::rp, MVT::i64, InFlag);
+  Chain = RPBeforeCall.getValue(1);
+  InFlag = Chain.getValue(2);
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing integer args into regs out[0-7]
+  // mapped 1:1 and the FP args into regs F8-F15 "lazily"
+  // TODO: for performance, we should only copy FP args into int regs when we
+  // know this is required (i.e. for variadic or external (unknown) functions)
+
+  // first do the FP->(integer representation) conversions; these are
+  // flagged for now, but shouldn't have to be (TODO)
+  unsigned seenConverts = 0;
+  for (unsigned i = 0, e = RegValuesToPass.size(); i != e; ++i) {
+    if(MVT::isFloatingPoint(RegValuesToPass[i].getValueType())) {
+      Chain = DAG.getCopyToReg(Chain, IntArgRegs[i], Converts[seenConverts++], 
+                               InFlag);
+      InFlag = Chain.getValue(1);
+    }
+  }
+
+  // next copy args into the usual places, these are flagged
+  unsigned usedFPArgs = 0;
+  for (unsigned i = 0, e = RegValuesToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain,
+      MVT::isInteger(RegValuesToPass[i].getValueType()) ?
+        IntArgRegs[i] : FPArgRegs[usedFPArgs++], RegValuesToPass[i], InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+/*
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i64);
+  }
+*/
+
+  std::vector<MVT::ValueType> NodeTys;
+  std::vector<SDOperand> CallOperands;
+  NodeTys.push_back(MVT::Other);   // Returns a chain
+  NodeTys.push_back(MVT::Flag);    // Returns a flag for retval copy to use.
+  CallOperands.push_back(Chain);
+  CallOperands.push_back(Callee);
+
+  // emit the call itself
+  if (InFlag.Val)
+    CallOperands.push_back(InFlag);
+  else
+    assert(0 && "this should never happen!\n");
+
+  // to make way for a hack:
+  Chain = DAG.getNode(IA64ISD::BRCALL, NodeTys,
+                      &CallOperands[0], CallOperands.size());
+  InFlag = Chain.getValue(1);
+
+  // restore the GP, SP and RP after the call  
+  Chain = DAG.getCopyToReg(Chain, IA64::r1, GPBeforeCall, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, IA64::r12, SPBeforeCall, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, IA64::rp, RPBeforeCall, InFlag);
+  InFlag = Chain.getValue(1);
+ 
+  std::vector<MVT::ValueType> RetVals;
+  RetVals.push_back(MVT::Other);
+  RetVals.push_back(MVT::Flag);
+ 
+  MVT::ValueType RetTyVT = getValueType(RetTy);
+  SDOperand RetVal;
+  if (RetTyVT != MVT::isVoid) {
+    switch (RetTyVT) {
+    default: assert(0 && "Unknown value type to return!");
+    case MVT::i1: { // bools are just like other integers (returned in r8)
+      // we *could* fall through to the truncate below, but this saves a
+      // few redundant predicate ops
+      SDOperand boolInR8 = DAG.getCopyFromReg(Chain, IA64::r8, MVT::i64,InFlag);
+      InFlag = boolInR8.getValue(2);
+      Chain = boolInR8.getValue(1);
+      SDOperand zeroReg = DAG.getCopyFromReg(Chain, IA64::r0, MVT::i64, InFlag);
+      InFlag = zeroReg.getValue(2);
+      Chain = zeroReg.getValue(1);
+      
+      RetVal = DAG.getSetCC(MVT::i1, boolInR8, zeroReg, ISD::SETNE);
+      break;
+    }
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      RetVal = DAG.getCopyFromReg(Chain, IA64::r8, MVT::i64, InFlag);
+      Chain = RetVal.getValue(1);
+      
+      // keep track of whether it is sign or zero extended (todo: bools?)
+/* XXX
+      RetVal = DAG.getNode(RetTy->isSigned() ? ISD::AssertSext :ISD::AssertZext,
+                           MVT::i64, RetVal, DAG.getValueType(RetTyVT));
+*/
+      RetVal = DAG.getNode(ISD::TRUNCATE, RetTyVT, RetVal);
+      break;
+    case MVT::i64:
+      RetVal = DAG.getCopyFromReg(Chain, IA64::r8, MVT::i64, InFlag);
+      Chain = RetVal.getValue(1);
+      InFlag = RetVal.getValue(2); // XXX dead
+      break;
+    case MVT::f32:
+      RetVal = DAG.getCopyFromReg(Chain, IA64::F8, MVT::f64, InFlag);
+      Chain = RetVal.getValue(1);
+      RetVal = DAG.getNode(ISD::TRUNCATE, MVT::f32, RetVal);
+      break;
+    case MVT::f64:
+      RetVal = DAG.getCopyFromReg(Chain, IA64::F8, MVT::f64, InFlag);
+      Chain = RetVal.getValue(1);
+      InFlag = RetVal.getValue(2); // XXX dead
+      break;
+    }
+  }
+  
+  Chain = DAG.getNode(ISD::CALLSEQ_END, MVT::Other, Chain,
+                      DAG.getConstant(NumBytes, getPointerTy()));
+  
+  return std::make_pair(RetVal, Chain);
+}
+
+SDOperand IA64TargetLowering::
+LowerOperation(SDOperand Op, SelectionDAG &DAG) {
+  switch (Op.getOpcode()) {
+  default: assert(0 && "Should not custom lower this!");
+  case ISD::GlobalTLSAddress:
+    assert(0 && "TLS not implemented for IA64.");
+  case ISD::RET: {
+    SDOperand AR_PFSVal, Copy;
+    
+    switch(Op.getNumOperands()) {
+     default:
+      assert(0 && "Do not know how to return this many arguments!");
+      abort();
+    case 1: 
+      AR_PFSVal = DAG.getCopyFromReg(Op.getOperand(0), VirtGPR, MVT::i64);
+      AR_PFSVal = DAG.getCopyToReg(AR_PFSVal.getValue(1), IA64::AR_PFS, 
+                                   AR_PFSVal);
+      return DAG.getNode(IA64ISD::RET_FLAG, MVT::Other, AR_PFSVal);
+    case 3: {
+      // Copy the result into the output register & restore ar.pfs
+      MVT::ValueType ArgVT = Op.getOperand(1).getValueType();
+      unsigned ArgReg = MVT::isInteger(ArgVT) ? IA64::r8 : IA64::F8;
+
+      AR_PFSVal = DAG.getCopyFromReg(Op.getOperand(0), VirtGPR, MVT::i64);
+      Copy = DAG.getCopyToReg(AR_PFSVal.getValue(1), ArgReg, Op.getOperand(1),
+                              SDOperand());
+      AR_PFSVal = DAG.getCopyToReg(Copy.getValue(0), IA64::AR_PFS, AR_PFSVal,
+                                   Copy.getValue(1));
+      return DAG.getNode(IA64ISD::RET_FLAG, MVT::Other,
+                         AR_PFSVal, AR_PFSVal.getValue(1));
+    }
+    }
+    return SDOperand();
+  }
+  case ISD::VAARG: {
+    MVT::ValueType VT = getPointerTy();
+    SrcValueSDNode *SV = cast<SrcValueSDNode>(Op.getOperand(2));
+    SDOperand VAList = DAG.getLoad(VT, Op.getOperand(0), Op.getOperand(1), 
+                                   SV->getValue(), SV->getOffset());
+    // Increment the pointer, VAList, to the next vaarg
+    SDOperand VAIncr = DAG.getNode(ISD::ADD, VT, VAList, 
+                                   DAG.getConstant(MVT::getSizeInBits(VT)/8, 
+                                                   VT));
+    // Store the incremented VAList to the legalized pointer
+    VAIncr = DAG.getStore(VAList.getValue(1), VAIncr,
+                          Op.getOperand(1), SV->getValue(), SV->getOffset());
+    // Load the actual argument out of the pointer VAList
+    return DAG.getLoad(Op.getValueType(), VAIncr, VAList, NULL, 0);
+  }
+  case ISD::VASTART: {
+    // vastart just stores the address of the VarArgsFrameIndex slot into the
+    // memory location argument.
+    SDOperand FR = DAG.getFrameIndex(VarArgsFrameIndex, MVT::i64);
+    SrcValueSDNode *SV = cast<SrcValueSDNode>(Op.getOperand(2));
+    return DAG.getStore(Op.getOperand(0), FR, 
+                        Op.getOperand(1), SV->getValue(), SV->getOffset());
+  }
+  // Frame & Return address.  Currently unimplemented
+  case ISD::RETURNADDR:         break;
+  case ISD::FRAMEADDR:          break;
+  }
+  return SDOperand();
+}
diff --git a/lib/Target/IA64/IA64ISelLowering.h b/lib/Target/IA64/IA64ISelLowering.h
new file mode 100644
index 0000000..6bc5534
--- /dev/null
+++ b/lib/Target/IA64/IA64ISelLowering.h
@@ -0,0 +1,74 @@
+//===-- IA64ISelLowering.h - IA64 DAG Lowering Interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that IA64 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_IA64_IA64ISELLOWERING_H
+#define LLVM_TARGET_IA64_IA64ISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "IA64.h"
+
+namespace llvm {
+  namespace IA64ISD {
+    enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END+IA64::INSTRUCTION_LIST_END,
+
+      /// GETFD - the getf.d instruction takes a floating point operand and
+      /// returns its 64-bit memory representation as an i64
+      GETFD,
+
+      // TODO: explain this hack
+      BRCALL,
+      
+      // RET_FLAG - Return with a flag operand
+      RET_FLAG
+    };
+  }  
+  
+  class IA64TargetLowering : public TargetLowering {
+    int VarArgsFrameIndex;            // FrameIndex for start of varargs area.
+    //int ReturnAddrIndex;              // FrameIndex for return slot.
+    unsigned GP, SP, RP; // FIXME - clean this mess up
+	  
+  public:
+    IA64TargetLowering(TargetMachine &TM);
+
+    unsigned VirtGPR; // this is public so it can be accessed in the selector
+                      // for ISD::RET. add an accessor instead? FIXME
+	    
+    const char *getTargetNodeName(unsigned Opcode) const;
+      
+    /// LowerArguments - This hook must be implemented to indicate how we should
+    /// lower the arguments for the specified function into the specified DAG.
+    virtual std::vector<SDOperand>
+      LowerArguments(Function &F, SelectionDAG &DAG);
+    
+    /// LowerCallTo - This hook lowers an abstract call to a function into an
+    /// actual call.
+    virtual std::pair<SDOperand, SDOperand>
+      LowerCallTo(SDOperand Chain, const Type *RetTy, bool RetTyIsSigned, 
+                  bool isVarArg, unsigned CC, bool isTailCall, 
+                  SDOperand Callee, ArgListTy &Args, SelectionDAG &DAG);
+    
+    /// LowerOperation - Provide custom lowering hooks for specific operations
+    /// (currently RET, VAARG and VASTART; see IA64ISelLowering.cpp).
+    virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
+    
+// XXX    virtual MachineBasicBlock *InsertAtEndOfBasicBlock(MachineInstr *MI,
+// XXX                                                      MachineBasicBlock *MBB);
+  };
+}
+
+#endif   // LLVM_TARGET_IA64_IA64ISELLOWERING_H
diff --git a/lib/Target/IA64/IA64InstrBuilder.h b/lib/Target/IA64/IA64InstrBuilder.h
new file mode 100644
index 0000000..f9b5004
--- /dev/null
+++ b/lib/Target/IA64/IA64InstrBuilder.h
@@ -0,0 +1,52 @@
+//===-- IA64InstrBuilder.h - Aids for building IA64 insts -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to simplify generating frame and constant pool
+// references.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IA64_INSTRBUILDER_H
+#define IA64_INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function.  This
+/// reference uses the FrameIndex as its base register until the index is
+/// resolved; a constant offset may be specified as well.
+///
+inline const MachineInstrBuilder&
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0,
+                  bool mem = true) {
+  if (mem)
+    return MIB.addImm(Offset).addFrameIndex(FI);
+  else
+    return MIB.addFrameIndex(FI).addImm(Offset);
+}
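+// Illustrative use (Opc, DestReg and FI are hypothetical, not from this patch):
+//   addFrameReference(BuildMI(MBB, MI, TII.get(Opc), DestReg), FI)
+// appends an immediate offset of 0 followed by the frame index FI to the
+// instruction being built.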
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool.  The
+/// reference uses the ConstantPoolIndex as its base register, retained until
+/// machine code emission or assembly output.  An optional offset may be added
+/// as well.
+///
+inline const MachineInstrBuilder&
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+                         int Offset = 0) {
+  return MIB.addImm(Offset).addConstantPoolIndex(CPI);
+}
+
+} // End llvm namespace
+
+#endif
+
diff --git a/lib/Target/IA64/IA64InstrFormats.td b/lib/Target/IA64/IA64InstrFormats.td
new file mode 100644
index 0000000..ba6c574
--- /dev/null
+++ b/lib/Target/IA64/IA64InstrFormats.td
@@ -0,0 +1,79 @@
+//===- IA64InstrFormats.td - IA64 Instruction Formats ------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+//  - Warning: the stuff in here isn't really being used, so it is mostly
+//             junk. It'll get fixed as the JIT gets built.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+class InstIA64<bits<4> op, dag OL, string asmstr> : Instruction { 
+  // IA64 instruction baseline
+  field bits<41> Inst;
+  let Namespace = "IA64";
+  let OperandList = OL;
+  let AsmString = asmstr;
+
+  let Inst{40-37} = op;
+}
+
+//"Each Itanium instruction is categorized into one of six types."
+//We should have:
+// A, I, M, F, B, L+X
+
+class AForm<bits<4> opcode, bits<6> qpReg, dag OL, string asmstr> : 
+  InstIA64<opcode, OL, asmstr> {
+
+  let Inst{5-0} = qpReg;
+}
+
+class AForm_DAG<bits<4> opcode, bits<6> qpReg, dag OL, string asmstr,
+      list<dag> pattern> : 
+  InstIA64<opcode, OL, asmstr> {
+
+  let Pattern = pattern;
+  let Inst{5-0} = qpReg;
+}
+
+let isBranch = 1, isTerminator = 1 in
+class BForm<bits<4> opcode, bits<6> x6, bits<3> btype, dag OL, string asmstr> :
+  InstIA64<opcode, OL, asmstr> {
+
+  let Inst{32-27} = x6;
+  let Inst{8-6} = btype;
+}
+
+class MForm<bits<4> opcode, bits<6> x6, dag OL, string asmstr> :
+  InstIA64<opcode, OL, asmstr> {
+    bits<7> Ra;
+    bits<7> Rb;
+    bits<16> disp;
+
+    let Inst{35-30} = x6;
+//  let Inst{20-16} = Rb;
+    let Inst{15-0} = disp;
+}
+
+class RawForm<bits<4> opcode, bits<26> rest, dag OL, string asmstr> :
+  InstIA64<opcode, OL, asmstr> {
+    let Inst{25-0} = rest;
+}
+
+// Pseudo instructions.
+class PseudoInstIA64<dag OL, string nm> : InstIA64<0, OL, nm>  {
+}
+
+class PseudoInstIA64_DAG<dag OL, string nm, list<dag> pattern>
+  : InstIA64<0, OL, nm> {
+  let Pattern = pattern;
+}
+
diff --git a/lib/Target/IA64/IA64InstrInfo.cpp b/lib/Target/IA64/IA64InstrInfo.cpp
new file mode 100644
index 0000000..a66c9bc
--- /dev/null
+++ b/lib/Target/IA64/IA64InstrInfo.cpp
@@ -0,0 +1,58 @@
+//===- IA64InstrInfo.cpp - IA64 Instruction Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the IA64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64InstrInfo.h"
+#include "IA64.h"
+#include "IA64InstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "IA64GenInstrInfo.inc"
+using namespace llvm;
+
+IA64InstrInfo::IA64InstrInfo()
+  : TargetInstrInfo(IA64Insts, sizeof(IA64Insts)/sizeof(IA64Insts[0])),
+    RI(*this) {
+}
+
+
+bool IA64InstrInfo::isMoveInstr(const MachineInstr& MI,
+                               unsigned& sourceReg,
+                               unsigned& destReg) const {
+  MachineOpCode oc = MI.getOpcode();
+  if (oc == IA64::MOV || oc == IA64::FMOV) {
+    // TODO: this doesn't detect predicate moves
+    assert(MI.getNumOperands() >= 2 &&
+           /* MI.getOperand(0).isRegister() &&
+              MI.getOperand(1).isRegister() && */
+           "invalid register-register move instruction");
+    if (MI.getOperand(0).isRegister() &&
+        MI.getOperand(1).isRegister()) {
+      // If both operands of the MOV/FMOV are registers, then
+      // yes, this is a move instruction.
+      sourceReg = MI.getOperand(1).getReg();
+      destReg = MI.getOperand(0).getReg();
+      return true;
+    }
+  }
+  return false; // we don't consider e.g. %regN = MOV <FrameIndex #x> a
+                // move instruction
+}
+
+unsigned
+IA64InstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
+                            MachineBasicBlock *FBB,
+                            const std::vector<MachineOperand> &Cond)const {
+  // Can only insert uncond branches so far.
+  assert(Cond.empty() && !FBB && TBB && "Can only handle uncond branches!");
+  BuildMI(&MBB, get(IA64::BRL_NOTCALL)).addMBB(TBB);
+  return 1;
+}
diff --git a/lib/Target/IA64/IA64InstrInfo.h b/lib/Target/IA64/IA64InstrInfo.h
new file mode 100644
index 0000000..3bb14e0
--- /dev/null
+++ b/lib/Target/IA64/IA64InstrInfo.h
@@ -0,0 +1,49 @@
+//===- IA64InstrInfo.h - IA64 Instruction Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the IA64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IA64INSTRUCTIONINFO_H
+#define IA64INSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "IA64RegisterInfo.h"
+
+namespace llvm {
+
+class IA64InstrInfo : public TargetInstrInfo {
+  const IA64RegisterInfo RI;
+public:
+  IA64InstrInfo();
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const MRegisterInfo &getRegisterInfo() const { return RI; }
+
+  //
+  // Return true if the instruction is a register to register move and
+  // leave the source and dest operands in the passed parameters.
+  //
+  virtual bool isMoveInstr(const MachineInstr& MI,
+                           unsigned& sourceReg,
+                           unsigned& destReg) const;
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                                const std::vector<MachineOperand> &Cond) const;
+
+};
+
+} // End llvm namespace
+
+#endif
+
diff --git a/lib/Target/IA64/IA64InstrInfo.td b/lib/Target/IA64/IA64InstrInfo.td
new file mode 100644
index 0000000..57f5f66
--- /dev/null
+++ b/lib/Target/IA64/IA64InstrInfo.td
@@ -0,0 +1,744 @@
+//===- IA64InstrInfo.td - Describe the IA64 Instruction Set -----*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the IA64 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+include "IA64InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// IA-64 specific DAG Nodes.
+//
+
+def IA64getfd : SDNode<"IA64ISD::GETFD", SDTFPToIntOp, []>;
+
+def SDT_IA64RetFlag : SDTypeProfile<0, 0, []>;
+def retflag         : SDNode<"IA64ISD::RET_FLAG", SDT_IA64RetFlag,
+	                   [SDNPHasChain, SDNPOptInFlag]>;
+
+//===---------
+// Instruction types
+
+class isA { bit A=1; } // I or M unit
+class isM { bit M=1; } // M unit
+class isI { bit I=1; } // I unit
+class isB { bit B=1; } // B unit
+class isF { bit F=1; } // F unit
+class isLX { bit LX=1; } // I/B
+
+//===---------
+
+def u2imm : Operand<i8>;
+def u6imm : Operand<i8>;
+def s8imm : Operand<i8> {
+  let PrintMethod = "printS8ImmOperand";
+}
+def s14imm  : Operand<i64> {
+  let PrintMethod = "printS14ImmOperand";
+}
+def s22imm  : Operand<i64> {
+  let PrintMethod = "printS22ImmOperand";
+}
+def u64imm  : Operand<i64> {
+  let PrintMethod = "printU64ImmOperand";
+}
+def s64imm  : Operand<i64> {
+  let PrintMethod = "printS64ImmOperand";
+}
+
+let PrintMethod = "printGlobalOperand" in
+  def globaladdress : Operand<i64>;
+
+// the asmprinter needs to know about calls
+let PrintMethod = "printCallOperand" in
+  def calltarget : Operand<i64>;
+  
+/* new daggy action!!! */
+
+def is32ones : PatLeaf<(i64 imm), [{
+  // is32ones predicate - True if the immediate is 0x00000000FFFFFFFF 
+  // Used to create ZXT4s appropriately 
+  uint64_t v = (uint64_t)N->getValue();
+  return (v == 0x00000000FFFFFFFFLL);
+}]>;
+
+// isMIXable predicates - True if the immediate is
+// 0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF
+// etc, through 0x00000000FFFFFFFF
+// Used to test for the suitability of mix* 
+def isMIX1Lable: PatLeaf<(i64 imm), [{
+  return((uint64_t)N->getValue()==0xFF00FF00FF00FF00LL);
+}]>;
+def isMIX1Rable: PatLeaf<(i64 imm), [{
+  return((uint64_t)N->getValue()==0x00FF00FF00FF00FFLL);
+}]>;
+def isMIX2Lable: PatLeaf<(i64 imm), [{
+  return((uint64_t)N->getValue()==0xFFFF0000FFFF0000LL);
+}]>;
+def isMIX2Rable: PatLeaf<(i64 imm), [{
+  return((uint64_t)N->getValue()==0x0000FFFF0000FFFFLL);
+}]>;
+def isMIX4Lable: PatLeaf<(i64 imm), [{
+  return((uint64_t)N->getValue()==0xFFFFFFFF00000000LL);
+}]>;
+def isMIX4Rable: PatLeaf<(i64 imm), [{
+  return((uint64_t)N->getValue()==0x00000000FFFFFFFFLL);
+}]>;
+
+def isSHLADDimm: PatLeaf<(i64 imm), [{
+  // isSHLADDimm predicate - True if the immediate is exactly 1, 2, 3 or 4
+  // - 0 is *not* okay.
+  // Used to create shladd instructions appropriately
+  int64_t v = (int64_t)N->getValue();
+  return (v >= 1 && v <= 4);
+}]>;
+
+def immSExt14  : PatLeaf<(i64 imm), [{
+  // immSExt14 predicate - True if the immediate fits in a 14-bit sign extended
+  // field.  Used by instructions like 'adds'.
+  int64_t v = (int64_t)N->getValue();
+  return (v <= 8191 && v >= -8192);
+}]>;
+
+// imm64 predicate - True if the immediate fits in a 64-bit
+// field - i.e., always true.  Used to keep movl happy.
+def imm64  : PatLeaf<(i64 imm)>; 
+
+def ADD  : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+           "add $dst = $src1, $src2",
+	   [(set GR:$dst, (add GR:$src1, GR:$src2))]>, isA;
+
+def ADD1 : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+           "add $dst = $src1, $src2, 1",
+	   [(set GR:$dst, (add (add GR:$src1, GR:$src2), 1))]>, isA;
+
+def ADDS : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, s14imm:$imm),
+           "adds $dst = $imm, $src1",
+	   [(set GR:$dst, (add GR:$src1, immSExt14:$imm))]>, isA;
+ 
+def MOVL : AForm_DAG<0x03, 0x0b, (ops GR:$dst, s64imm:$imm),
+           "movl $dst = $imm",
+	   [(set GR:$dst, imm64:$imm)]>, isLX;
+
+def ADDL_GA : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, globaladdress:$imm),
+           "addl $dst = $imm, $src1",
+	   []>, isA;
+
+// hmm 
+def ADDL_EA : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, calltarget:$imm),
+           "addl $dst = $imm, $src1",
+	   []>, isA;
+ 
+def SUB  : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+           "sub $dst = $src1, $src2",
+	   [(set GR:$dst, (sub GR:$src1, GR:$src2))]>, isA;
+
+def SUB1 : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+           "sub $dst = $src1, $src2, 1",
+           [(set GR:$dst, (add (sub GR:$src1, GR:$src2), -1))]>, isA;
+
+let isTwoAddress = 1 in {
+def TPCADDIMM22 : AForm<0x03, 0x0b,
+  (ops GR:$dst, GR:$src1, s22imm:$imm, PR:$qp),
+    "($qp) add $dst = $imm, $dst">, isA;
+def TPCADDS : AForm_DAG<0x03, 0x0b,
+  (ops GR:$dst, GR:$src1, s14imm:$imm, PR:$qp),
+    "($qp) adds $dst = $imm, $dst",
+    []>, isA;
+def TPCMPIMM8NE : AForm<0x03, 0x0b,
+  (ops PR:$dst, PR:$src1, s22imm:$imm, GR:$src2, PR:$qp),
+    "($qp) cmp.ne $dst , p0 = $imm, $src2">, isA;
+}
+
+// zero extend a bool (predicate reg) into an integer reg
+def ZXTb : Pat<(zext PR:$src),
+          (TPCADDIMM22 (ADDS r0, 0), 1, PR:$src)>;
+def AXTb : Pat<(anyext PR:$src),
+          (TPCADDIMM22 (ADDS r0, 0), 1, PR:$src)>; 
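+// i.e. the expansion is roughly:  adds rX = 0, r0 ;; ($src) add rX = 1, rX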
+
+// normal sign/zero-extends
+def SXT1 : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src), "sxt1 $dst = $src",
+           [(set GR:$dst, (sext_inreg GR:$src, i8))]>, isI;
+def ZXT1 : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src), "zxt1 $dst = $src",
+           [(set GR:$dst, (and GR:$src, 255))]>, isI;
+def SXT2 : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src), "sxt2 $dst = $src",
+           [(set GR:$dst, (sext_inreg GR:$src, i16))]>, isI;
+def ZXT2 : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src), "zxt2 $dst = $src",
+           [(set GR:$dst, (and GR:$src, 65535))]>, isI;
+def SXT4 : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src), "sxt4 $dst = $src",
+           [(set GR:$dst, (sext_inreg GR:$src, i32))]>, isI;
+def ZXT4 : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src), "zxt4 $dst = $src",
+           [(set GR:$dst, (and GR:$src, is32ones))]>, isI;
+
+// fixme: shrs vs shru?
+def MIX1L : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+          "mix1.l $dst = $src1, $src2",
+	  [(set GR:$dst, (or (and GR:$src1, isMIX1Lable),
+	                     (and (srl GR:$src2, (i64 8)), isMIX1Lable)))]>, isI;
+
+def MIX2L : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+          "mix2.l $dst = $src1, $src2",
+	  [(set GR:$dst, (or (and GR:$src1, isMIX2Lable),
+	                     (and (srl GR:$src2, (i64 16)), isMIX2Lable)))]>, isI;
+
+def MIX4L : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+          "mix4.l $dst = $src1, $src2",
+	  [(set GR:$dst, (or (and GR:$src1, isMIX4Lable),
+	                     (and (srl GR:$src2, (i64 32)), isMIX4Lable)))]>, isI;
+
+def MIX1R : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+          "mix1.r $dst = $src1, $src2",
+	  [(set GR:$dst, (or (and (shl GR:$src1, (i64 8)), isMIX1Rable),
+	                     (and GR:$src2, isMIX1Rable)))]>, isI;
+
+def MIX2R : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+          "mix2.r $dst = $src1, $src2",
+	  [(set GR:$dst, (or (and (shl GR:$src1, (i64 16)), isMIX2Rable),
+	                     (and GR:$src2, isMIX2Rable)))]>, isI;
+
+def MIX4R : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+          "mix4.r $dst = $src1, $src2",
+	  [(set GR:$dst, (or (and (shl GR:$src1, (i64 32)), isMIX4Rable),
+	                     (and GR:$src2, isMIX4Rable)))]>, isI;
+
+def GETFSIGD : AForm_DAG<0x03, 0x0b, (ops GR:$dst, FP:$src),
+  "getf.sig $dst = $src",
+  []>, isM;
+
+def SETFSIGD : AForm_DAG<0x03, 0x0b, (ops FP:$dst, GR:$src),
+  "setf.sig $dst = $src",
+  []>, isM;
+
+def XMALD : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3),
+  "xma.l $dst = $src1, $src2, $src3",
+  []>, isF;
+def XMAHD : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3),
+  "xma.h $dst = $src1, $src2, $src3",
+  []>, isF;
+def XMAHUD : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3),
+  "xma.hu $dst = $src1, $src2, $src3",
+  []>, isF;
+
+// pseudocode for integer multiplication 
+def : Pat<(mul GR:$src1, GR:$src2),
+           (GETFSIGD (XMALD (SETFSIGD GR:$src1), (SETFSIGD GR:$src2), F0))>;
+def : Pat<(mulhs GR:$src1, GR:$src2),
+           (GETFSIGD (XMAHD (SETFSIGD GR:$src1), (SETFSIGD GR:$src2), F0))>;
+def : Pat<(mulhu GR:$src1, GR:$src2),
+           (GETFSIGD (XMAHUD (SETFSIGD GR:$src1), (SETFSIGD GR:$src2), F0))>;
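+// i.e. an integer multiply is routed through the FP unit:
+//   setf.sig fA = a ;  setf.sig fB = b ;  xma.l/h/hu fC = fA, fB, f0 ;
+//   getf.sig dst = fC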
+
+// TODO: addp4 (addp4 dst = src, r0 is a 32-bit add)
+// has imm form, too
+
+// def ADDS : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src1, s14imm:$imm),
+//   "adds $dst = $imm, $src1">;
+
+def AND   : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+          "and $dst = $src1, $src2",
+	  [(set GR:$dst, (and GR:$src1, GR:$src2))]>, isA;
+def ANDCM : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+          "andcm $dst = $src1, $src2",
+	  [(set GR:$dst, (and GR:$src1, (not GR:$src2)))]>, isA;
+// TODO: and/andcm/or/xor/add/sub/shift immediate forms
+def OR    : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+          "or $dst = $src1, $src2",
+	  [(set GR:$dst, (or GR:$src1, GR:$src2))]>, isA;
+
+def pOR   : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2, PR:$qp),
+          "($qp) or $dst = $src1, $src2">, isA;
+
+// the following are all a bit unfortunate: we throw away the complement
+// of the compare!
+def CMPEQ : AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2),
+          "cmp.eq $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (seteq GR:$src1, GR:$src2))]>, isA;
+def CMPGT : AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2),
+          "cmp.gt $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setgt GR:$src1, GR:$src2))]>, isA;
+def CMPGE : AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2),
+          "cmp.ge $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setge GR:$src1, GR:$src2))]>, isA;
+def CMPLT : AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2),
+          "cmp.lt $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setlt GR:$src1, GR:$src2))]>, isA;
+def CMPLE : AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2),
+          "cmp.le $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setle GR:$src1, GR:$src2))]>, isA;
+def CMPNE : AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2),
+          "cmp.ne $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setne GR:$src1, GR:$src2))]>, isA;
+def CMPLTU: AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2),
+          "cmp.ltu $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setult GR:$src1, GR:$src2))]>, isA;
+def CMPGTU: AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2),
+          "cmp.gtu $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setugt GR:$src1, GR:$src2))]>, isA;
+def CMPLEU: AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2),
+          "cmp.leu $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setule GR:$src1, GR:$src2))]>, isA;
+def CMPGEU: AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2),
+          "cmp.geu $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setuge GR:$src1, GR:$src2))]>, isA;
+
+// and we do the whole thing again for FP compares!
+def FCMPEQ : AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2),
+          "fcmp.eq $dst, p0 = $src1, $src2",
+          [(set PR:$dst, (seteq FP:$src1, FP:$src2))]>, isF;
+def FCMPGT : AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2),
+          "fcmp.gt $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setgt FP:$src1, FP:$src2))]>, isF;
+def FCMPGE : AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2),
+          "fcmp.ge $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setge FP:$src1, FP:$src2))]>, isF;
+def FCMPLT : AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2),
+          "fcmp.lt $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setlt FP:$src1, FP:$src2))]>, isF;
+def FCMPLE : AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2),
+          "fcmp.le $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setle FP:$src1, FP:$src2))]>, isF;
+def FCMPNE : AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2),
+          "fcmp.neq $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setne FP:$src1, FP:$src2))]>, isF;
+def FCMPLTU: AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2),
+          "fcmp.lt $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setult FP:$src1, FP:$src2))]>, isF;
+def FCMPGTU: AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2),
+          "fcmp.gt $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setugt FP:$src1, FP:$src2))]>, isF;
+def FCMPLEU: AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2),
+          "fcmp.le $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setule FP:$src1, FP:$src2))]>, isF;
+def FCMPGEU: AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2),
+          "fcmp.ge $dst, p0 = $src1, $src2",
+	  [(set PR:$dst, (setuge FP:$src1, FP:$src2))]>, isF;
+
+def PCMPEQUNCR0R0 : AForm<0x03, 0x0b, (ops PR:$dst, PR:$qp),
+    "($qp) cmp.eq.unc $dst, p0 = r0, r0">, isA;
+
+def : Pat<(trunc GR:$src),  // truncate i64 to i1
+          (CMPNE GR:$src, r0)>; // $src!=0? If so, PR:$dst=true
+	  
+let isTwoAddress=1 in {
+  def TPCMPEQR0R0 : AForm<0x03, 0x0b, (ops PR:$dst, PR:$bogus, PR:$qp),
+    "($qp) cmp.eq $dst, p0 = r0, r0">, isA;
+  def TPCMPNER0R0 : AForm<0x03, 0x0b, (ops PR:$dst, PR:$bogus, PR:$qp),
+    "($qp) cmp.ne $dst, p0 = r0, r0">, isA;
+}
+
+/* our pseudocode for OR on predicates is:
+pC = pA OR pB
+-------------
+(pA) cmp.eq.unc pC,p0 = r0,r0  // pC = pA
+ ;;
+(pB) cmp.eq pC,p0 = r0,r0 // if (pB) pC = 1 */
+
+def bOR   : Pat<(or PR:$src1, PR:$src2),
+          (TPCMPEQR0R0 (PCMPEQUNCR0R0 PR:$src1), PR:$src2)>;
+
+/* our pseudocode for AND on predicates is:
+ *
+(pA) cmp.eq.unc pC,p0 = r0,r0   // pC = pA
+     cmp.eq pTemp,p0 = r0,r0    // pTemp = NOT pB
+     ;;
+(pB) cmp.ne pTemp,p0 = r0,r0
+     ;;
+(pTemp)cmp.ne pC,p0 = r0,r0    // if (NOT pB) pC = 0  */
+
+def bAND  : Pat<(and PR:$src1, PR:$src2),
+          ( TPCMPNER0R0 (PCMPEQUNCR0R0 PR:$src1),
+	    (TPCMPNER0R0 (CMPEQ r0, r0), PR:$src2) )>;
+
+/* one possible routine for XOR on predicates is:
+
+      // Compute px = py ^ pz
+        // using sum of products: px = (py & !pz) | (pz & !py)
+        // Uses 5 instructions in 3 cycles.
+        // cycle 1
+(pz)    cmp.eq.unc      px = r0, r0     // px = pz
+(py)    cmp.eq.unc      pt = r0, r0     // pt = py
+        ;;
+        // cycle 2
+(pt)    cmp.ne.and      px = r0, r0     // px = px & !pt (px = pz & !pt)
+(pz)    cmp.ne.and      pt = r0, r0     // pt = pt & !pz
+        ;;
+        } { .mmi
+        // cycle 3
+(pt)    cmp.eq.or       px = r0, r0     // px = px | pt
+
+*** Another, which we use here, requires one scratch GR. It is:
+
+        mov             rt = 0          // initialize rt off critical path
+        ;;
+
+        // cycle 1
+(pz)    cmp.eq.unc      px = r0, r0     // px = pz
+(pz)    mov             rt = 1          // rt = pz
+        ;;
+        // cycle 2
+(py)    cmp.ne          px = 1, rt      // if (py) px = !pz
+
+.. these routines kindly provided by Jim Hull
+*/
+  
+def bXOR  : Pat<(xor PR:$src1, PR:$src2),
+          (TPCMPIMM8NE (PCMPEQUNCR0R0 PR:$src2), 1,
+	               (TPCADDS (ADDS r0, 0), 1, PR:$src2),
+                        PR:$src1)>;
+
+def XOR   : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+          "xor $dst = $src1, $src2",
+	  [(set GR:$dst, (xor GR:$src1, GR:$src2))]>, isA;
+
+def SHLADD: AForm_DAG<0x03, 0x0b, (ops GR:$dst,GR:$src1,s64imm:$imm,GR:$src2),
+          "shladd $dst = $src1, $imm, $src2",
+          [(set GR:$dst, (add GR:$src2, (shl GR:$src1, isSHLADDimm:$imm)))]>, isA;
+
+def SHL   : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+          "shl $dst = $src1, $src2",
+	  [(set GR:$dst, (shl GR:$src1, GR:$src2))]>, isI;
+
+def SHRU  : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+          "shr.u $dst = $src1, $src2",
+	  [(set GR:$dst, (srl GR:$src1, GR:$src2))]>, isI;
+
+def SHRS  : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2),
+          "shr $dst = $src1, $src2",
+	  [(set GR:$dst, (sra GR:$src1, GR:$src2))]>, isI;
+
+def MOV : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src), "mov $dst = $src">, isA;
+def FMOV : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src),
+  "mov $dst = $src">, isF; // XXX: there _is_ no fmov
+def PMOV : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src, PR:$qp),
+  "($qp) mov $dst = $src">, isA;
+
+def SPILL_ALL_PREDICATES_TO_GR : AForm<0x03, 0x0b, (ops GR:$dst),
+  "mov $dst = pr">, isI;
+def FILL_ALL_PREDICATES_FROM_GR : AForm<0x03, 0x0b, (ops GR:$src),
+  "mov pr = $src">, isI;
+
+let isTwoAddress = 1 in {
+  def CMOV : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src2, GR:$src, PR:$qp),
+    "($qp) mov $dst = $src">, isA;
+}
+
+def PFMOV : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src, PR:$qp),
+  "($qp) mov $dst = $src">, isF;
+
+let isTwoAddress = 1 in {
+  def CFMOV : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src2, FP:$src, PR:$qp),
+    "($qp) mov $dst = $src">, isF;
+}
+
+def SELECTINT : Pat<(select PR:$which, GR:$src1, GR:$src2),
+          (CMOV (MOV GR:$src2), GR:$src1, PR:$which)>; // note order!
+def SELECTFP : Pat<(select PR:$which, FP:$src1, FP:$src2),
+          (CFMOV (FMOV FP:$src2), FP:$src1, PR:$which)>; // note order!
+// TODO: can do this faster, w/o using any integer regs (see pattern isel)
+def SELECTBOOL : Pat<(select PR:$which, PR:$src1, PR:$src2), // note order!
+          (CMPNE (CMOV
+            (MOV (TPCADDIMM22 (ADDS r0, 0), 1, PR:$src2)),
+            (TPCADDIMM22 (ADDS r0, 0), 1, PR:$src1), PR:$which), r0)>;
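+// (SELECTBOOL materializes both predicates as 0/1 integers, selects between
+// them with a conditional integer move, and turns the result back into a
+// predicate by comparing it against r0.)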
+
+// load constants of various sizes // FIXME: prettyprint -ve constants
+def : Pat<(i64 immSExt14:$imm), (ADDS r0, immSExt14:$imm)>;
+def : Pat<(i1 -1), (CMPEQ r0, r0)>; // TODO: this should just be a ref to p0
+def : Pat<(i1  0), (CMPNE r0, r0)>; // TODO: any instruction actually *using*
+                                    //       this predicate should be killed!
+
+// TODO: support postincrement (reg, imm9) loads+stores - this needs more
+// tablegen support
+
+def IDEF : PseudoInstIA64<(ops variable_ops), "// IDEF">;
+
+def IDEF_GR_D : PseudoInstIA64_DAG<(ops GR:$reg), "// $reg = IDEF",
+    [(set GR:$reg, (undef))]>;
+def IDEF_FP_D : PseudoInstIA64_DAG<(ops FP:$reg), "// $reg = IDEF",
+    [(set FP:$reg, (undef))]>;
+def IDEF_PR_D : PseudoInstIA64_DAG<(ops PR:$reg), "// $reg = IDEF",
+    [(set PR:$reg, (undef))]>;
+
+def IUSE : PseudoInstIA64<(ops variable_ops), "// IUSE">;
+def ADJUSTCALLSTACKUP : PseudoInstIA64<(ops variable_ops),
+                                        "// ADJUSTCALLSTACKUP">;
+def ADJUSTCALLSTACKDOWN : PseudoInstIA64<(ops variable_ops),
+                                         "// ADJUSTCALLSTACKDOWN">;
+def PSEUDO_ALLOC : PseudoInstIA64<(ops GR:$foo), "// PSEUDO_ALLOC">;
+
+def ALLOC : AForm<0x03, 0x0b,
+  (ops GR:$dst, i8imm:$inputs, i8imm:$locals, i8imm:$outputs, i8imm:$rotating),
+    "alloc $dst = ar.pfs,$inputs,$locals,$outputs,$rotating">, isM;
+
+let isTwoAddress = 1 in {
+  def TCMPNE : AForm<0x03, 0x0b,
+  (ops PR:$dst, PR:$src2, GR:$src3, GR:$src4),
+    "cmp.ne $dst, p0 = $src3, $src4">, isA;
+  
+  def TPCMPEQOR : AForm<0x03, 0x0b,
+  (ops PR:$dst, PR:$src2, GR:$src3, GR:$src4, PR:$qp),
+    "($qp) cmp.eq.or $dst, p0 = $src3, $src4">, isA;
+  
+  def TPCMPNE : AForm<0x03, 0x0b,
+  (ops PR:$dst, PR:$src2, GR:$src3, GR:$src4, PR:$qp),
+    "($qp) cmp.ne $dst, p0 = $src3, $src4">, isA;
+  
+  def TPCMPEQ : AForm<0x03, 0x0b,
+  (ops PR:$dst, PR:$src2, GR:$src3, GR:$src4, PR:$qp),
+    "($qp) cmp.eq $dst, p0 = $src3, $src4">, isA;
+}
+
+def MOVSIMM14 : AForm<0x03, 0x0b, (ops GR:$dst, s14imm:$imm),
+  "mov $dst = $imm">, isA;
+def MOVSIMM22 : AForm<0x03, 0x0b, (ops GR:$dst, s22imm:$imm),
+  "mov $dst = $imm">, isA;
+def MOVLIMM64 : AForm<0x03, 0x0b, (ops GR:$dst, s64imm:$imm),
+  "movl $dst = $imm">, isLX;
+
+def SHLI : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src1, u6imm:$imm), 
+  "shl $dst = $src1, $imm">, isI;
+def SHRUI : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src1, u6imm:$imm),
+  "shr.u $dst = $src1, $imm">, isI;
+def SHRSI : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src1, u6imm:$imm),
+  "shr $dst = $src1, $imm">, isI;
+
+def EXTRU : AForm<0x03, 0x0b,
+  (ops GR:$dst, GR:$src1, u6imm:$imm1, u6imm:$imm2),
+  "extr.u $dst = $src1, $imm1, $imm2">, isI;
+
+def DEPZ : AForm<0x03, 0x0b,
+  (ops GR:$dst, GR:$src1, u6imm:$imm1, u6imm:$imm2),
+  "dep.z $dst = $src1, $imm1, $imm2">, isI;
+
+def PCMPEQOR : AForm<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2, PR:$qp),
+  "($qp) cmp.eq.or $dst, p0 = $src1, $src2">, isA;
+def PCMPEQUNC : AForm<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2, PR:$qp),
+  "($qp) cmp.eq.unc $dst, p0 = $src1, $src2">, isA;
+def PCMPNE : AForm<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2, PR:$qp),
+  "($qp) cmp.ne $dst, p0 = $src1, $src2">, isA;
+
+// two destinations! 
+def BCMPEQ : AForm<0x03, 0x0b, (ops PR:$dst1, PR:$dst2, GR:$src1, GR:$src2),
+  "cmp.eq $dst1, $dst2 = $src1, $src2">, isA;
+
+def ADDIMM14 : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src1, s14imm:$imm),
+  "adds $dst = $imm, $src1">, isA;
+
+def ADDIMM22 : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src1, s22imm:$imm),
+  "add $dst = $imm, $src1">, isA;
+def CADDIMM22 : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src1, s22imm:$imm, PR:$qp),
+  "($qp) add $dst = $imm, $src1">, isA;
+
+def SUBIMM8 : AForm<0x03, 0x0b, (ops GR:$dst, s8imm:$imm, GR:$src2),
+  "sub $dst = $imm, $src2">, isA;
+
+let isStore = 1, noResults = 1 in {
+  def ST1 : AForm<0x03, 0x0b, (ops GR:$dstPtr, GR:$value),
+    "st1 [$dstPtr] = $value">, isM;
+  def ST2 : AForm<0x03, 0x0b, (ops GR:$dstPtr, GR:$value),
+    "st2 [$dstPtr] = $value">, isM;
+  def ST4 : AForm<0x03, 0x0b, (ops GR:$dstPtr, GR:$value),
+    "st4 [$dstPtr] = $value">, isM;
+  def ST8 : AForm<0x03, 0x0b, (ops GR:$dstPtr, GR:$value),
+    "st8 [$dstPtr] = $value">, isM;
+  def STF4 : AForm<0x03, 0x0b, (ops GR:$dstPtr, FP:$value),
+    "stfs [$dstPtr] = $value">, isM;
+  def STF8 : AForm<0x03, 0x0b, (ops GR:$dstPtr, FP:$value),
+    "stfd [$dstPtr] = $value">, isM;
+  def STF_SPILL : AForm<0x03, 0x0b, (ops GR:$dstPtr, FP:$value),
+    "stf.spill [$dstPtr] = $value">, isM;
+}
+
+let isLoad = 1 in {
+  def LD1 : AForm<0x03, 0x0b, (ops GR:$dst, GR:$srcPtr),
+    "ld1 $dst = [$srcPtr]">, isM;
+  def LD2 : AForm<0x03, 0x0b, (ops GR:$dst, GR:$srcPtr),
+    "ld2 $dst = [$srcPtr]">, isM;
+  def LD4 : AForm<0x03, 0x0b, (ops GR:$dst, GR:$srcPtr),
+    "ld4 $dst = [$srcPtr]">, isM;
+  def LD8 : AForm<0x03, 0x0b, (ops GR:$dst, GR:$srcPtr),
+    "ld8 $dst = [$srcPtr]">, isM;
+  def LDF4 : AForm<0x03, 0x0b, (ops FP:$dst, GR:$srcPtr),
+    "ldfs $dst = [$srcPtr]">, isM;
+  def LDF8 : AForm<0x03, 0x0b, (ops FP:$dst, GR:$srcPtr),
+    "ldfd $dst = [$srcPtr]">, isM;
+  def LDF_FILL : AForm<0x03, 0x0b, (ops FP:$dst, GR:$srcPtr),
+    "ldf.fill $dst = [$srcPtr]">, isM;
+}
+
+def POPCNT : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src),
+  "popcnt $dst = $src",
+  [(set GR:$dst, (ctpop GR:$src))]>, isI;
+
+// some FP stuff:  // TODO: single-precision stuff?
+def FADD : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2),
+  "fadd $dst = $src1, $src2",
+  [(set FP:$dst, (fadd FP:$src1, FP:$src2))]>, isF;
+def FADDS: AForm<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2),
+  "fadd.s $dst = $src1, $src2">, isF;
+def FSUB : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2),
+  "fsub $dst = $src1, $src2",
+  [(set FP:$dst, (fsub FP:$src1, FP:$src2))]>, isF;
+def FMPY : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2),
+  "fmpy $dst = $src1, $src2",
+  [(set FP:$dst, (fmul FP:$src1, FP:$src2))]>, isF;
+def FMA : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3),
+  "fma $dst = $src1, $src2, $src3",
+  [(set FP:$dst, (fadd (fmul FP:$src1, FP:$src2), FP:$src3))]>, isF;
+def FMS : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3),
+  "fms $dst = $src1, $src2, $src3",
+  [(set FP:$dst, (fsub (fmul FP:$src1, FP:$src2), FP:$src3))]>, isF;
+def FNMA : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3),
+  "fnma $dst = $src1, $src2, $src3",
+  [(set FP:$dst, (fneg (fadd (fmul FP:$src1, FP:$src2), FP:$src3)))]>, isF;
+def FABS : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src),
+  "fabs $dst = $src",
+  [(set FP:$dst, (fabs FP:$src))]>, isF;
+def FNEG : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src),
+  "fneg $dst = $src",
+  [(set FP:$dst, (fneg FP:$src))]>, isF;
+def FNEGABS : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src),
+  "fnegabs $dst = $src",
+  [(set FP:$dst, (fneg (fabs FP:$src)))]>, isF;
+
+let isTwoAddress=1 in {
+def TCFMAS1 : AForm<0x03, 0x0b,
+  (ops FP:$dst, FP:$bogussrc, FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+    "($qp) fma.s1 $dst = $src1, $src2, $src3">, isF;
+def TCFMADS0 : AForm<0x03, 0x0b,
+  (ops FP:$dst, FP:$bogussrc, FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+    "($qp) fma.d.s0 $dst = $src1, $src2, $src3">, isF;
+}
+
+def CFMAS1 : AForm<0x03, 0x0b,
+  (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+    "($qp) fma.s1 $dst = $src1, $src2, $src3">, isF;
+def CFNMAS1 : AForm<0x03, 0x0b,
+  (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+    "($qp) fnma.s1 $dst = $src1, $src2, $src3">, isF;
+
+def CFMADS1 : AForm<0x03, 0x0b,
+  (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+    "($qp) fma.d.s1 $dst = $src1, $src2, $src3">, isF;
+def CFMADS0 : AForm<0x03, 0x0b,
+  (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+    "($qp) fma.d.s0 $dst = $src1, $src2, $src3">, isF;
+def CFNMADS1 : AForm<0x03, 0x0b,
+  (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3, PR:$qp),
+    "($qp) fnma.d.s1 $dst = $src1, $src2, $src3">, isF;
+
+def FRCPAS0 : AForm<0x03, 0x0b, (ops FP:$dstFR, PR:$dstPR, FP:$src1, FP:$src2),
+  "frcpa.s0 $dstFR, $dstPR = $src1, $src2">, isF;
+def FRCPAS1 : AForm<0x03, 0x0b, (ops FP:$dstFR, PR:$dstPR, FP:$src1, FP:$src2),
+  "frcpa.s1 $dstFR, $dstPR = $src1, $src2">, isF;
+
+def XMAL : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3),
+  "xma.l $dst = $src1, $src2, $src3">, isF;
+
+def FCVTXF : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src),
+  "fcvt.xf $dst = $src">, isF;
+def FCVTXUF : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src),
+  "fcvt.xuf $dst = $src">, isF;
+def FCVTXUFS1 : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src),
+  "fcvt.xuf.s1 $dst = $src">, isF;
+def FCVTFX : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src),
+  "fcvt.fx $dst = $src">, isF;
+def FCVTFXU : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src),
+  "fcvt.fxu $dst = $src">, isF;
+
+def FCVTFXTRUNC : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src),
+  "fcvt.fx.trunc $dst = $src">, isF;
+def FCVTFXUTRUNC : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src),
+  "fcvt.fxu.trunc $dst = $src">, isF;
+
+def FCVTFXTRUNCS1 : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src),
+  "fcvt.fx.trunc.s1 $dst = $src">, isF;
+def FCVTFXUTRUNCS1 : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src),
+  "fcvt.fxu.trunc.s1 $dst = $src">, isF;
+
+def FNORMD : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src),
+  "fnorm.d $dst = $src">, isF;
+
+def GETFD : AForm<0x03, 0x0b, (ops GR:$dst, FP:$src),
+  "getf.d $dst = $src">, isM;
+def SETFD : AForm<0x03, 0x0b, (ops FP:$dst, GR:$src),
+  "setf.d $dst = $src">, isM;
+
+def GETFSIG : AForm<0x03, 0x0b, (ops GR:$dst, FP:$src),
+  "getf.sig $dst = $src">, isM;
+def SETFSIG : AForm<0x03, 0x0b, (ops FP:$dst, GR:$src),
+  "setf.sig $dst = $src">, isM;
+
+// these four FP<->int conversion patterns need checking/cleaning
+def SINT_TO_FP : Pat<(sint_to_fp GR:$src),
+  (FNORMD (FCVTXF (SETFSIG GR:$src)))>;
+def UINT_TO_FP : Pat<(uint_to_fp GR:$src),
+  (FNORMD (FCVTXUF (SETFSIG GR:$src)))>;
+def FP_TO_SINT : Pat<(i64 (fp_to_sint FP:$src)),
+  (GETFSIG (FCVTFXTRUNC FP:$src))>;
+def FP_TO_UINT : Pat<(i64 (fp_to_uint FP:$src)),
+  (GETFSIG (FCVTFXUTRUNC FP:$src))>;
+
+
+let isTerminator = 1, isBranch = 1, noResults = 1 in {
+  def BRL_NOTCALL : RawForm<0x03, 0xb0, (ops i64imm:$dst),
+    "(p0) brl.cond.sptk $dst">, isB;
+  def BRLCOND_NOTCALL : RawForm<0x03, 0xb0, (ops PR:$qp, i64imm:$dst),
+    "($qp) brl.cond.sptk $dst">, isB;
+  def BRCOND_NOTCALL : RawForm<0x03, 0xb0, (ops PR:$qp, GR:$dst),
+    "($qp) br.cond.sptk $dst">, isB;
+}
+
+let isCall = 1, noResults = 1, /* isTerminator = 1, isBranch = 1, */
+  Uses = [out0,out1,out2,out3,out4,out5,out6,out7],
+// all calls clobber non-callee-saved registers, and for now, they are these:
+  Defs = [r2,r3,r8,r9,r10,r11,r14,r15,r16,r17,r18,r19,r20,r21,r22,r23,r24,
+  r25,r26,r27,r28,r29,r30,r31,
+  p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,
+  F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,
+  F32,F33,F34,F35,F36,F37,F38,F39,F40,F41,F42,F43,F44,F45,F46,F47,F48,F49,
+  F50,F51,F52,F53,F54,F55,F56,
+  F57,F58,F59,F60,F61,F62,F63,F64,F65,F66,F67,F68,F69,F70,F71,F72,F73,F74,
+  F75,F76,F77,F78,F79,F80,F81,
+  F82,F83,F84,F85,F86,F87,F88,F89,F90,F91,F92,F93,F94,F95,F96,F97,F98,F99,
+  F100,F101,F102,F103,F104,F105,
+  F106,F107,F108,F109,F110,F111,F112,F113,F114,F115,F116,F117,F118,F119,
+  F120,F121,F122,F123,F124,F125,F126,F127,
+  out0,out1,out2,out3,out4,out5,out6,out7] in {
+// old pattern call
+  def BRCALL: RawForm<0x03, 0xb0, (ops calltarget:$dst),
+  "br.call.sptk rp = $dst">, isB;   // FIXME: teach llvm about branch regs?
+// new daggy stuff!  
+
+// calls a globaladdress
+  def BRCALL_IPREL_GA : RawForm<0x03, 0xb0, (ops calltarget:$dst),
+  "br.call.sptk rp = $dst">, isB;       // FIXME: teach llvm about branch regs?
+// calls an externalsymbol
+  def BRCALL_IPREL_ES : RawForm<0x03, 0xb0, (ops calltarget:$dst),
+  "br.call.sptk rp = $dst">, isB;       // FIXME: teach llvm about branch regs?
+// calls through a function descriptor
+  def BRCALL_INDIRECT : RawForm<0x03, 0xb0, (ops GR:$branchreg),
+  "br.call.sptk rp = $branchreg">, isB; // FIXME: teach llvm about branch regs?
+  def BRLCOND_CALL : RawForm<0x03, 0xb0, (ops PR:$qp, i64imm:$dst),
+    "($qp) brl.cond.call.sptk $dst">, isB;
+  def BRCOND_CALL : RawForm<0x03, 0xb0, (ops PR:$qp, GR:$dst),
+    "($qp) br.cond.call.sptk $dst">, isB;
+}
+
+// Return branch:
+let isTerminator = 1, isReturn = 1, noResults = 1 in
+  def RET : AForm_DAG<0x03, 0x0b, (ops),
+            "br.ret.sptk.many rp",
+            [(retflag)]>, isB; // return
+def : Pat<(ret), (RET)>;
+
+// the evil stop bit of despair
+def STOP : PseudoInstIA64<(ops variable_ops), ";;">;
+
diff --git a/lib/Target/IA64/IA64MachineFunctionInfo.h b/lib/Target/IA64/IA64MachineFunctionInfo.h
new file mode 100644
index 0000000..fb93056
--- /dev/null
+++ b/lib/Target/IA64/IA64MachineFunctionInfo.h
@@ -0,0 +1,34 @@
+//===-- IA64MachineFunctionInfo.h - IA64-specific information ---*- C++ -*-===//
+//===--                   for MachineFunction                 ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares IA64-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IA64MACHINEFUNCTIONINFO_H
+#define IA64MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+//#include "IA64JITInfo.h"
+
+namespace llvm {
+
+class IA64FunctionInfo : public MachineFunctionInfo {
+
+public:
+  // Number of 'out' registers used by this machine function (used to compute
+  // the appropriate entry in the 'alloc' instruction at the top of the
+  // machine function).
+  unsigned outRegsUsed;
+  IA64FunctionInfo(MachineFunction& MF) : outRegsUsed(0) {}
+
+};
+
+} // End llvm namespace
+
+#endif
+
diff --git a/lib/Target/IA64/IA64RegisterInfo.cpp b/lib/Target/IA64/IA64RegisterInfo.cpp
new file mode 100644
index 0000000..08327f2
--- /dev/null
+++ b/lib/Target/IA64/IA64RegisterInfo.cpp
@@ -0,0 +1,388 @@
+//===- IA64RegisterInfo.cpp - IA64 Register Information ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the IA64 implementation of the MRegisterInfo class.  This
+// file is responsible for the frame pointer elimination optimization on IA64.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64.h"
+#include "IA64RegisterInfo.h"
+#include "IA64InstrBuilder.h"
+#include "IA64MachineFunctionInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+IA64RegisterInfo::IA64RegisterInfo(const TargetInstrInfo &tii)
+  : IA64GenRegisterInfo(IA64::ADJUSTCALLSTACKDOWN, IA64::ADJUSTCALLSTACKUP),
+    TII(tii) {}
+
+void IA64RegisterInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                           unsigned SrcReg, int FrameIdx,
+                                           const TargetRegisterClass *RC) const{
+
+  if (RC == IA64::FPRegisterClass) {
+    BuildMI(MBB, MI, TII.get(IA64::STF_SPILL)).addFrameIndex(FrameIdx)
+      .addReg(SrcReg, false, false, true);
+  } else if (RC == IA64::GRRegisterClass) {
+    BuildMI(MBB, MI, TII.get(IA64::ST8)).addFrameIndex(FrameIdx)
+      .addReg(SrcReg, false, false, true);
+  } else if (RC == IA64::PRRegisterClass) {
+    /* we use IA64::r2 as a temporary register for doing this hackery. */
+    // first we load 0:
+    BuildMI(MBB, MI, TII.get(IA64::MOV), IA64::r2).addReg(IA64::r0);
+    // then conditionally add 1:
+    BuildMI(MBB, MI, TII.get(IA64::CADDIMM22), IA64::r2).addReg(IA64::r2)
+      .addImm(1).addReg(SrcReg, false, false, true);
+    // and then store it to the stack
+    BuildMI(MBB, MI, TII.get(IA64::ST8)).addFrameIndex(FrameIdx).addReg(IA64::r2);
+  } else assert(0 &&
+      "sorry, I don't know how to store this sort of reg in the stack\n");
+}
+
+void IA64RegisterInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator MI,
+                                            unsigned DestReg, int FrameIdx,
+                                            const TargetRegisterClass *RC)const{
+
+  if (RC == IA64::FPRegisterClass) {
+    BuildMI(MBB, MI, TII.get(IA64::LDF_FILL), DestReg).addFrameIndex(FrameIdx);
+  } else if (RC == IA64::GRRegisterClass) {
+    BuildMI(MBB, MI, TII.get(IA64::LD8), DestReg).addFrameIndex(FrameIdx);
+ } else if (RC == IA64::PRRegisterClass) {
+   // first we load a byte from the stack into r2, our 'predicate hackery'
+   // scratch reg
+   BuildMI(MBB, MI, TII.get(IA64::LD8), IA64::r2).addFrameIndex(FrameIdx);
+   // then we compare it to zero. If it _is_ zero, compare-not-equal to
+   // r0 gives us 0, which is what we want, so that's nice.
+   BuildMI(MBB, MI, TII.get(IA64::CMPNE), DestReg).addReg(IA64::r2).addReg(IA64::r0);
+ } else assert(0 &&
+     "sorry, I don't know how to load this sort of reg from the stack\n");
+}
+
+void IA64RegisterInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned DestReg, unsigned SrcReg,
+                                   const TargetRegisterClass *RC) const {
+
+  if (RC == IA64::PRRegisterClass) // if a bool, we use pseudocode
+    // (SrcReg) DestReg = cmp.eq.unc(r0, r0)
+    BuildMI(MBB, MI, TII.get(IA64::PCMPEQUNC), DestReg)
+      .addReg(IA64::r0).addReg(IA64::r0).addReg(SrcReg);
+  else // otherwise, MOV works (for both gen. regs and FP regs)
+    BuildMI(MBB, MI, TII.get(IA64::MOV), DestReg).addReg(SrcReg);
+}
+
+void IA64RegisterInfo::reMaterialize(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I,
+                                     unsigned DestReg,
+                                     const MachineInstr *Orig) const {
+  MachineInstr *MI = Orig->clone();
+  MI->getOperand(0).setReg(DestReg);
+  MBB.insert(I, MI);
+}
+
+const unsigned* IA64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+                                                                         const {
+  static const unsigned CalleeSavedRegs[] = {
+    IA64::r5,  0
+  };
+  return CalleeSavedRegs;
+}
+
+const TargetRegisterClass* const*
+IA64RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+    &IA64::GRRegClass,  0
+  };
+  return CalleeSavedRegClasses;
+}
+
+BitVector IA64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  Reserved.set(IA64::r0);
+  Reserved.set(IA64::r1);
+  Reserved.set(IA64::r2);
+  Reserved.set(IA64::r5);
+  Reserved.set(IA64::r12);
+  Reserved.set(IA64::r13);
+  Reserved.set(IA64::r22);
+  Reserved.set(IA64::rp);
+  return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+//
+bool IA64RegisterInfo::hasFP(const MachineFunction &MF) const {
+  return NoFramePointerElim || MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+void IA64RegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (hasFP(MF)) {
+    // If we have a frame pointer, turn the adjcallstackdown instruction into a
+    // 'sub SP, <amt>' and the adjcallstackup instruction into an 'add SP,
+    // <amt>'.
+    MachineInstr *Old = I;
+    unsigned Amount = Old->getOperand(0).getImmedValue();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
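+      // e.g. with Align=16, an Amount of 40 rounds up to 48.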
+
+      MachineInstr *New;
+      if (Old->getOpcode() == IA64::ADJUSTCALLSTACKDOWN) {
+        New=BuildMI(TII.get(IA64::ADDIMM22), IA64::r12).addReg(IA64::r12)
+          .addImm(-Amount);
+      } else {
+        assert(Old->getOpcode() == IA64::ADJUSTCALLSTACKUP);
+        New=BuildMI(TII.get(IA64::ADDIMM22), IA64::r12).addReg(IA64::r12)
+          .addImm(Amount);
+      }
+
+      // Replace the pseudo instruction with a new instruction...
+      MBB.insert(I, New);
+    }
+  }
+
+  MBB.erase(I);
+}
+
+void IA64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                           int SPAdj, RegScavenger *RS)const{
+  assert(SPAdj == 0 && "Unexpected");
+
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+
+  bool FP = hasFP(MF);
+
+  while (!MI.getOperand(i).isFrameIndex()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  int FrameIndex = MI.getOperand(i).getFrameIndex();
+
+  // choose a base register: ( hasFP? framepointer : stack pointer )
+  unsigned BaseRegister = FP ? IA64::r5 : IA64::r12;
+  // Add the base register
+  MI.getOperand(i).ChangeToRegister(BaseRegister, false);
+
+  // Now add the frame object offset to the offset from the base register.
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+
+  // If we're not using a Frame Pointer that has been set to the value of the
+  // SP before having the stack size subtracted from it, then add the stack size
+  // to Offset to get the correct offset.
+  Offset += MF.getFrameInfo()->getStackSize();
+
+  // XXX: we use 'r22' as another hack+slash temporary register here :(
+  if (Offset <= 8191 && Offset >= -8192) { // smallish offset
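+    // (8191/-8192 is the 14-bit signed immediate range, cf. immSExt14 in
+    // IA64InstrInfo.td)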
+    // Fix up the old:
+    MI.getOperand(i).ChangeToRegister(IA64::r22, false);
+    //insert the new
+    MachineInstr* nMI=BuildMI(TII.get(IA64::ADDIMM22), IA64::r22)
+      .addReg(BaseRegister).addImm(Offset);
+    MBB.insert(II, nMI);
+  } else { // it's big
+    //fix up the old:
+    MI.getOperand(i).ChangeToRegister(IA64::r22, false);
+    MachineInstr* nMI;
+    nMI=BuildMI(TII.get(IA64::MOVLIMM64), IA64::r22).addImm(Offset);
+    MBB.insert(II, nMI);
+    nMI=BuildMI(TII.get(IA64::ADD), IA64::r22).addReg(BaseRegister)
+      .addReg(IA64::r22);
+    MBB.insert(II, nMI);
+  }
+
+}
+
+void IA64RegisterInfo::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineInstr *MI;
+  bool FP = hasFP(MF);
+
+  // first, we handle the 'alloc' instruction, that should be right up the
+  // top of any function
+  static const unsigned RegsInOrder[96] = { // there are 96 GPRs the
+                                            // RSE worries about
+        IA64::r32, IA64::r33, IA64::r34, IA64::r35,
+        IA64::r36, IA64::r37, IA64::r38, IA64::r39, IA64::r40, IA64::r41,
+        IA64::r42, IA64::r43, IA64::r44, IA64::r45, IA64::r46, IA64::r47,
+        IA64::r48, IA64::r49, IA64::r50, IA64::r51, IA64::r52, IA64::r53,
+        IA64::r54, IA64::r55, IA64::r56, IA64::r57, IA64::r58, IA64::r59,
+        IA64::r60, IA64::r61, IA64::r62, IA64::r63, IA64::r64, IA64::r65,
+        IA64::r66, IA64::r67, IA64::r68, IA64::r69, IA64::r70, IA64::r71,
+        IA64::r72, IA64::r73, IA64::r74, IA64::r75, IA64::r76, IA64::r77,
+        IA64::r78, IA64::r79, IA64::r80, IA64::r81, IA64::r82, IA64::r83,
+        IA64::r84, IA64::r85, IA64::r86, IA64::r87, IA64::r88, IA64::r89,
+        IA64::r90, IA64::r91, IA64::r92, IA64::r93, IA64::r94, IA64::r95,
+        IA64::r96, IA64::r97, IA64::r98, IA64::r99, IA64::r100, IA64::r101,
+        IA64::r102, IA64::r103, IA64::r104, IA64::r105, IA64::r106, IA64::r107,
+        IA64::r108, IA64::r109, IA64::r110, IA64::r111, IA64::r112, IA64::r113,
+        IA64::r114, IA64::r115, IA64::r116, IA64::r117, IA64::r118, IA64::r119,
+        IA64::r120, IA64::r121, IA64::r122, IA64::r123, IA64::r124, IA64::r125,
+        IA64::r126, IA64::r127 };
+
+  unsigned numStackedGPRsUsed=0;
+  for(int i=0; i<96; i++) {
+    if(MF.isPhysRegUsed(RegsInOrder[i]))
+      numStackedGPRsUsed=i+1; // i+1, not ++: the highest-numbered used stacked
+                              // GPR sets the count (consider fn(fp, fp, int))
+  }
+
+  unsigned numOutRegsUsed=MF.getInfo<IA64FunctionInfo>()->outRegsUsed;
+
+  // XXX FIXME : this code should be a bit more reliable (in case there _isn't_
+  // a pseudo_alloc in the MBB)
+  unsigned dstRegOfPseudoAlloc;
+  for(MBBI = MBB.begin(); /*MBBI->getOpcode() != IA64::PSEUDO_ALLOC*/; ++MBBI) {
+    assert(MBBI != MBB.end());
+    if(MBBI->getOpcode() == IA64::PSEUDO_ALLOC) {
+      dstRegOfPseudoAlloc=MBBI->getOperand(0).getReg();
+      break;
+    }
+  }
+
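+  // Emit: alloc <dst> = ar.pfs, 0, numStackedGPRsUsed, numOutRegsUsed, 0
+  // (inputs, locals, outputs, rotating - matching the ALLOC operand order
+  // in IA64InstrInfo.td).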
+  MI=BuildMI(TII.get(IA64::ALLOC)).addReg(dstRegOfPseudoAlloc).addImm(0)
+     .addImm(numStackedGPRsUsed).addImm(numOutRegsUsed).addImm(0);
+  MBB.insert(MBBI, MI);
+
+  // Get the number of bytes to allocate from the FrameInfo
+  unsigned NumBytes = MFI->getStackSize();
+
+  if(FP)
+    NumBytes += 8; // reserve space for the old FP
+
+  // Do we need to allocate space on the stack?
+  if (NumBytes == 0)
+    return;
+
+  // Add 16 bytes at the bottom of the stack (scratch area)
+  // and round the size to a multiple of the alignment.
+  unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+  unsigned Size = 16 + (FP ? 8 : 0);
+  NumBytes = (NumBytes+Size+Align-1)/Align*Align;
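+  // e.g. NumBytes=40, FP=false, Align=16:  (40+16+15)/16*16 = 64.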
+
+  // Update frame info to pretend that this is part of the stack...
+  MFI->setStackSize(NumBytes);
+
+  // adjust stack pointer: r12 -= numbytes
+  if (NumBytes <= 8191) {
+    MI=BuildMI(TII.get(IA64::ADDIMM22),IA64::r12).addReg(IA64::r12).
+      addImm(-NumBytes);
+    MBB.insert(MBBI, MI);
+  } else { // we use r22 as a scratch register here
+    MI=BuildMI(TII.get(IA64::MOVLIMM64), IA64::r22).addImm(-NumBytes);
+    // FIXME: MOVLSI32 expects a _u_32imm
+    MBB.insert(MBBI, MI);  // first load the decrement into r22
+    MI=BuildMI(TII.get(IA64::ADD), IA64::r12).addReg(IA64::r12).addReg(IA64::r22);
+    MBB.insert(MBBI, MI);  // then add (subtract) it to r12 (stack ptr)
+  }
+
+  // now if we need to, save the old FP and set the new
+  if (FP) {
+    MI = BuildMI(TII.get(IA64::ST8)).addReg(IA64::r12).addReg(IA64::r5);
+    MBB.insert(MBBI, MI);
+    // this must be the last instruction in the prologue?  (XXX: why??)
+    MI = BuildMI(TII.get(IA64::MOV), IA64::r5).addReg(IA64::r12);
+    MBB.insert(MBBI, MI);
+  }
+
+}
+
+void IA64RegisterInfo::emitEpilogue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  MachineInstr *MI;
+  assert(MBBI->getOpcode() == IA64::RET &&
+         "Can only insert epilog into returning blocks");
+
+  bool FP = hasFP(MF);
+
+  // Get the number of bytes allocated from the FrameInfo...
+  unsigned NumBytes = MFI->getStackSize();
+
+  // now, if we need to, restore the old FP
+  if (FP)
+  {
+    // copy the FP into the SP (discards allocas)
+    MI=BuildMI(TII.get(IA64::MOV), IA64::r12).addReg(IA64::r5);
+    MBB.insert(MBBI, MI);
+    // restore the FP
+    MI=BuildMI(TII.get(IA64::LD8), IA64::r5).addReg(IA64::r5);
+    MBB.insert(MBBI, MI);
+  }
+
+  if (NumBytes != 0)
+  {
+    if (NumBytes <= 8191) {
+      MI=BuildMI(TII.get(IA64::ADDIMM22),IA64::r12).addReg(IA64::r12).
+        addImm(NumBytes);
+      MBB.insert(MBBI, MI);
+    } else {
+      MI=BuildMI(TII.get(IA64::MOVLIMM64), IA64::r22).addImm(NumBytes);
+      MBB.insert(MBBI, MI);
+      MI=BuildMI(TII.get(IA64::ADD), IA64::r12).addReg(IA64::r12).
+        addReg(IA64::r22);
+      MBB.insert(MBBI, MI);
+    }
+  }
+
+}
+
+unsigned IA64RegisterInfo::getRARegister() const {
+  assert(0 && "What is the return address register");
+  return 0;
+}
+
+unsigned IA64RegisterInfo::getFrameRegister(MachineFunction &MF) const {
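+  // r5 serves as the frame pointer when one is needed; otherwise frame
+  // objects are addressed directly off the stack pointer, r12.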
+  return hasFP(MF) ? IA64::r5 : IA64::r12;
+}
+
+unsigned IA64RegisterInfo::getEHExceptionRegister() const {
+  assert(0 && "What is the exception register");
+  return 0;
+}
+
+unsigned IA64RegisterInfo::getEHHandlerRegister() const {
+  assert(0 && "What is the exception handler register");
+  return 0;
+}
+
+#include "IA64GenRegisterInfo.inc"
+
diff --git a/lib/Target/IA64/IA64RegisterInfo.h b/lib/Target/IA64/IA64RegisterInfo.h
new file mode 100644
index 0000000..162ad5a
--- /dev/null
+++ b/lib/Target/IA64/IA64RegisterInfo.h
@@ -0,0 +1,81 @@
+//===- IA64RegisterInfo.h - IA64 Register Information Impl ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the IA64 implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IA64REGISTERINFO_H
+#define IA64REGISTERINFO_H
+
+#include "llvm/Target/MRegisterInfo.h"
+#include "IA64GenRegisterInfo.h.inc"
+
+namespace llvm { class Type; }
+
+namespace llvm {
+
+class TargetInstrInfo;
+
+struct IA64RegisterInfo : public IA64GenRegisterInfo {
+  const TargetInstrInfo &TII;
+
+  IA64RegisterInfo(const TargetInstrInfo &tii);
+
+  /// Code Generation virtual methods...
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, int FrameIndex,
+                           const TargetRegisterClass *RC) const;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC) const;
+
+  void copyRegToReg(MachineBasicBlock &MBB,
+                    MachineBasicBlock::iterator MI,
+                    unsigned DestReg, unsigned SrcReg,
+                    const TargetRegisterClass *RC) const;
+
+  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                     unsigned DestReg, const MachineInstr *Orig) const;
+
+  const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+  const TargetRegisterClass* const* getCalleeSavedRegClasses(
+                                     const MachineFunction *MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI) const;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                           int SPAdj, RegScavenger *RS = NULL) const;
+
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(MachineFunction &MF) const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+};
+
+} // End llvm namespace
+
+#endif
+
diff --git a/lib/Target/IA64/IA64RegisterInfo.td b/lib/Target/IA64/IA64RegisterInfo.td
new file mode 100644
index 0000000..087c18f
--- /dev/null
+++ b/lib/Target/IA64/IA64RegisterInfo.td
@@ -0,0 +1,508 @@
+//===- IA64RegisterInfo.td - Describe the IA64 Register File ----*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the IA64 register file, defining the registers
+// themselves, aliases between the registers, and the register classes built
+// out of the registers.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Register definitions...
+//
+
+class IA64Register<string n> : Register<n> {
+    let Namespace = "IA64";
+}
+
+// GR - One of 128 64-bit general registers
+class GR<bits<7> num, string n> : IA64Register<n> {
+    field bits<7> Num = num;
+}
+
+// FP - One of 128 82-bit floating-point registers
+class FP<bits<7> num, string n> : IA64Register<n> {
+    field bits<7> Num = num;
+}
+
+// PR - One of 64 1-bit predicate registers
+class PR<bits<6> num, string n> : IA64Register<n> {
+    field bits<6> Num = num;
+}
+
+/* general registers */
+def r0 : GR< 0, "r0">, DwarfRegNum<0>;
+def r1 : GR< 1, "r1">, DwarfRegNum<1>;
+def r2 : GR< 2, "r2">, DwarfRegNum<2>;
+def r3 : GR< 3, "r3">, DwarfRegNum<3>;
+def r4 : GR< 4, "r4">, DwarfRegNum<4>;
+def r5 : GR< 5, "r5">, DwarfRegNum<5>;
+def r6 : GR< 6, "r6">, DwarfRegNum<6>;
+def r7 : GR< 7, "r7">, DwarfRegNum<7>;
+def r8 : GR< 8, "r8">, DwarfRegNum<8>;
+def r9 : GR< 9, "r9">, DwarfRegNum<9>;
+def r10 : GR< 10, "r10">, DwarfRegNum<10>;
+def r11 : GR< 11, "r11">, DwarfRegNum<11>;
+def r12 : GR< 12, "r12">, DwarfRegNum<12>;
+def r13 : GR< 13, "r13">, DwarfRegNum<13>;
+def r14 : GR< 14, "r14">, DwarfRegNum<14>;
+def r15 : GR< 15, "r15">, DwarfRegNum<15>;
+def r16 : GR< 16, "r16">, DwarfRegNum<16>;
+def r17 : GR< 17, "r17">, DwarfRegNum<17>;
+def r18 : GR< 18, "r18">, DwarfRegNum<18>;
+def r19 : GR< 19, "r19">, DwarfRegNum<19>;
+def r20 : GR< 20, "r20">, DwarfRegNum<20>;
+def r21 : GR< 21, "r21">, DwarfRegNum<21>;
+def r22 : GR< 22, "r22">, DwarfRegNum<22>;
+def r23 : GR< 23, "r23">, DwarfRegNum<23>;
+def r24 : GR< 24, "r24">, DwarfRegNum<24>;
+def r25 : GR< 25, "r25">, DwarfRegNum<25>;
+def r26 : GR< 26, "r26">, DwarfRegNum<26>;
+def r27 : GR< 27, "r27">, DwarfRegNum<27>;
+def r28 : GR< 28, "r28">, DwarfRegNum<28>;
+def r29 : GR< 29, "r29">, DwarfRegNum<29>;
+def r30 : GR< 30, "r30">, DwarfRegNum<30>;
+def r31 : GR< 31, "r31">, DwarfRegNum<31>;
+def r32 : GR< 32, "r32">, DwarfRegNum<32>;
+def r33 : GR< 33, "r33">, DwarfRegNum<33>;
+def r34 : GR< 34, "r34">, DwarfRegNum<34>;
+def r35 : GR< 35, "r35">, DwarfRegNum<35>;
+def r36 : GR< 36, "r36">, DwarfRegNum<36>;
+def r37 : GR< 37, "r37">, DwarfRegNum<37>;
+def r38 : GR< 38, "r38">, DwarfRegNum<38>;
+def r39 : GR< 39, "r39">, DwarfRegNum<39>;
+def r40 : GR< 40, "r40">, DwarfRegNum<40>;
+def r41 : GR< 41, "r41">, DwarfRegNum<41>;
+def r42 : GR< 42, "r42">, DwarfRegNum<42>;
+def r43 : GR< 43, "r43">, DwarfRegNum<43>;
+def r44 : GR< 44, "r44">, DwarfRegNum<44>;
+def r45 : GR< 45, "r45">, DwarfRegNum<45>;
+def r46 : GR< 46, "r46">, DwarfRegNum<46>;
+def r47 : GR< 47, "r47">, DwarfRegNum<47>;
+def r48 : GR< 48, "r48">, DwarfRegNum<48>;
+def r49 : GR< 49, "r49">, DwarfRegNum<49>;
+def r50 : GR< 50, "r50">, DwarfRegNum<50>;
+def r51 : GR< 51, "r51">, DwarfRegNum<51>;
+def r52 : GR< 52, "r52">, DwarfRegNum<52>;
+def r53 : GR< 53, "r53">, DwarfRegNum<53>;
+def r54 : GR< 54, "r54">, DwarfRegNum<54>;
+def r55 : GR< 55, "r55">, DwarfRegNum<55>;
+def r56 : GR< 56, "r56">, DwarfRegNum<56>;
+def r57 : GR< 57, "r57">, DwarfRegNum<57>;
+def r58 : GR< 58, "r58">, DwarfRegNum<58>;
+def r59 : GR< 59, "r59">, DwarfRegNum<59>;
+def r60 : GR< 60, "r60">, DwarfRegNum<60>;
+def r61 : GR< 61, "r61">, DwarfRegNum<61>;
+def r62 : GR< 62, "r62">, DwarfRegNum<62>;
+def r63 : GR< 63, "r63">, DwarfRegNum<63>;
+def r64 : GR< 64, "r64">, DwarfRegNum<64>;
+def r65 : GR< 65, "r65">, DwarfRegNum<65>;
+def r66 : GR< 66, "r66">, DwarfRegNum<66>;
+def r67 : GR< 67, "r67">, DwarfRegNum<67>;
+def r68 : GR< 68, "r68">, DwarfRegNum<68>;
+def r69 : GR< 69, "r69">, DwarfRegNum<69>;
+def r70 : GR< 70, "r70">, DwarfRegNum<70>;
+def r71 : GR< 71, "r71">, DwarfRegNum<71>;
+def r72 : GR< 72, "r72">, DwarfRegNum<72>;
+def r73 : GR< 73, "r73">, DwarfRegNum<73>;
+def r74 : GR< 74, "r74">, DwarfRegNum<74>;
+def r75 : GR< 75, "r75">, DwarfRegNum<75>;
+def r76 : GR< 76, "r76">, DwarfRegNum<76>;
+def r77 : GR< 77, "r77">, DwarfRegNum<77>;
+def r78 : GR< 78, "r78">, DwarfRegNum<78>;
+def r79 : GR< 79, "r79">, DwarfRegNum<79>;
+def r80 : GR< 80, "r80">, DwarfRegNum<80>;
+def r81 : GR< 81, "r81">, DwarfRegNum<81>;
+def r82 : GR< 82, "r82">, DwarfRegNum<82>;
+def r83 : GR< 83, "r83">, DwarfRegNum<83>;
+def r84 : GR< 84, "r84">, DwarfRegNum<84>;
+def r85 : GR< 85, "r85">, DwarfRegNum<85>;
+def r86 : GR< 86, "r86">, DwarfRegNum<86>;
+def r87 : GR< 87, "r87">, DwarfRegNum<87>;
+def r88 : GR< 88, "r88">, DwarfRegNum<88>;
+def r89 : GR< 89, "r89">, DwarfRegNum<89>;
+def r90 : GR< 90, "r90">, DwarfRegNum<90>;
+def r91 : GR< 91, "r91">, DwarfRegNum<91>;
+def r92 : GR< 92, "r92">, DwarfRegNum<92>;
+def r93 : GR< 93, "r93">, DwarfRegNum<93>;
+def r94 : GR< 94, "r94">, DwarfRegNum<94>;
+def r95 : GR< 95, "r95">, DwarfRegNum<95>;
+def r96 : GR< 96, "r96">, DwarfRegNum<96>;
+def r97 : GR< 97, "r97">, DwarfRegNum<97>;
+def r98 : GR< 98, "r98">, DwarfRegNum<98>;
+def r99 : GR< 99, "r99">, DwarfRegNum<99>;
+def r100 : GR< 100, "r100">, DwarfRegNum<100>;
+def r101 : GR< 101, "r101">, DwarfRegNum<101>;
+def r102 : GR< 102, "r102">, DwarfRegNum<102>;
+def r103 : GR< 103, "r103">, DwarfRegNum<103>;
+def r104 : GR< 104, "r104">, DwarfRegNum<104>;
+def r105 : GR< 105, "r105">, DwarfRegNum<105>;
+def r106 : GR< 106, "r106">, DwarfRegNum<106>;
+def r107 : GR< 107, "r107">, DwarfRegNum<107>;
+def r108 : GR< 108, "r108">, DwarfRegNum<108>;
+def r109 : GR< 109, "r109">, DwarfRegNum<109>;
+def r110 : GR< 110, "r110">, DwarfRegNum<110>;
+def r111 : GR< 111, "r111">, DwarfRegNum<111>;
+def r112 : GR< 112, "r112">, DwarfRegNum<112>;
+def r113 : GR< 113, "r113">, DwarfRegNum<113>;
+def r114 : GR< 114, "r114">, DwarfRegNum<114>;
+def r115 : GR< 115, "r115">, DwarfRegNum<115>;
+def r116 : GR< 116, "r116">, DwarfRegNum<116>;
+def r117 : GR< 117, "r117">, DwarfRegNum<117>;
+def r118 : GR< 118, "r118">, DwarfRegNum<118>;
+def r119 : GR< 119, "r119">, DwarfRegNum<119>;
+def r120 : GR< 120, "r120">, DwarfRegNum<120>;
+def r121 : GR< 121, "r121">, DwarfRegNum<121>;
+def r122 : GR< 122, "r122">, DwarfRegNum<122>;
+def r123 : GR< 123, "r123">, DwarfRegNum<123>;
+def r124 : GR< 124, "r124">, DwarfRegNum<124>;
+def r125 : GR< 125, "r125">, DwarfRegNum<125>;
+def r126 : GR< 126, "r126">, DwarfRegNum<126>;
+def r127 : GR< 127, "r127">, DwarfRegNum<127>;
+
+/* floating-point registers */
+def F0 : FP< 0, "f0">, DwarfRegNum<128>;
+def F1 : FP< 1, "f1">, DwarfRegNum<129>;
+def F2 : FP< 2, "f2">, DwarfRegNum<130>;
+def F3 : FP< 3, "f3">, DwarfRegNum<131>;
+def F4 : FP< 4, "f4">, DwarfRegNum<132>;
+def F5 : FP< 5, "f5">, DwarfRegNum<133>;
+def F6 : FP< 6, "f6">, DwarfRegNum<134>;
+def F7 : FP< 7, "f7">, DwarfRegNum<135>;
+def F8 : FP< 8, "f8">, DwarfRegNum<136>;
+def F9 : FP< 9, "f9">, DwarfRegNum<137>;
+def F10 : FP< 10, "f10">, DwarfRegNum<138>;
+def F11 : FP< 11, "f11">, DwarfRegNum<139>;
+def F12 : FP< 12, "f12">, DwarfRegNum<140>;
+def F13 : FP< 13, "f13">, DwarfRegNum<141>;
+def F14 : FP< 14, "f14">, DwarfRegNum<142>;
+def F15 : FP< 15, "f15">, DwarfRegNum<143>;
+def F16 : FP< 16, "f16">, DwarfRegNum<144>;
+def F17 : FP< 17, "f17">, DwarfRegNum<145>;
+def F18 : FP< 18, "f18">, DwarfRegNum<146>;
+def F19 : FP< 19, "f19">, DwarfRegNum<147>;
+def F20 : FP< 20, "f20">, DwarfRegNum<148>;
+def F21 : FP< 21, "f21">, DwarfRegNum<149>;
+def F22 : FP< 22, "f22">, DwarfRegNum<150>;
+def F23 : FP< 23, "f23">, DwarfRegNum<151>;
+def F24 : FP< 24, "f24">, DwarfRegNum<152>;
+def F25 : FP< 25, "f25">, DwarfRegNum<153>;
+def F26 : FP< 26, "f26">, DwarfRegNum<154>;
+def F27 : FP< 27, "f27">, DwarfRegNum<155>;
+def F28 : FP< 28, "f28">, DwarfRegNum<156>;
+def F29 : FP< 29, "f29">, DwarfRegNum<157>;
+def F30 : FP< 30, "f30">, DwarfRegNum<158>;
+def F31 : FP< 31, "f31">, DwarfRegNum<159>;
+def F32 : FP< 32, "f32">, DwarfRegNum<160>;
+def F33 : FP< 33, "f33">, DwarfRegNum<161>;
+def F34 : FP< 34, "f34">, DwarfRegNum<162>;
+def F35 : FP< 35, "f35">, DwarfRegNum<163>;
+def F36 : FP< 36, "f36">, DwarfRegNum<164>;
+def F37 : FP< 37, "f37">, DwarfRegNum<165>;
+def F38 : FP< 38, "f38">, DwarfRegNum<166>;
+def F39 : FP< 39, "f39">, DwarfRegNum<167>;
+def F40 : FP< 40, "f40">, DwarfRegNum<168>;
+def F41 : FP< 41, "f41">, DwarfRegNum<169>;
+def F42 : FP< 42, "f42">, DwarfRegNum<170>;
+def F43 : FP< 43, "f43">, DwarfRegNum<171>;
+def F44 : FP< 44, "f44">, DwarfRegNum<172>;
+def F45 : FP< 45, "f45">, DwarfRegNum<173>;
+def F46 : FP< 46, "f46">, DwarfRegNum<174>;
+def F47 : FP< 47, "f47">, DwarfRegNum<175>;
+def F48 : FP< 48, "f48">, DwarfRegNum<176>;
+def F49 : FP< 49, "f49">, DwarfRegNum<177>;
+def F50 : FP< 50, "f50">, DwarfRegNum<178>;
+def F51 : FP< 51, "f51">, DwarfRegNum<179>;
+def F52 : FP< 52, "f52">, DwarfRegNum<180>;
+def F53 : FP< 53, "f53">, DwarfRegNum<181>;
+def F54 : FP< 54, "f54">, DwarfRegNum<182>;
+def F55 : FP< 55, "f55">, DwarfRegNum<183>;
+def F56 : FP< 56, "f56">, DwarfRegNum<184>;
+def F57 : FP< 57, "f57">, DwarfRegNum<185>;
+def F58 : FP< 58, "f58">, DwarfRegNum<186>;
+def F59 : FP< 59, "f59">, DwarfRegNum<187>;
+def F60 : FP< 60, "f60">, DwarfRegNum<188>;
+def F61 : FP< 61, "f61">, DwarfRegNum<189>;
+def F62 : FP< 62, "f62">, DwarfRegNum<190>;
+def F63 : FP< 63, "f63">, DwarfRegNum<191>;
+def F64 : FP< 64, "f64">, DwarfRegNum<192>;
+def F65 : FP< 65, "f65">, DwarfRegNum<193>;
+def F66 : FP< 66, "f66">, DwarfRegNum<194>;
+def F67 : FP< 67, "f67">, DwarfRegNum<195>;
+def F68 : FP< 68, "f68">, DwarfRegNum<196>;
+def F69 : FP< 69, "f69">, DwarfRegNum<197>;
+def F70 : FP< 70, "f70">, DwarfRegNum<198>;
+def F71 : FP< 71, "f71">, DwarfRegNum<199>;
+def F72 : FP< 72, "f72">, DwarfRegNum<200>;
+def F73 : FP< 73, "f73">, DwarfRegNum<201>;
+def F74 : FP< 74, "f74">, DwarfRegNum<202>;
+def F75 : FP< 75, "f75">, DwarfRegNum<203>;
+def F76 : FP< 76, "f76">, DwarfRegNum<204>;
+def F77 : FP< 77, "f77">, DwarfRegNum<205>;
+def F78 : FP< 78, "f78">, DwarfRegNum<206>;
+def F79 : FP< 79, "f79">, DwarfRegNum<207>;
+def F80 : FP< 80, "f80">, DwarfRegNum<208>;
+def F81 : FP< 81, "f81">, DwarfRegNum<209>;
+def F82 : FP< 82, "f82">, DwarfRegNum<210>;
+def F83 : FP< 83, "f83">, DwarfRegNum<211>;
+def F84 : FP< 84, "f84">, DwarfRegNum<212>;
+def F85 : FP< 85, "f85">, DwarfRegNum<213>;
+def F86 : FP< 86, "f86">, DwarfRegNum<214>;
+def F87 : FP< 87, "f87">, DwarfRegNum<215>;
+def F88 : FP< 88, "f88">, DwarfRegNum<216>;
+def F89 : FP< 89, "f89">, DwarfRegNum<217>;
+def F90 : FP< 90, "f90">, DwarfRegNum<218>;
+def F91 : FP< 91, "f91">, DwarfRegNum<219>;
+def F92 : FP< 92, "f92">, DwarfRegNum<220>;
+def F93 : FP< 93, "f93">, DwarfRegNum<221>;
+def F94 : FP< 94, "f94">, DwarfRegNum<222>;
+def F95 : FP< 95, "f95">, DwarfRegNum<223>;
+def F96 : FP< 96, "f96">, DwarfRegNum<224>;
+def F97 : FP< 97, "f97">, DwarfRegNum<225>;
+def F98 : FP< 98, "f98">, DwarfRegNum<226>;
+def F99 : FP< 99, "f99">, DwarfRegNum<227>;
+def F100 : FP< 100, "f100">, DwarfRegNum<228>;
+def F101 : FP< 101, "f101">, DwarfRegNum<229>;
+def F102 : FP< 102, "f102">, DwarfRegNum<230>;
+def F103 : FP< 103, "f103">, DwarfRegNum<231>;
+def F104 : FP< 104, "f104">, DwarfRegNum<232>;
+def F105 : FP< 105, "f105">, DwarfRegNum<233>;
+def F106 : FP< 106, "f106">, DwarfRegNum<234>;
+def F107 : FP< 107, "f107">, DwarfRegNum<235>;
+def F108 : FP< 108, "f108">, DwarfRegNum<236>;
+def F109 : FP< 109, "f109">, DwarfRegNum<237>;
+def F110 : FP< 110, "f110">, DwarfRegNum<238>;
+def F111 : FP< 111, "f111">, DwarfRegNum<239>;
+def F112 : FP< 112, "f112">, DwarfRegNum<240>;
+def F113 : FP< 113, "f113">, DwarfRegNum<241>;
+def F114 : FP< 114, "f114">, DwarfRegNum<242>;
+def F115 : FP< 115, "f115">, DwarfRegNum<243>;
+def F116 : FP< 116, "f116">, DwarfRegNum<244>;
+def F117 : FP< 117, "f117">, DwarfRegNum<245>;
+def F118 : FP< 118, "f118">, DwarfRegNum<246>;
+def F119 : FP< 119, "f119">, DwarfRegNum<247>;
+def F120 : FP< 120, "f120">, DwarfRegNum<248>;
+def F121 : FP< 121, "f121">, DwarfRegNum<249>;
+def F122 : FP< 122, "f122">, DwarfRegNum<250>;
+def F123 : FP< 123, "f123">, DwarfRegNum<251>;
+def F124 : FP< 124, "f124">, DwarfRegNum<252>;
+def F125 : FP< 125, "f125">, DwarfRegNum<253>;
+def F126 : FP< 126, "f126">, DwarfRegNum<254>;
+def F127 : FP< 127, "f127">, DwarfRegNum<255>;
+
+/* predicate registers */
+def p0 : PR< 0, "p0">, DwarfRegNum<256>;
+def p1 : PR< 1, "p1">, DwarfRegNum<257>;
+def p2 : PR< 2, "p2">, DwarfRegNum<258>;
+def p3 : PR< 3, "p3">, DwarfRegNum<259>;
+def p4 : PR< 4, "p4">, DwarfRegNum<260>;
+def p5 : PR< 5, "p5">, DwarfRegNum<261>;
+def p6 : PR< 6, "p6">, DwarfRegNum<262>;
+def p7 : PR< 7, "p7">, DwarfRegNum<263>;
+def p8 : PR< 8, "p8">, DwarfRegNum<264>;
+def p9 : PR< 9, "p9">, DwarfRegNum<265>;
+def p10 : PR< 10, "p10">, DwarfRegNum<266>;
+def p11 : PR< 11, "p11">, DwarfRegNum<267>;
+def p12 : PR< 12, "p12">, DwarfRegNum<268>;
+def p13 : PR< 13, "p13">, DwarfRegNum<269>;
+def p14 : PR< 14, "p14">, DwarfRegNum<270>;
+def p15 : PR< 15, "p15">, DwarfRegNum<271>;
+def p16 : PR< 16, "p16">, DwarfRegNum<272>;
+def p17 : PR< 17, "p17">, DwarfRegNum<273>;
+def p18 : PR< 18, "p18">, DwarfRegNum<274>;
+def p19 : PR< 19, "p19">, DwarfRegNum<275>;
+def p20 : PR< 20, "p20">, DwarfRegNum<276>;
+def p21 : PR< 21, "p21">, DwarfRegNum<277>;
+def p22 : PR< 22, "p22">, DwarfRegNum<278>;
+def p23 : PR< 23, "p23">, DwarfRegNum<279>;
+def p24 : PR< 24, "p24">, DwarfRegNum<280>;
+def p25 : PR< 25, "p25">, DwarfRegNum<281>;
+def p26 : PR< 26, "p26">, DwarfRegNum<282>;
+def p27 : PR< 27, "p27">, DwarfRegNum<283>;
+def p28 : PR< 28, "p28">, DwarfRegNum<284>;
+def p29 : PR< 29, "p29">, DwarfRegNum<285>;
+def p30 : PR< 30, "p30">, DwarfRegNum<286>;
+def p31 : PR< 31, "p31">, DwarfRegNum<287>;
+def p32 : PR< 32, "p32">, DwarfRegNum<288>;
+def p33 : PR< 33, "p33">, DwarfRegNum<289>;
+def p34 : PR< 34, "p34">, DwarfRegNum<290>;
+def p35 : PR< 35, "p35">, DwarfRegNum<291>;
+def p36 : PR< 36, "p36">, DwarfRegNum<292>;
+def p37 : PR< 37, "p37">, DwarfRegNum<293>;
+def p38 : PR< 38, "p38">, DwarfRegNum<294>;
+def p39 : PR< 39, "p39">, DwarfRegNum<295>;
+def p40 : PR< 40, "p40">, DwarfRegNum<296>;
+def p41 : PR< 41, "p41">, DwarfRegNum<297>;
+def p42 : PR< 42, "p42">, DwarfRegNum<298>;
+def p43 : PR< 43, "p43">, DwarfRegNum<299>;
+def p44 : PR< 44, "p44">, DwarfRegNum<300>;
+def p45 : PR< 45, "p45">, DwarfRegNum<301>;
+def p46 : PR< 46, "p46">, DwarfRegNum<302>;
+def p47 : PR< 47, "p47">, DwarfRegNum<303>;
+def p48 : PR< 48, "p48">, DwarfRegNum<304>;
+def p49 : PR< 49, "p49">, DwarfRegNum<305>;
+def p50 : PR< 50, "p50">, DwarfRegNum<306>;
+def p51 : PR< 51, "p51">, DwarfRegNum<307>;
+def p52 : PR< 52, "p52">, DwarfRegNum<308>;
+def p53 : PR< 53, "p53">, DwarfRegNum<309>;
+def p54 : PR< 54, "p54">, DwarfRegNum<310>;
+def p55 : PR< 55, "p55">, DwarfRegNum<311>;
+def p56 : PR< 56, "p56">, DwarfRegNum<312>;
+def p57 : PR< 57, "p57">, DwarfRegNum<313>;
+def p58 : PR< 58, "p58">, DwarfRegNum<314>;
+def p59 : PR< 59, "p59">, DwarfRegNum<315>;
+def p60 : PR< 60, "p60">, DwarfRegNum<316>;
+def p61 : PR< 61, "p61">, DwarfRegNum<317>;
+def p62 : PR< 62, "p62">, DwarfRegNum<318>;
+def p63 : PR< 63, "p63">, DwarfRegNum<319>;
+
+// XXX : this is temporary, we'll eventually have the output registers
+// in the general purpose register class too?
+def out0 : GR<0, "out0">, DwarfRegNum<120>;
+def out1 : GR<1, "out1">, DwarfRegNum<121>;
+def out2 : GR<2, "out2">, DwarfRegNum<122>;
+def out3 : GR<3, "out3">, DwarfRegNum<123>;
+def out4 : GR<4, "out4">, DwarfRegNum<124>;
+def out5 : GR<5, "out5">, DwarfRegNum<125>;
+def out6 : GR<6, "out6">, DwarfRegNum<126>;
+def out7 : GR<7, "out7">, DwarfRegNum<127>;
+
+// application (special) registers:
+
+// "previous function state" application register
+def AR_PFS : GR<0, "ar.pfs">, DwarfRegNum<331>;
+
+// "return pointer" (this is really branch register b0)
+def rp : GR<0, "rp">, DwarfRegNum<-1>;
+
+// branch reg 6
+def B6 : GR<0, "b6">, DwarfRegNum<326>;
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes.  The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// these are the scratch (+stacked) general registers
+// FIXME/XXX  we also reserve a frame pointer (r5)
+// FIXME/XXX  we also reserve r2 for spilling/filling predicates
+// in IA64RegisterInfo.cpp
+// FIXME/XXX  we also reserve r22 for calculating addresses
+// in IA64RegisterInfo.cpp
+
+def GR : RegisterClass<"IA64", [i64], 64, 
+       [
+       
+//FIXME!: for both readability and performance, we don't want the out
+//        registers to be the first ones allocated
+
+        out7, out6, out5, out4, out3, out2, out1, out0,
+        r3,  r8,  r9,  r10, r11, r14, r15,
+        r16, r17, r18, r19, r20, r21, r23,
+        r24, r25, r26, r27, r28, r29, r30, r31,
+        r32, r33, r34, r35, r36, r37, r38, r39,
+        r40, r41, r42, r43, r44, r45, r46, r47,
+        r48, r49, r50, r51, r52, r53, r54, r55,
+        r56, r57, r58, r59, r60, r61, r62, r63,
+        r64, r65, r66, r67, r68, r69, r70, r71,
+        r72, r73, r74, r75, r76, r77, r78, r79,
+        r80, r81, r82, r83, r84, r85, r86, r87,
+        r88, r89, r90, r91, r92, r93, r94, r95,
+        r96, r97, r98, r99, r100, r101, r102, r103,
+        r104, r105, r106, r107, r108, r109, r110, r111,
+        r112, r113, r114, r115, r116, r117, r118, r119,
+        r120, r121, r122, r123, r124, r125, r126, r127,
+        r0, r1, r2, r5, r12, r13, r22, rp]> // the last 8 are special (look down)
+  {
+    let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    GRClass::iterator
+    GRClass::allocation_order_begin(const MachineFunction &MF) const {
+	// hide the 8 out? registers appropriately:
+	return begin()+(8-(MF.getInfo<IA64FunctionInfo>()->outRegsUsed));
+      }
+
+      GRClass::iterator
+      GRClass::allocation_order_end(const MachineFunction &MF) const {
+	int numReservedRegs=8; // the 8 special registers r0,r1,r2,r5,r12,r13 etc
+
+	// we also can't allocate registers for use as locals if they're
+	// already required as 'out' registers
+	numReservedRegs+=MF.getInfo<IA64FunctionInfo>()->outRegsUsed;
+	
+	return end()-numReservedRegs; // hide registers appropriately
+      }
+  }];
+}
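+// e.g. with outRegsUsed==2, allocation_order_begin() is begin()+6 (out7..out2
+// are hidden at the front) and allocation_order_end() is end()-10 (the 8
+// special registers plus r127 and r126 are hidden at the back).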
+
+
+// these are the scratch (+stacked) FP registers
+
+def FP : RegisterClass<"IA64", [f64], 64, 
+       [F6, F7, 
+	F8, F9, F10, F11, F12, F13, F14, F15, 
+	F32, F33, F34, F35, F36, F37, F38, F39, 
+	F40, F41, F42, F43, F44, F45, F46, F47, 
+	F48, F49, F50, F51, F52, F53, F54, F55, 
+	F56, F57, F58, F59, F60, F61, F62, F63, 
+	F64, F65, F66, F67, F68, F69, F70, F71, 
+	F72, F73, F74, F75, F76, F77, F78, F79, 
+	F80, F81, F82, F83, F84, F85, F86, F87, 
+	F88, F89, F90, F91, F92, F93, F94, F95, 
+	F96, F97, F98, F99, F100, F101, F102, F103, 
+	F104, F105, F106, F107, F108, F109, F110, F111, 
+	F112, F113, F114, F115, F116, F117, F118, F119, 
+	F120, F121, F122, F123, F124, F125, F126, F127,
+	F0, F1]> // these last two are hidden
+  {
+// the 128s here are to make stf.spill/ldf.fill happy,
+// when storing full (82-bit) FP regs to stack slots
+// we need to 16-byte align
+    let Size=128;
+    let Alignment=128;
+
+    let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    FPClass::iterator
+    FPClass::allocation_order_begin(const MachineFunction &MF) const {
+	return begin(); // we don't hide any FP regs from the start
+      }
+
+      FPClass::iterator
+      FPClass::allocation_order_end(const MachineFunction &MF) const {
+	return end()-2; // we hide regs F0, F1 from the end 
+      }
+  }];
+}
+
+// these are the predicate registers, p0 (1/TRUE) is not here
+def PR : RegisterClass<"IA64", [i1], 64, 
+
+// for now, let's be wimps and only have the scratch predicate regs
+ [p6, p7, p8, p9, p10, p11, p12, p13, p14, p15]> {
+   let Size = 64;
+ }
+
+/*
+ [p1, p2, p3, p4, p5, p6, p7,
+  p8, p9, p10, p11, p12, p13, p14, p15,
+  p16, p17, p18, p19, p20, p21, p22, p23,
+  p24, p25, p26, p27, p28, p29, p30, p31,
+  p32, p33, p34, p35, p36, p37, p38, p39,
+  p40, p41, p42, p43, p44, p45, p46, p47,
+  p48, p49, p50, p51, p52, p53, p54, p55,
+  p56, p57, p58, p59, p60, p61, p62, p63]>;
+  */
diff --git a/lib/Target/IA64/IA64TargetAsmInfo.cpp b/lib/Target/IA64/IA64TargetAsmInfo.cpp
new file mode 100644
index 0000000..1a7e2b2
--- /dev/null
+++ b/lib/Target/IA64/IA64TargetAsmInfo.cpp
@@ -0,0 +1,34 @@
+//===-- IA64TargetAsmInfo.cpp - IA64 asm properties -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the IA64TargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64TargetAsmInfo.h"
+
+using namespace llvm;
+
+IA64TargetAsmInfo::IA64TargetAsmInfo(const IA64TargetMachine &TM) {
+  CommentString = "//";
+  Data8bitsDirective = "\tdata1\t";     // FIXME: check that we are
+  Data16bitsDirective = "\tdata2.ua\t"; // disabling auto-alignment
+  Data32bitsDirective = "\tdata4.ua\t"; // properly
+  Data64bitsDirective = "\tdata8.ua\t";
+  ZeroDirective = "\t.skip\t";
+  AsciiDirective = "\tstring\t";
+
+  GlobalVarAddrPrefix="";
+  GlobalVarAddrSuffix="";
+  FunctionAddrPrefix="@fptr(";
+  FunctionAddrSuffix=")";
+  
+  // FIXME: would be nice to have rodata (no 'w') when appropriate?
+  ConstantPoolSection = "\n\t.section .data, \"aw\", \"progbits\"\n";
+}
diff --git a/lib/Target/IA64/IA64TargetAsmInfo.h b/lib/Target/IA64/IA64TargetAsmInfo.h
new file mode 100644
index 0000000..681253c
--- /dev/null
+++ b/lib/Target/IA64/IA64TargetAsmInfo.h
@@ -0,0 +1,31 @@
+//=====-- IA64TargetAsmInfo.h - IA64 asm properties -----------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the IA64TargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IA64TARGETASMINFO_H
+#define IA64TARGETASMINFO_H
+
+#include "llvm/Target/TargetAsmInfo.h"
+
+namespace llvm {
+
+  // Forward declaration.
+  class IA64TargetMachine;
+
+  struct IA64TargetAsmInfo : public TargetAsmInfo {
+    IA64TargetAsmInfo(const IA64TargetMachine &TM);
+  };
+
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/IA64/IA64TargetMachine.cpp b/lib/Target/IA64/IA64TargetMachine.cpp
new file mode 100644
index 0000000..51beaa1
--- /dev/null
+++ b/lib/Target/IA64/IA64TargetMachine.cpp
@@ -0,0 +1,91 @@
+//===-- IA64TargetMachine.cpp - Define TargetMachine for IA64 -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IA64 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64TargetAsmInfo.h"
+#include "IA64TargetMachine.h"
+#include "IA64.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+using namespace llvm;
+
+/// IA64TargetMachineModule - Note that this is used on hosts that cannot link
+/// in a library unless there are references into the library.  In particular,
+/// it seems that it is not possible to get things to work on Win32 without
+/// this.  Though it is unused, do not remove it.
+extern "C" int IA64TargetMachineModule;
+int IA64TargetMachineModule = 0;
+
+namespace {
+  RegisterTarget<IA64TargetMachine> X("ia64", "  IA-64 (Itanium)");
+}
+
+const TargetAsmInfo *IA64TargetMachine::createTargetAsmInfo() const {
+  return new IA64TargetAsmInfo(*this);
+}
+
+unsigned IA64TargetMachine::getModuleMatchQuality(const Module &M) {
+  // we match [iI][aA]*64
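+  // e.g. "ia64-unknown-linux-gnu" and "IA64" match; other non-empty triples don't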
+  bool seenIA64=false;
+  std::string TT = M.getTargetTriple();
+
+  if (TT.size() >= 4) {
+    if( (TT[0]=='i' || TT[0]=='I') &&
+        (TT[1]=='a' || TT[1]=='A') ) {
+      for(unsigned int i=2; i<(TT.size()-1); i++)
+        if(TT[i]=='6' && TT[i+1]=='4')
+          seenIA64=true;
+    }
+
+    if (seenIA64)
+      return 20; // strong match
+  }
+  // If the target triple is something non-ia64, we don't match.
+  if (!TT.empty()) return 0;
+
+#if defined(__ia64__) || defined(__IA64__)
+  return 5;
+#else
+  return 0;
+#endif
+}
+
+/// IA64TargetMachine ctor - Create an LP64 architecture model
+///
+IA64TargetMachine::IA64TargetMachine(const Module &M, const std::string &FS)
+  : DataLayout("e"),
+    FrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0),
+    TLInfo(*this) { // FIXME? check this stuff
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool IA64TargetMachine::addInstSelector(FunctionPassManager &PM, bool Fast) {
+  PM.add(createIA64DAGToDAGInstructionSelector(*this));
+  return false;
+}
+
+bool IA64TargetMachine::addPreEmitPass(FunctionPassManager &PM, bool Fast) {
+  // Make sure everything is bundled happily
+  PM.add(createIA64BundlingPass(*this));
+  return true;
+}
+
+bool IA64TargetMachine::addAssemblyEmitter(FunctionPassManager &PM, bool Fast,
+                                           std::ostream &Out) {
+  PM.add(createIA64CodePrinterPass(Out, *this));
+  return false;
+}
+
diff --git a/lib/Target/IA64/IA64TargetMachine.h b/lib/Target/IA64/IA64TargetMachine.h
new file mode 100644
index 0000000..538a330
--- /dev/null
+++ b/lib/Target/IA64/IA64TargetMachine.h
@@ -0,0 +1,60 @@
+//===-- IA64TargetMachine.h - Define TargetMachine for IA64 ---*- C++ -*---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the IA64 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_IA64TARGETMACHINE_H
+#define LLVM_TARGET_IA64TARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "IA64InstrInfo.h"
+#include "IA64ISelLowering.h"
+
+namespace llvm {
+
+class IA64TargetMachine : public LLVMTargetMachine {
+  const TargetData DataLayout;       // Calculates type size & alignment
+  IA64InstrInfo      InstrInfo;
+  TargetFrameInfo    FrameInfo;
+  //IA64JITInfo      JITInfo;
+  IA64TargetLowering TLInfo;
+  
+protected:
+  virtual const TargetAsmInfo *createTargetAsmInfo() const;
+
+public:
+  IA64TargetMachine(const Module &M, const std::string &FS);
+
+  virtual const IA64InstrInfo      *getInstrInfo() const { return &InstrInfo; }
+  virtual const TargetFrameInfo    *getFrameInfo() const { return &FrameInfo; }
+  virtual       IA64TargetLowering *getTargetLowering() const { 
+    return const_cast<IA64TargetLowering*>(&TLInfo);
+  }
+  virtual const MRegisterInfo    *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  virtual const TargetData       *getTargetData() const { return &DataLayout; }
+  
+  static unsigned getModuleMatchQuality(const Module &M);
+
+  // Pass Pipeline Configuration
+  virtual bool addInstSelector(FunctionPassManager &PM, bool Fast);
+  virtual bool addPreEmitPass(FunctionPassManager &PM, bool Fast);
+  virtual bool addAssemblyEmitter(FunctionPassManager &PM, bool Fast, 
+                                  std::ostream &Out);
+};
+} // End llvm namespace
+
+#endif
+
+
diff --git a/lib/Target/IA64/Makefile b/lib/Target/IA64/Makefile
new file mode 100644
index 0000000..f519cf9
--- /dev/null
+++ b/lib/Target/IA64/Makefile
@@ -0,0 +1,18 @@
+##===- lib/Target/IA64/Makefile -----------------------------*- Makefile -*-===##
+#                     The LLVM Compiler Infrastructure
+#
+# This file was developed by Duraid Madina and is distributed under the
+# University of Illinois Open Source License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMIA64
+TARGET = IA64
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = IA64GenRegisterInfo.h.inc IA64GenRegisterNames.inc \
+                IA64GenRegisterInfo.inc IA64GenInstrNames.inc \
+                IA64GenInstrInfo.inc IA64GenAsmWriter.inc \
+		IA64GenDAGISel.inc
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/IA64/README b/lib/Target/IA64/README
new file mode 100644
index 0000000..852d512
--- /dev/null
+++ b/lib/Target/IA64/README
@@ -0,0 +1,106 @@
+*** README for the LLVM IA64 Backend "Version 0.01" - March 18, 2005
+*** Quote for this version:
+
+      "Kaori and Hitomi are naughty!!"
+
+
+Congratulations, you have found:
+
+**************************************************************** 
+* @@@       @@@       @@@  @@@  @@@@@@@@@@                     *
+* @@@       @@@       @@@  @@@  @@@@@@@@@@@                    *
+* @@!       @@!       @@!  @@@  @@! @@! @@!                    *
+* !@!       !@!       !@!  @!@  !@! !@! !@!                    *
+* @!!       @!!       @!@  !@!  @!! !!@ @!@                    *
+* !!!       !!!       !@!  !!!  !@!   ! !@!                    *
+* !!:       !!:       :!:  !!:  !!:     !!:                    *
+*  :!:       :!:       ::!!:!   :!:     :!:                    *
+*  :: ::::   :: ::::    ::::    :::     ::                     *
+* : :: : :  : :: : :     :       :      :                      *
+*                                                              *
+*                                                              *
+*  @@@@@@   @@@  @@@       @@@   @@@@@@     @@@@@@       @@@   *
+* @@@@@@@@  @@@@ @@@       @@@  @@@@@@@@   @@@@@@@      @@@@   *
+* @@!  @@@  @@!@!@@@       @@!  @@!  @@@  !@@          @@!@!   *
+* !@!  @!@  !@!!@!@!       !@!  !@!  @!@  !@!         !@!!@!   *
+* @!@  !@!  @!@ !!@!       !!@  @!@!@!@!  !!@@!@!    @!! @!!   *
+* !@!  !!!  !@!  !!!       !!!  !!!@!!!!  @!!@!!!!  !!!  !@!   *
+* !!:  !!!  !!:  !!!       !!:  !!:  !!!  !:!  !:!  :!!:!:!!:  *
+* :!:  !:!  :!:  !:!       :!:  :!:  !:!  :!:  !:!  !:::!!:::  *
+* ::::: ::   ::   ::        ::  ::   :::  :::: :::       :::   *
+*  : :  :   ::    :        :     :   : :   :: : :        :::   *
+*                                                              *
+****************************************************************
+* Bow down, bow down, before the power of IA64! Or be crushed, *
+* be crushed, by its jolly registers of doom!!                 *
+****************************************************************
+
+DEVELOPMENT PLAN:
+
+   _  you are       2005           maybe 2005       2006            2006 and
+  /   here            |               |              |               beyond 
+ v                    v               v              v                |
+                                                                      v
+CLEAN UP        ADD INSTRUCTION      ADD          PLAY WITH
+INSTRUCTION --> SCHEDULING AND  -->  JIT    -->    DYNAMIC     --> FUTURE WORK
+SELECTION       BUNDLING            SUPPORT     REOPTIMIZATION
+
+DISCLAIMER AND PROMISE:
+
+As of the time of this release, you are probably better off using Intel C/C++
+or GCC. The performance of the code emitted right now is, in a word,
+terrible. Check back in a few months - the story will be different then,
+I guarantee it.
+
+TODO:
+
+  - stop passing FP args in both FP *and* integer regs when not required
+  - allocate low (nonstacked) registers more aggressively
+  - clean up and thoroughly test the isel patterns.
+  - fix stacked register allocation order: (for readability) we don't want
+    the out? registers being the first ones used
+  - fix up floating point
+    (nb http://gcc.gnu.org/wiki?pagename=ia64%20floating%20point )
+  - bundling!
+    (we will avoid the mess that is:
+     http://gcc.gnu.org/ml/gcc/2003-12/msg00832.html )
+  - instruction scheduling (hmmmm! ;)
+  - write truly inspirational documentation
+  - if-conversion (predicate database/knowledge? etc etc)
+  - counted loop support
+  - make integer + FP mul/div more clever (we have fixed pseudocode atm)
+  - track and use comparison complements
+
+INFO:
+
+  - we are strictly LP64 here, no support for ILP32 on HP-UX. Linux users
+    don't need to worry about this.
+  - I have instruction scheduling/bundling pseudocode that really works
+    (it has been tested, albeit at the Perl-script level),
+    so before you go write your own, send me an email!
+
+KNOWN DEFECTS AT THE CURRENT TIME:
+
+  - C++ vtables contain naked function pointers, not function descriptors,
+    which is bad. See http://llvm.cs.uiuc.edu/bugs/show_bug.cgi?id=406
+  - varargs are broken
+  - alloca doesn't work (indeed, stack frame layout is bogus)
+  - no support for big-endian environments
+  - (not really the backend, but...) the CFE has some issues on IA64.
+    these will probably be fixed soon.
+  
+ACKNOWLEDGEMENTS:
+
+  - Chris Lattner (x100)
+  - Other LLVM developers ("hey, that looks familiar")
+
+CONTACT:
+
+  - You can email me at duraid@octopus.com.au. If you find a small bug,
+    just email me. If you find a big bug, please file a bug report
+    in bugzilla! http://llvm.cs.uiuc.edu is your one stop shop for all
+    things LLVM.
+
+
+
+
diff --git a/lib/Target/MRegisterInfo.cpp b/lib/Target/MRegisterInfo.cpp
new file mode 100644
index 0000000..3af611d
--- /dev/null
+++ b/lib/Target/MRegisterInfo.cpp
@@ -0,0 +1,81 @@
+//===- MRegisterInfo.cpp - Target Register Information Implementation -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MRegisterInfo interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/MRegisterInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/ADT/BitVector.h"
+
+using namespace llvm;
+
+MRegisterInfo::MRegisterInfo(const TargetRegisterDesc *D, unsigned NR,
+                             regclass_iterator RCB, regclass_iterator RCE,
+                             int CFSO, int CFDO)
+  : Desc(D), NumRegs(NR), RegClassBegin(RCB), RegClassEnd(RCE) {
+  assert(NumRegs < FirstVirtualRegister &&
+         "Target has too many physical registers!");
+
+  CallFrameSetupOpcode   = CFSO;
+  CallFrameDestroyOpcode = CFDO;
+}
+
+MRegisterInfo::~MRegisterInfo() {}
+
+/// getAllocatableSetForRC - Toggle the bits that represent allocatable
+/// registers for the specific register class.
+static void getAllocatableSetForRC(MachineFunction &MF,
+                                   const TargetRegisterClass *RC, BitVector &R){  
+  for (TargetRegisterClass::iterator I = RC->allocation_order_begin(MF),
+         E = RC->allocation_order_end(MF); I != E; ++I)
+    R.set(*I);
+}
+
+BitVector MRegisterInfo::getAllocatableSet(MachineFunction &MF,
+                                           const TargetRegisterClass *RC) const {
+  BitVector Allocatable(NumRegs);
+  if (RC) {
+    getAllocatableSetForRC(MF, RC, Allocatable);
+    return Allocatable;
+  }
+
+  for (MRegisterInfo::regclass_iterator I = regclass_begin(),
+         E = regclass_end(); I != E; ++I)
+    getAllocatableSetForRC(MF, *I, Allocatable);
+  return Allocatable;
+}
+
+/// getLocation - This method should return the actual location of a frame
+/// variable given the frame index.  The location is returned in ML.
+/// Subclasses should override this method for special handling of frame
+/// variables and then call MRegisterInfo::getLocation for the default action.
+void MRegisterInfo::getLocation(MachineFunction &MF, unsigned Index,
+                        MachineLocation &ML) const {
+  const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ML.set(getFrameRegister(MF),
+         MFI->getObjectOffset(Index) +
+         MFI->getStackSize() -
+         TFI.getOffsetOfLocalArea() +
+         MFI->getOffsetAdjustment());
+}
+
+/// getInitialFrameState - Returns a list of machine moves that are assumed
+/// on entry to a function.
+void
+MRegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves) const {
+  // Default is to do nothing.
+}
+
diff --git a/lib/Target/MSIL/MSILWriter.cpp b/lib/Target/MSIL/MSILWriter.cpp
new file mode 100644
index 0000000..5859adf
--- /dev/null
+++ b/lib/Target/MSIL/MSILWriter.cpp
@@ -0,0 +1,1657 @@
+//===-- MSILWriter.cpp - Library for converting LLVM code to MSIL ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Roman Samoilov and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This library converts LLVM code to MSIL code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSILWriter.h"
+#include "llvm/CallingConv.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/ParameterAttributes.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/Analysis/ConstantsScanner.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/StringExtras.h"
+
+namespace {
+  // TargetMachine for the MSIL 
+  struct VISIBILITY_HIDDEN MSILTarget : public TargetMachine {
+    const TargetData DataLayout;       // Calculates type size & alignment
+
+    MSILTarget(const Module &M, const std::string &FS)
+      : DataLayout(&M) {}
+
+    virtual bool WantsWholeFile() const { return true; }
+    virtual bool addPassesToEmitWholeFile(PassManager &PM, std::ostream &Out,
+                                         CodeGenFileType FileType, bool Fast);
+
+    // This class always works, but shouldn't be the default in most cases.
+    static unsigned getModuleMatchQuality(const Module &M) { return 1; }
+
+    virtual const TargetData *getTargetData() const { return &DataLayout; }
+  };
+}
+
+
+RegisterTarget<MSILTarget> X("msil", "  MSIL backend");
+
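+// Prune the module's type symbol table down to the struct/opaque types that are
+// actually used, and give "unnamed$N" names to used struct types that lack one.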
+bool MSILModule::runOnModule(Module &M) {
+  ModulePtr = &M;
+  TD = &getAnalysis<TargetData>();
+  bool Changed = false;
+  // Find named types.  
+  TypeSymbolTable& Table = M.getTypeSymbolTable();
+  std::set<const Type *> Types = getAnalysis<FindUsedTypes>().getTypes();
+  for (TypeSymbolTable::iterator I = Table.begin(), E = Table.end(); I!=E; ) {
+    if (!isa<StructType>(I->second) && !isa<OpaqueType>(I->second))
+      Table.remove(I++);
+    else {
+      std::set<const Type *>::iterator T = Types.find(I->second);
+      if (T==Types.end())
+        Table.remove(I++);
+      else {
+        Types.erase(T);
+        ++I;
+      }
+    }
+  }
+  // Find unnamed types.
+  unsigned RenameCounter = 0;
+  for (std::set<const Type *>::const_iterator I = Types.begin(),
+       E = Types.end(); I!=E; ++I)
+    if (const StructType *STy = dyn_cast<StructType>(*I)) {
+      while (ModulePtr->addTypeName("unnamed$"+utostr(RenameCounter), STy))
+        ++RenameCounter;
+      Changed = true;
+    }
+  // Pointer for FunctionPass.
+  UsedTypes = &getAnalysis<FindUsedTypes>().getTypes();
+  return Changed;
+}
+
+char MSILModule::ID = 0;
+char MSILWriter::ID = 0;
+
+bool MSILWriter::runOnFunction(Function &F) {
+  if (F.isDeclaration()) return false;
+  LInfo = &getAnalysis<LoopInfo>();
+  printFunction(F);
+  return false;
+}
+
+
+bool MSILWriter::doInitialization(Module &M) {
+  ModulePtr = &M;
+  Mang = new Mangler(M);
+  Out << ".assembly extern mscorlib {}\n";
+  Out << ".assembly MSIL {}\n\n";
+  Out << "// External\n";
+  printExternals();
+  Out << "// Declarations\n";
+  printDeclarations(M.getTypeSymbolTable());
+  Out << "// Definitions\n";
+  printGlobalVariables();
+  Out << "// Startup code\n";
+  printModuleStartup();
+  return false;
+}
+
+
+bool MSILWriter::doFinalization(Module &M) {
+  delete Mang;
+  return false;
+}
+
+
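+// Emit the $MSIL_Startup entry point: it marshals the CLR command-line strings
+// into a C-style argc/argv block (via Marshal::StringToHGlobalAnsi), calls
+// $MSIL_Init, and then invokes the user's 'main' when its signature is supported.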
+void MSILWriter::printModuleStartup() {
+  Out <<
+  ".method static public int32 $MSIL_Startup() {\n"
+  "\t.entrypoint\n"
+  "\t.locals (native int i)\n"
+  "\t.locals (native int argc)\n"
+  "\t.locals (native int ptr)\n"
+  "\t.locals (void* argv)\n"
+  "\t.locals (string[] args)\n"
+  "\tcall\tstring[] [mscorlib]System.Environment::GetCommandLineArgs()\n"
+  "\tdup\n"
+  "\tstloc\targs\n"
+  "\tldlen\n"
+  "\tconv.i4\n"
+  "\tdup\n"
+  "\tstloc\targc\n";
+  printPtrLoad(TD->getPointerSize());
+  Out <<
+  "\tmul\n"
+  "\tlocalloc\n"
+  "\tstloc\targv\n"
+  "\tldc.i4.0\n"
+  "\tstloc\ti\n"
+  "L_01:\n"
+  "\tldloc\ti\n"
+  "\tldloc\targc\n"
+  "\tceq\n"
+  "\tbrtrue\tL_02\n"
+  "\tldloc\targs\n"
+  "\tldloc\ti\n"
+  "\tldelem.ref\n"
+  "\tcall\tnative int [mscorlib]System.Runtime.InteropServices.Marshal::"
+           "StringToHGlobalAnsi(string)\n"
+  "\tstloc\tptr\n"
+  "\tldloc\targv\n"
+  "\tldloc\ti\n";
+  printPtrLoad(TD->getPointerSize());
+  Out << 
+  "\tmul\n"
+  "\tadd\n"
+  "\tldloc\tptr\n"
+  "\tstind.i\n"
+  "\tldloc\ti\n"
+  "\tldc.i4.1\n"
+  "\tadd\n"
+  "\tstloc\ti\n"
+  "\tbr\tL_01\n"
+  "L_02:\n"
+  "\tcall void $MSIL_Init()\n";
+
+  // Call user 'main' function.
+  const Function* F = ModulePtr->getFunction("main");
+  if (!F || F->isDeclaration()) {
+    Out << "\tldc.i4.0\n\tret\n}\n";
+    return;
+  }
+  bool BadSig = true;
+  std::string Args("");
+  Function::const_arg_iterator Arg1,Arg2;
+
+  switch (F->arg_size()) {
+  case 0:
+    BadSig = false;
+    break;
+  case 1:
+    Arg1 = F->arg_begin();
+    if (Arg1->getType()->isInteger()) {
+      Out << "\tldloc\targc\n";
+      Args = getTypeName(Arg1->getType());
+      BadSig = false;
+    }
+    break;
+  case 2:
+    Arg1 = Arg2 = F->arg_begin(); ++Arg2;
+    if (Arg1->getType()->isInteger() && 
+        Arg2->getType()->getTypeID() == Type::PointerTyID) {
+      Out << "\tldloc\targc\n\tldloc\targv\n";
+      Args = getTypeName(Arg1->getType())+","+getTypeName(Arg2->getType());
+      BadSig = false;
+    }
+    break;
+  default:
+    BadSig = true;
+  }
+
+  bool RetVoid = (F->getReturnType()->getTypeID() == Type::VoidTyID);
+  if (BadSig || (!F->getReturnType()->isInteger() && !RetVoid)) {
+    Out << "\tldc.i4.0\n";
+  } else {
+    Out << "\tcall\t" << getTypeName(F->getReturnType()) <<
+      getConvModopt(F->getCallingConv()) << "main(" << Args << ")\n";
+    if (RetVoid)
+      Out << "\tldc.i4.0\n";
+    else
+      Out << "\tconv.i4\n";
+  }
+  Out << "\tret\n}\n";
+}
+
+bool MSILWriter::isZeroValue(const Value* V) {
+  if (const Constant *C = dyn_cast<Constant>(V))
+    return C->isNullValue();
+  return false;
+}
+
+
+std::string MSILWriter::getValueName(const Value* V) {
+  // Quoting the name allows control and space characters.
+  return "'"+Mang->getValueName(V)+"'";
+}
+
+
+std::string MSILWriter::getLabelName(const std::string& Name) {
+  if (Name.find('.')!=std::string::npos) {
+    std::string Tmp(Name);
+    // Replace unacceptable characters in the label name.
+    for (std::string::iterator I = Tmp.begin(), E = Tmp.end(); I!=E; ++I)
+      if (*I=='.') *I = '@';
+    return Tmp;
+  }
+  return Name;
+}
+
+
+std::string MSILWriter::getLabelName(const Value* V) {
+  return getLabelName(Mang->getValueName(V));
+}
+
+
+std::string MSILWriter::getConvModopt(unsigned CallingConvID) {
+  switch (CallingConvID) {
+  case CallingConv::C:
+  case CallingConv::Cold:
+  case CallingConv::Fast:
+    return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvCdecl) ";
+  case CallingConv::X86_FastCall:
+    return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvFastcall) ";
+  case CallingConv::X86_StdCall:
+    return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvStdcall) ";
+  default:
+    cerr << "CallingConvID = " << CallingConvID << '\n';
+    assert(0 && "Unsupported calling convention");
+  }
+}
+
+
+std::string MSILWriter::getArrayTypeName(Type::TypeID TyID, const Type* Ty) {
+  std::string Tmp = "";
+  const Type* ElemTy = Ty;
+  assert(Ty->getTypeID()==TyID && "Invalid type passed");
+  // Walk through the array element types.
+  for (;;) {
+    // Multidimensional array.
+    if (ElemTy->getTypeID()==TyID) {
+      if (const ArrayType* ATy = dyn_cast<ArrayType>(ElemTy))
+        Tmp += utostr(ATy->getNumElements());
+      else if (const VectorType* VTy = dyn_cast<VectorType>(ElemTy))
+        Tmp += utostr(VTy->getNumElements());
+      ElemTy = cast<SequentialType>(ElemTy)->getElementType();
+    }
+    // Base element type found.
+    if (ElemTy->getTypeID()!=TyID) break;
+    Tmp += ",";
+  }
+  return getTypeName(ElemTy, false, true)+"["+Tmp+"]";
+}
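+// e.g. for the nested LLVM array type [4 x [5 x i32]] the dimension string built
+// above is "4,5", so the result is the element type name followed by "[4,5]".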
+
+
+std::string MSILWriter::getPrimitiveTypeName(const Type* Ty, bool isSigned) {
+  unsigned NumBits = 0;
+  switch (Ty->getTypeID()) {
+  case Type::VoidTyID:
+    return "void ";
+  case Type::IntegerTyID:
+    NumBits = getBitWidth(Ty);
+    if(NumBits==1)
+      return "bool ";
+    if (!isSigned)
+      return "unsigned int"+utostr(NumBits)+" ";
+    return "int"+utostr(NumBits)+" ";
+  case Type::FloatTyID:
+    return "float32 ";
+  case Type::DoubleTyID:
+    return "float64 "; 
+  default:
+    cerr << "Type = " << *Ty << '\n';
+    assert(0 && "Invalid primitive type");
+  }
+}
+
+
+std::string MSILWriter::getTypeName(const Type* Ty, bool isSigned,
+                                    bool isNested) {
+  if (Ty->isPrimitiveType() || Ty->isInteger())
+    return getPrimitiveTypeName(Ty,isSigned);
+  // FIXME: "OpaqueType" support
+  switch (Ty->getTypeID()) {
+  case Type::PointerTyID:
+    return "void* ";
+  case Type::StructTyID:
+    if (isNested)
+      return ModulePtr->getTypeName(Ty);
+    return "valuetype '"+ModulePtr->getTypeName(Ty)+"' ";
+  case Type::ArrayTyID:
+    if (isNested)
+      return getArrayTypeName(Ty->getTypeID(),Ty);
+    return "valuetype '"+getArrayTypeName(Ty->getTypeID(),Ty)+"' ";
+  case Type::VectorTyID:
+    if (isNested)
+      return getArrayTypeName(Ty->getTypeID(),Ty);
+    return "valuetype '"+getArrayTypeName(Ty->getTypeID(),Ty)+"' ";
+  default:
+    cerr << "Type = " << *Ty << '\n';
+    assert(0 && "Invalid type in getTypeName()");
+  }
+}
+
+
+MSILWriter::ValueType MSILWriter::getValueLocation(const Value* V) {
+  // Function argument
+  if (isa<Argument>(V))
+    return ArgumentVT;
+  // Function
+  else if (const Function* F = dyn_cast<Function>(V))
+    return F->hasInternalLinkage() ? InternalVT : GlobalVT;
+  // Variable
+  else if (const GlobalVariable* G = dyn_cast<GlobalVariable>(V))
+    return G->hasInternalLinkage() ? InternalVT : GlobalVT;
+  // Constant
+  else if (isa<Constant>(V))
+    return isa<ConstantExpr>(V) ? ConstExprVT : ConstVT;
+  // Local variable
+  return LocalVT;
+}
+
+
+std::string MSILWriter::getTypePostfix(const Type* Ty, bool Expand,
+                                       bool isSigned) {
+  unsigned NumBits = 0;
+  switch (Ty->getTypeID()) {
+  // Integer constant, expanding for stack operations.
+  case Type::IntegerTyID:
+    NumBits = getBitWidth(Ty);
+    // Expand integer value to "int32" or "int64".
+    if (Expand) return (NumBits<=32 ? "i4" : "i8");
+    if (NumBits==1) return "i1";
+    return (isSigned ? "i" : "u")+utostr(NumBits/8);
+  // Float constant.
+  case Type::FloatTyID:
+    return "r4";
+  case Type::DoubleTyID:
+    return "r8";
+  case Type::PointerTyID:
+    return "i"+utostr(TD->getTypeSize(Ty));
+  default:
+    cerr << "TypeID = " << Ty->getTypeID() << '\n';
+    assert(0 && "Invalid type in TypeToPostfix()");
+  }
+}
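+// e.g. an i32 yields "i4" when Expand is set (stack operations) and "i4"/"u4"
+// (signed/unsigned) otherwise; a pointer on a 64-bit target yields "i8".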
+
+
+void MSILWriter::printConvToPtr() {
+  switch (ModulePtr->getPointerSize()) {
+  case Module::Pointer32:
+    printSimpleInstruction("conv.u4");
+    break;
+  case Module::Pointer64:
+    printSimpleInstruction("conv.u8");
+    break;
+  default:
+    assert(0 && "Module use not supporting pointer size");
+  }
+}
+
+
+void MSILWriter::printPtrLoad(uint64_t N) {
+  switch (ModulePtr->getPointerSize()) {
+  case Module::Pointer32:
+    printSimpleInstruction("ldc.i4",utostr(N).c_str());
+    // FIXME: Need overflow test?
+    if (!isUInt32(N)) {
+      cerr << "Value = " << utostr(N) << '\n';
+      assert(0 && "32-bit pointer overflowed");
+    }
+    break;
+  case Module::Pointer64:
+    printSimpleInstruction("ldc.i8",utostr(N).c_str());
+    break;
+  default:
+    assert(0 && "Module use not supporting pointer size");
+  }
+}
+
+
+void MSILWriter::printValuePtrLoad(const Value* V) {
+  printValueLoad(V);
+  printConvToPtr();
+}
+
+
+void MSILWriter::printConstLoad(const Constant* C) {
+  if (const ConstantInt* CInt = dyn_cast<ConstantInt>(C)) {
+    // Integer constant
+    Out << "\tldc." << getTypePostfix(C->getType(),true) << '\t';
+    if (CInt->isMinValue(true))
+      Out << CInt->getSExtValue();
+    else
+      Out << CInt->getZExtValue();
+  } else if (const ConstantFP* FP = dyn_cast<ConstantFP>(C)) {
+    // Float constant
+    uint64_t X;
+    unsigned Size;
+    if (FP->getType()->getTypeID()==Type::FloatTyID) {
+      X = FloatToBits(FP->getValue());
+      Size = 4;  
+    } else {
+      X = DoubleToBits(FP->getValue());
+      Size = 8;  
+    }
+    Out << "\tldc.r" << Size << "\t( " << utohexstr(X) << ')';
+  } else if (isa<UndefValue>(C)) {
+    // Undefined constant value = NULL.
+    printPtrLoad(0);
+  } else {
+    cerr << "Constant = " << *C << '\n';
+    assert(0 && "Invalid constant value");
+  }
+  Out << '\n';
+}
+
+
+void MSILWriter::printValueLoad(const Value* V) {
+  MSILWriter::ValueType Location = getValueLocation(V);
+  switch (Location) {
+  // Global variable or function address.
+  case GlobalVT:
+  case InternalVT:
+    if (const Function* F = dyn_cast<Function>(V)) {
+      std::string Name = getConvModopt(F->getCallingConv())+getValueName(F);
+      printSimpleInstruction("ldftn",
+        getCallSignature(F->getFunctionType(),NULL,Name).c_str());
+    } else {
+      std::string Tmp;
+      const Type* ElemTy = cast<PointerType>(V->getType())->getElementType();
+      if (Location==GlobalVT && cast<GlobalVariable>(V)->hasDLLImportLinkage()) {
+        Tmp = "void* "+getValueName(V);
+        printSimpleInstruction("ldsfld",Tmp.c_str());
+      } else {
+        Tmp = getTypeName(ElemTy)+getValueName(V);
+        printSimpleInstruction("ldsflda",Tmp.c_str());
+      }
+    }
+    break;
+  // Function argument.
+  case ArgumentVT:
+    printSimpleInstruction("ldarg",getValueName(V).c_str());
+    break;
+  // Local function variable.
+  case LocalVT:
+    printSimpleInstruction("ldloc",getValueName(V).c_str());
+    break;
+  // Constant value.
+  case ConstVT:
+    if (isa<ConstantPointerNull>(V))
+      printPtrLoad(0);
+    else
+      printConstLoad(cast<Constant>(V));
+    break;
+  // Constant expression.
+  case ConstExprVT:
+    printConstantExpr(cast<ConstantExpr>(V));
+    break;
+  default:
+    cerr << "Value = " << *V << '\n';
+    assert(0 && "Invalid value location");
+  }
+}
+
+
+void MSILWriter::printValueSave(const Value* V) {
+  switch (getValueLocation(V)) {
+  case ArgumentVT:
+    printSimpleInstruction("starg",getValueName(V).c_str());
+    break;
+  case LocalVT:
+    printSimpleInstruction("stloc",getValueName(V).c_str());
+    break;
+  default:
+    cerr << "Value  = " << *V << '\n';
+    assert(0 && "Invalid value location");
+  }
+}
+
+
+void MSILWriter::printBinaryInstruction(const char* Name, const Value* Left,
+                                        const Value* Right) {
+  printValueLoad(Left);
+  printValueLoad(Right);
+  Out << '\t' << Name << '\n';
+}
+
+
+void MSILWriter::printSimpleInstruction(const char* Inst, const char* Operand) {
+  if(Operand) 
+    Out << '\t' << Inst << '\t' << Operand << '\n';
+  else
+    Out << '\t' << Inst << '\n';
+}
+
+
+void MSILWriter::printPHICopy(const BasicBlock* Src, const BasicBlock* Dst) {
+  for (BasicBlock::const_iterator I = Dst->begin(), E = Dst->end();
+       isa<PHINode>(I); ++I) {
+    const PHINode* Phi = cast<PHINode>(I);
+    const Value* Val = Phi->getIncomingValueForBlock(Src);
+    if (isa<UndefValue>(Val)) continue;
+    printValueLoad(Val);
+    printValueSave(Phi);
+  }
+}
+
+
+void MSILWriter::printBranchToBlock(const BasicBlock* CurrBB,
+                                    const BasicBlock* TrueBB,
+                                    const BasicBlock* FalseBB) {
+  if (TrueBB==FalseBB) {
+    // "TrueBB" and "FalseBB" destination equals
+    printPHICopy(CurrBB,TrueBB);
+    printSimpleInstruction("pop");
+    printSimpleInstruction("br",getLabelName(TrueBB).c_str());
+  } else if (FalseBB==NULL) {
+    // If "FalseBB" not used the jump have condition
+    printPHICopy(CurrBB,TrueBB);
+    printSimpleInstruction("brtrue",getLabelName(TrueBB).c_str());
+  } else if (TrueBB==NULL) {
+    // If "TrueBB" not used the jump is unconditional
+    printPHICopy(CurrBB,FalseBB);
+    printSimpleInstruction("br",getLabelName(FalseBB).c_str());
+  } else {
+    // Copy PHI instructions for each block
+    std::string TmpLabel;
+    // Print PHI instructions for "TrueBB"
+    if (isa<PHINode>(TrueBB->begin())) {
+      TmpLabel = getLabelName(TrueBB)+"$phi_"+utostr(getUniqID());
+      printSimpleInstruction("brtrue",TmpLabel.c_str());
+    } else {
+      printSimpleInstruction("brtrue",getLabelName(TrueBB).c_str());
+    }
+    // Print PHI instructions for "FalseBB"
+    if (isa<PHINode>(FalseBB->begin())) {
+      printPHICopy(CurrBB,FalseBB);
+      printSimpleInstruction("br",getLabelName(FalseBB).c_str());
+    } else {
+      printSimpleInstruction("br",getLabelName(FalseBB).c_str());
+    }
+    if (isa<PHINode>(TrueBB->begin())) {
+      // Handle "TrueBB" PHI Copy
+      Out << TmpLabel << ":\n";
+      printPHICopy(CurrBB,TrueBB);
+      printSimpleInstruction("br",getLabelName(TrueBB).c_str());
+    }
+  }
+}
+
+
+void MSILWriter::printBranchInstruction(const BranchInst* Inst) {
+  if (Inst->isUnconditional()) {
+    printBranchToBlock(Inst->getParent(),NULL,Inst->getSuccessor(0));
+  } else {
+    printValueLoad(Inst->getCondition());
+    printBranchToBlock(Inst->getParent(),Inst->getSuccessor(0),
+                       Inst->getSuccessor(1));
+  }
+}
+
+
+void MSILWriter::printSelectInstruction(const Value* Cond, const Value* VTrue,
+                                        const Value* VFalse) {
+  std::string TmpLabel = std::string("select$true_")+utostr(getUniqID());
+  printValueLoad(VTrue);
+  printValueLoad(Cond);
+  printSimpleInstruction("brtrue",TmpLabel.c_str());
+  printSimpleInstruction("pop");
+  printValueLoad(VFalse);
+  Out << TmpLabel << ":\n";
+}
+
+
+void MSILWriter::printIndirectLoad(const Value* V) {
+  const Type* Ty = V->getType();
+  printValueLoad(V);
+  if (const PointerType* P = dyn_cast<PointerType>(Ty))
+    Ty = P->getElementType();
+  std::string Tmp = "ldind."+getTypePostfix(Ty, false);
+  printSimpleInstruction(Tmp.c_str());
+}
+
+
+void MSILWriter::printIndirectSave(const Value* Ptr, const Value* Val) {
+  printValueLoad(Ptr);
+  printValueLoad(Val);
+  printIndirectSave(Val->getType());
+}
+
+
+void MSILWriter::printIndirectSave(const Type* Ty) {
+  // The store instruction needs a signed postfix for any type.
+  std::string postfix = getTypePostfix(Ty, false);
+  if (*postfix.begin()=='u') *postfix.begin() = 'i';
+  postfix = "stind."+postfix;
+  printSimpleInstruction(postfix.c_str());
+}
+
+
+void MSILWriter::printCastInstruction(unsigned int Op, const Value* V,
+                                      const Type* Ty) {
+  std::string Tmp("");
+  printValueLoad(V);
+  switch (Op) {
+  // Signed
+  case Instruction::SExt:
+  case Instruction::SIToFP:
+  case Instruction::FPToSI:
+    Tmp = "conv."+getTypePostfix(Ty,false,true);
+    printSimpleInstruction(Tmp.c_str());
+    break;
+  // Unsigned
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::FPToUI:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+    Tmp = "conv."+getTypePostfix(Ty,false);
+    printSimpleInstruction(Tmp.c_str());
+    break;
+  // Do nothing
+  case Instruction::BitCast:
+    // FIXME: assumes that ld*/st* instructions do not change the data format.
+    break;
+  default:
+    cerr << "Opcode = " << Op << '\n';
+    assert(0 && "Invalid conversion instruction");
+  }
+}
+
+
+void MSILWriter::printGepInstruction(const Value* V, gep_type_iterator I,
+                                     gep_type_iterator E) {
+  unsigned Size;
+  // Load address
+  printValuePtrLoad(V);
+  // Calculate element offset.
+  for (; I!=E; ++I){
+    Size = 0;
+    const Value* IndexValue = I.getOperand();
+    if (const StructType* StrucTy = dyn_cast<StructType>(*I)) {
+      uint64_t FieldIndex = cast<ConstantInt>(IndexValue)->getZExtValue();
+      // Offset is the sum of all previous structure fields.
+      for (uint64_t F = 0; F<FieldIndex; ++F)
+        Size += TD->getTypeSize(StrucTy->getContainedType((unsigned)F));
+      printPtrLoad(Size);
+      printSimpleInstruction("add");
+      continue;
+    } else if (const SequentialType* SeqTy = dyn_cast<SequentialType>(*I)) {
+      Size = TD->getTypeSize(SeqTy->getElementType());
+    } else {
+      Size = TD->getTypeSize(*I);
+    }
+    // Add offset of current element to stack top.
+    if (!isZeroValue(IndexValue)) {
+      // Constant optimization.
+      if (const ConstantInt* C = dyn_cast<ConstantInt>(IndexValue)) {
+        if (C->getValue().isNegative()) {
+          printPtrLoad(C->getValue().abs().getZExtValue()*Size);
+          printSimpleInstruction("sub");
+          continue;
+        } else
+          printPtrLoad(C->getZExtValue()*Size);
+      } else {
+        printPtrLoad(Size);
+        printValuePtrLoad(IndexValue);
+        printSimpleInstruction("mul");
+      }
+      printSimpleInstruction("add");
+    }
+  }
+}
+
+
+std::string MSILWriter::getCallSignature(const FunctionType* Ty,
+                                         const Instruction* Inst,
+                                         std::string Name) {
+  std::string Tmp("");
+  if (Ty->isVarArg()) Tmp += "vararg ";
+  // Name and return type.
+  Tmp += getTypeName(Ty->getReturnType())+Name+"(";
+  // Function argument type list.
+  unsigned NumParams = Ty->getNumParams();
+  for (unsigned I = 0; I!=NumParams; ++I) {
+    if (I!=0) Tmp += ",";
+    Tmp += getTypeName(Ty->getParamType(I));
+  }
+  // The CLR needs to know the exact number of parameters received by a
+  // vararg function, because the caller cleans up the stack.
+  if (Ty->isVarArg() && Inst) {
+    // Index of the first argument operand in "CallInst" or "InvokeInst".
+    unsigned Org = isa<InvokeInst>(Inst) ? 3 : 1;
+    // Print variable argument types.
+    unsigned NumOperands = Inst->getNumOperands()-Org;
+    if (NumParams<NumOperands) {
+      if (NumParams!=0) Tmp += ", ";
+      Tmp += "... , ";
+      for (unsigned J = NumParams; J!=NumOperands; ++J) {
+        if (J!=NumParams) Tmp += ", ";
+        Tmp += getTypeName(Inst->getOperand(J+Org)->getType());
+      }
+    }
+  }
+  return Tmp+")";
+}
+
+
+void MSILWriter::printFunctionCall(const Value* FnVal,
+                                   const Instruction* Inst) {
+  // Get function calling convention.
+  std::string Name = "";
+  if (const CallInst* Call = dyn_cast<CallInst>(Inst))
+    Name = getConvModopt(Call->getCallingConv());
+  else if (const InvokeInst* Invoke = dyn_cast<InvokeInst>(Inst))
+    Name = getConvModopt(Invoke->getCallingConv());
+  else {
+    cerr << "Instruction = " << Inst->getName() << '\n';
+    assert(0 && "Need \"Invoke\" or \"Call\" instruction only");
+  }
+  if (const Function* F = dyn_cast<Function>(FnVal)) {
+    // Direct call.
+    Name += getValueName(F);
+    printSimpleInstruction("call",
+      getCallSignature(F->getFunctionType(),Inst,Name).c_str());
+  } else {
+    // Indirect function call.
+    const PointerType* PTy = cast<PointerType>(FnVal->getType());
+    const FunctionType* FTy = cast<FunctionType>(PTy->getElementType());
+    // Load function address.
+    printValueLoad(FnVal);
+    printSimpleInstruction("calli",getCallSignature(FTy,Inst,Name).c_str());
+  }
+}
+
+
+void MSILWriter::printIntrinsicCall(const IntrinsicInst* Inst) {
+  std::string Name;
+  switch (Inst->getIntrinsicID()) {
+  case Intrinsic::vastart:
+    Name = getValueName(Inst->getOperand(1));
+    Name.insert(Name.length()-1,"$valist");
+    // Obtain the argument handle.
+    printSimpleInstruction("ldloca",Name.c_str());
+    printSimpleInstruction("arglist");
+    printSimpleInstruction("call",
+      "instance void [mscorlib]System.ArgIterator::.ctor"
+      "(valuetype [mscorlib]System.RuntimeArgumentHandle)");
+    // Save as pointer type "void*"
+    printValueLoad(Inst->getOperand(1));
+    printSimpleInstruction("ldloca",Name.c_str());
+    printIndirectSave(PointerType::get(IntegerType::get(8)));
+    break;
+  case Intrinsic::vaend:
+    // Close argument list handle.
+    printIndirectLoad(Inst->getOperand(1));
+    printSimpleInstruction("call","instance void [mscorlib]System.ArgIterator::End()");
+    break;
+  case Intrinsic::vacopy:
+    // Copy "ArgIterator" valuetype.
+    printIndirectLoad(Inst->getOperand(1));
+    printIndirectLoad(Inst->getOperand(2));
+    printSimpleInstruction("cpobj","[mscorlib]System.ArgIterator");
+    break;        
+  default:
+    cerr << "Intrinsic ID = " << Inst->getIntrinsicID() << '\n';
+    assert(0 && "Invalid intrinsic function");
+  }
+}
+
+
+void MSILWriter::printCallInstruction(const Instruction* Inst) {
+  if (isa<IntrinsicInst>(Inst)) {
+    // Handle intrinsic function.
+    printIntrinsicCall(cast<IntrinsicInst>(Inst));
+  } else {
+    // Load arguments to stack and call function.
+    for (unsigned I = 1, E = Inst->getNumOperands(); I!=E; ++I)
+      printValueLoad(Inst->getOperand(I));
+    printFunctionCall(Inst->getOperand(0),Inst);
+  }
+}
+
+
+void MSILWriter::printICmpInstruction(unsigned Predicate, const Value* Left,
+                                      const Value* Right) {
+  switch (Predicate) {
+  case ICmpInst::ICMP_EQ:
+    printBinaryInstruction("ceq",Left,Right);
+    break;
+  case ICmpInst::ICMP_NE:
+    // Emulate = not neg (Op1 eq Op2)
+    printBinaryInstruction("ceq",Left,Right);
+    printSimpleInstruction("neg");
+    printSimpleInstruction("not");
+    break;
+  case ICmpInst::ICMP_ULE:
+  case ICmpInst::ICMP_SLE:
+    // Emulate = (Op1 eq Op2) or (Op1 lt Op2)
+    printBinaryInstruction("ceq",Left,Right);
+    if (Predicate==ICmpInst::ICMP_ULE)
+      printBinaryInstruction("clt.un",Left,Right);
+    else
+      printBinaryInstruction("clt",Left,Right);
+    printSimpleInstruction("or");
+    break;
+  case ICmpInst::ICMP_UGE:
+  case ICmpInst::ICMP_SGE:
+    // Emulate = (Op1 eq Op2) or (Op1 gt Op2)
+    printBinaryInstruction("ceq",Left,Right);
+    if (Predicate==ICmpInst::ICMP_UGE)
+      printBinaryInstruction("cgt.un",Left,Right);
+    else
+      printBinaryInstruction("cgt",Left,Right);
+    printSimpleInstruction("or");
+    break;
+  case ICmpInst::ICMP_ULT:
+    printBinaryInstruction("clt.un",Left,Right);
+    break;
+  case ICmpInst::ICMP_SLT:
+    printBinaryInstruction("clt",Left,Right);
+    break;
+  case ICmpInst::ICMP_UGT:
+    printBinaryInstruction("cgt.un",Left,Right);
+    break;
+  case ICmpInst::ICMP_SGT:
+    printBinaryInstruction("cgt",Left,Right);
+    break;
+  default:
+    cerr << "Predicate = " << Predicate << '\n';
+    assert(0 && "Invalid icmp predicate");
+  }
+}
+
+
+void MSILWriter::printFCmpInstruction(unsigned Predicate, const Value* Left,
+                                      const Value* Right) {
+  // FIXME: Make the floating-point comparisons fully correct (NaN handling).
+  std::string NanFunc = "bool [mscorlib]System.Double::IsNaN(float64)";
+  switch (Predicate) {
+  case FCmpInst::FCMP_UGT:
+    // X >  Y || llvm_fcmp_uno(X, Y)
+    printBinaryInstruction("cgt",Left,Right);
+    printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right);
+    printSimpleInstruction("or");
+    break;
+  case FCmpInst::FCMP_OGT:
+    // X >  Y
+    printBinaryInstruction("cgt",Left,Right);
+    break;
+  case FCmpInst::FCMP_UGE:
+    // X >= Y || llvm_fcmp_uno(X, Y)
+    printBinaryInstruction("ceq",Left,Right);
+    printBinaryInstruction("cgt",Left,Right);
+    printSimpleInstruction("or");
+    printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right);
+    printSimpleInstruction("or");
+    break;
+  case FCmpInst::FCMP_OGE:
+    // X >= Y
+    printBinaryInstruction("ceq",Left,Right);
+    printBinaryInstruction("cgt",Left,Right);
+    printSimpleInstruction("or");
+    break;
+  case FCmpInst::FCMP_ULT:
+    // X <  Y || llvm_fcmp_uno(X, Y)
+    printBinaryInstruction("clt",Left,Right);
+    printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right);
+    printSimpleInstruction("or");
+    break;
+  case FCmpInst::FCMP_OLT:
+    // X <  Y
+    printBinaryInstruction("clt",Left,Right);
+    break;
+  case FCmpInst::FCMP_ULE:
+    // X <= Y || llvm_fcmp_uno(X, Y)
+    printBinaryInstruction("ceq",Left,Right);
+    printBinaryInstruction("clt",Left,Right);
+    printSimpleInstruction("or");
+    printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right);
+    printSimpleInstruction("or");
+    break;
+  case FCmpInst::FCMP_OLE:
+    // X <= Y
+    printBinaryInstruction("ceq",Left,Right);
+    printBinaryInstruction("clt",Left,Right);
+    printSimpleInstruction("or");
+    break;
+  case FCmpInst::FCMP_UEQ:
+    // X == Y || llvm_fcmp_uno(X, Y)
+    printBinaryInstruction("ceq",Left,Right);
+    printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right);
+    printSimpleInstruction("or");
+    break;
+  case FCmpInst::FCMP_OEQ:
+    // X == Y
+    printBinaryInstruction("ceq",Left,Right);
+    break;
+  case FCmpInst::FCMP_UNE:
+    // X != Y
+    printBinaryInstruction("ceq",Left,Right);
+    printSimpleInstruction("neg");
+    printSimpleInstruction("not");
+    break;
+  case FCmpInst::FCMP_ONE:
+    // X != Y && llvm_fcmp_ord(X, Y)
+    printBinaryInstruction("ceq",Left,Right);
+    printSimpleInstruction("not");
+    break;
+  case FCmpInst::FCMP_ORD:
+    // return X == X && Y == Y
+    printBinaryInstruction("ceq",Left,Left);
+    printBinaryInstruction("ceq",Right,Right);
+    printSimpleInstruction("or");
+    break;
+  case FCmpInst::FCMP_UNO:
+    // X != X || Y != Y
+    printBinaryInstruction("ceq",Left,Left);
+    printSimpleInstruction("not");
+    printBinaryInstruction("ceq",Right,Right);
+    printSimpleInstruction("not");
+    printSimpleInstruction("or");
+    break;
+  default:
+    assert(0 && "Illegal FCmp predicate");
+  }
+}
+
+
+void MSILWriter::printInvokeInstruction(const InvokeInst* Inst) {
+  std::string Label = "leave$normal_"+utostr(getUniqID());
+  Out << ".try {\n";
+  // Load arguments
+  for (unsigned I = 3, E = Inst->getNumOperands(); I!=E; ++I)
+    printValueLoad(Inst->getOperand(I));
+  // Print call instruction
+  printFunctionCall(Inst->getOperand(0),Inst);
+  // Save function result and leave "try" block
+  printValueSave(Inst);
+  printSimpleInstruction("leave",Label.c_str());
+  Out << "}\n";
+  Out << "catch [mscorlib]System.Exception {\n";
+  // Redirect to unwind block
+  printSimpleInstruction("pop");
+  printBranchToBlock(Inst->getParent(),NULL,Inst->getUnwindDest());
+  Out << "}\n" << Label << ":\n";
+  // Redirect to continue block
+  printBranchToBlock(Inst->getParent(),NULL,Inst->getNormalDest());
+}
+
+
+void MSILWriter::printSwitchInstruction(const SwitchInst* Inst) {
+  // FIXME: Emulate with IL "switch" instruction
+  // Emulate = if () else if () else if () else ...
+  for (unsigned int I = 1, E = Inst->getNumCases(); I!=E; ++I) {
+    printValueLoad(Inst->getCondition());
+    printValueLoad(Inst->getCaseValue(I));
+    printSimpleInstruction("ceq");
+    // Conditional jump to the successor block.
+    printBranchToBlock(Inst->getParent(),Inst->getSuccessor(I),NULL);
+  }
+  // Jump to default block
+  printBranchToBlock(Inst->getParent(),NULL,Inst->getDefaultDest());
+}
+
+
+void MSILWriter::printVAArgInstruction(const VAArgInst* Inst) {
+  printIndirectLoad(Inst->getOperand(0));
+  printSimpleInstruction("call",
+    "instance typedref [mscorlib]System.ArgIterator::GetNextArg()");
+  printSimpleInstruction("refanyval","void*");
+  std::string Name = "ldind."+getTypePostfix(PointerType::get(IntegerType::get(8)),false);
+  printSimpleInstruction(Name.c_str());
+}
+
+
+void MSILWriter::printAllocaInstruction(const AllocaInst* Inst) {
+  uint64_t Size = TD->getTypeSize(Inst->getAllocatedType());
+  // Constant optimization.
+  if (const ConstantInt* CInt = dyn_cast<ConstantInt>(Inst->getOperand(0))) {
+    printPtrLoad(CInt->getZExtValue()*Size);
+  } else {
+    printPtrLoad(Size);
+    printValueLoad(Inst->getOperand(0));
+    printSimpleInstruction("mul");
+  }
+  printSimpleInstruction("localloc");
+}
+
+
+void MSILWriter::printInstruction(const Instruction* Inst) {
+  const Value *Left = 0, *Right = 0;
+  if (Inst->getNumOperands()>=1) Left = Inst->getOperand(0);
+  if (Inst->getNumOperands()>=2) Right = Inst->getOperand(1);
+  // Print instruction
+  // FIXME: "ShuffleVector","ExtractElement","InsertElement" support.
+  switch (Inst->getOpcode()) {
+  // Terminator
+  case Instruction::Ret:
+    if (Inst->getNumOperands())
+      printValueLoad(Left);
+    printSimpleInstruction("ret");
+    break;
+  case Instruction::Br:
+    printBranchInstruction(cast<BranchInst>(Inst));
+    break;
+  // Binary
+  case Instruction::Add:
+    printBinaryInstruction("add",Left,Right);
+    break;
+  case Instruction::Sub:
+    printBinaryInstruction("sub",Left,Right);
+    break;
+  case Instruction::Mul:  
+    printBinaryInstruction("mul",Left,Right);
+    break;
+  case Instruction::UDiv:
+    printBinaryInstruction("div.un",Left,Right);
+    break;
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+    printBinaryInstruction("div",Left,Right);
+    break;
+  case Instruction::URem:
+    printBinaryInstruction("rem.un",Left,Right);
+    break;
+  case Instruction::SRem:
+  case Instruction::FRem:
+    printBinaryInstruction("rem",Left,Right);
+    break;
+  // Binary Condition
+  case Instruction::ICmp:
+    printICmpInstruction(cast<ICmpInst>(Inst)->getPredicate(),Left,Right);
+    break;
+  case Instruction::FCmp:
+    printFCmpInstruction(cast<FCmpInst>(Inst)->getPredicate(),Left,Right);
+    break;
+  // Bitwise Binary
+  case Instruction::And:
+    printBinaryInstruction("and",Left,Right);
+    break;
+  case Instruction::Or:
+    printBinaryInstruction("or",Left,Right);
+    break;
+  case Instruction::Xor:
+    printBinaryInstruction("xor",Left,Right);
+    break;
+  case Instruction::Shl:
+    printValueLoad(Left);
+    printValueLoad(Right);
+    printSimpleInstruction("conv.i4");
+    printSimpleInstruction("shl");
+    break;
+  case Instruction::LShr:
+    printValueLoad(Left);
+    printValueLoad(Right);
+    printSimpleInstruction("conv.i4");
+    printSimpleInstruction("shr.un");
+    break;
+  case Instruction::AShr:
+    printValueLoad(Left);
+    printValueLoad(Right);
+    printSimpleInstruction("conv.i4");
+    printSimpleInstruction("shr");
+    break;
+  case Instruction::Select:
+    printSelectInstruction(Inst->getOperand(0),Inst->getOperand(1),Inst->getOperand(2));
+    break;
+  case Instruction::Load:
+    printIndirectLoad(Inst->getOperand(0));
+    break;
+  case Instruction::Store:
+    printIndirectSave(Inst->getOperand(1), Inst->getOperand(0));
+    break;
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::BitCast:
+    printCastInstruction(Inst->getOpcode(),Left,
+                         cast<CastInst>(Inst)->getDestTy());
+    break;
+  case Instruction::GetElementPtr:
+    printGepInstruction(Inst->getOperand(0),gep_type_begin(Inst),
+                        gep_type_end(Inst));
+    break;
+  case Instruction::Call:
+    printCallInstruction(cast<CallInst>(Inst));
+    break;
+  case Instruction::Invoke:
+    printInvokeInstruction(cast<InvokeInst>(Inst));
+    break;
+  case Instruction::Unwind:
+    printSimpleInstruction("newobj",
+      "instance void [mscorlib]System.Exception::.ctor()");
+    printSimpleInstruction("throw");
+    break;
+  case Instruction::Switch:
+    printSwitchInstruction(cast<SwitchInst>(Inst));
+    break;
+  case Instruction::Alloca:
+    printAllocaInstruction(cast<AllocaInst>(Inst));
+    break;
+  case Instruction::Malloc:
+  case Instruction::Free:
+    assert(0 && "Should have been lowered by LowerAllocationsPass");
+    break;
+  case Instruction::Unreachable:
+    printSimpleInstruction("ldstr", "\"Unreachable instruction\"");
+    printSimpleInstruction("newobj",
+      "instance void [mscorlib]System.Exception::.ctor(string)");
+    printSimpleInstruction("throw");
+    break;
+  case Instruction::VAArg:
+    printVAArgInstruction(cast<VAArgInst>(Inst));
+    break;
+  default:
+    cerr << "Instruction = " << Inst->getName() << '\n';
+    assert(0 && "Unsupported instruction");
+  }
+}
+
+
+void MSILWriter::printLoop(const Loop* L) {
+  Out << getLabelName(L->getHeader()->getName()) << ":\n";
+  const std::vector<BasicBlock*>& blocks = L->getBlocks();
+  for (unsigned I = 0, E = blocks.size(); I!=E; I++) {
+    BasicBlock* BB = blocks[I];
+    Loop* BBLoop = LInfo->getLoopFor(BB);
+    if (BBLoop == L)
+      printBasicBlock(BB);
+    else if (BB==BBLoop->getHeader() && BBLoop->getParentLoop()==L)
+      printLoop(BBLoop);
+  }
+  printSimpleInstruction("br",getLabelName(L->getHeader()->getName()).c_str());
+}
+
+
+void MSILWriter::printBasicBlock(const BasicBlock* BB) {
+  Out << getLabelName(BB) << ":\n";
+  for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; ++I) {
+    const Instruction* Inst = I;
+    // Emit the original LLVM instruction as a comment.
+    Out << "\n//" << *Inst << "\n";
+    // PHI instructions are not handled here; their copies are emitted at branches.
+    if (Inst->getOpcode()==Instruction::PHI) continue;
+    // Print instruction
+    printInstruction(Inst);
+    // Save result
+    if (Inst->getType()!=Type::VoidTy) {
+      // Do not save value after invoke, it done in "try" block
+      if (Inst->getOpcode()==Instruction::Invoke) continue;
+      printValueSave(Inst);
+    }
+  }
+}
+
+
+void MSILWriter::printLocalVariables(const Function& F) {
+  std::string Name;
+  const Type* Ty = NULL;
+  std::set<const Value*> Printed;
+  const Value* VaList = NULL;
+  unsigned StackDepth = 8;
+  // Find local variables
+  for (const_inst_iterator I = inst_begin(&F), E = inst_end(&F); I!=E; ++I) {
+    if (I->getOpcode()==Instruction::Call ||
+        I->getOpcode()==Instruction::Invoke) {
+      // Test stack depth.
+      if (StackDepth<I->getNumOperands())
+        StackDepth = I->getNumOperands();
+    }
+    const AllocaInst* AI = dyn_cast<AllocaInst>(&*I);
+    if (AI && !isa<GlobalVariable>(AI)) {
+      // Local variable allocation.
+      Ty = PointerType::get(AI->getAllocatedType());
+      Name = getValueName(AI);
+      Out << "\t.locals (" << getTypeName(Ty) << Name << ")\n";
+    } else if (I->getType()!=Type::VoidTy) {
+      // Operation result.
+      Ty = I->getType();
+      Name = getValueName(&*I);
+      Out << "\t.locals (" << getTypeName(Ty) << Name << ")\n";
+    }
+    // Check for a 'va_list' variable.
+    bool isVaList = false;
+    if (const VAArgInst* VaInst = dyn_cast<VAArgInst>(&*I)) {
+      // "va_list" as "va_arg" instruction operand.
+      isVaList = true;
+      VaList = VaInst->getOperand(0);
+    } else if (const IntrinsicInst* Inst = dyn_cast<IntrinsicInst>(&*I)) {
+      // "va_list" as intrinsic function operand. 
+      switch (Inst->getIntrinsicID()) {
+      case Intrinsic::vastart:
+      case Intrinsic::vaend:
+      case Intrinsic::vacopy:
+        isVaList = true;
+        VaList = Inst->getOperand(1);
+        break;
+      default:
+        isVaList = false;
+      }
+    }
+    // Print "va_list" variable.
+    if (isVaList && Printed.insert(VaList).second) {
+      Name = getValueName(VaList);
+      Name.insert(Name.length()-1,"$valist");
+      Out << "\t.locals (valuetype [mscorlib]System.ArgIterator "
+          << Name << ")\n";
+    }
+  }
+  printSimpleInstruction(".maxstack",utostr(StackDepth*2).c_str());
+}
+
+
+void MSILWriter::printFunctionBody(const Function& F) {
+  // Print body
+  for (Function::const_iterator I = F.begin(), E = F.end(); I!=E; ++I) {
+    if (Loop *L = LInfo->getLoopFor(I)) {
+      if (L->getHeader()==I && L->getParentLoop()==0)
+        printLoop(L);
+    } else {
+      printBasicBlock(I);
+    }
+  }
+}
+
+
+void MSILWriter::printConstantExpr(const ConstantExpr* CE) {
+  const Value *left = 0, *right = 0;
+  if (CE->getNumOperands()>=1) left = CE->getOperand(0);
+  if (CE->getNumOperands()>=2) right = CE->getOperand(1);
+  // Print instruction
+  switch (CE->getOpcode()) {
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::BitCast:
+    printCastInstruction(CE->getOpcode(),left,CE->getType());
+    break;
+  case Instruction::GetElementPtr:
+    printGepInstruction(CE->getOperand(0),gep_type_begin(CE),gep_type_end(CE));
+    break;
+  case Instruction::ICmp:
+    printICmpInstruction(CE->getPredicate(),left,right);
+    break;
+  case Instruction::FCmp:
+    printFCmpInstruction(CE->getPredicate(),left,right);
+    break;
+  case Instruction::Select:
+    printSelectInstruction(CE->getOperand(0),CE->getOperand(1),CE->getOperand(2));
+    break;
+  case Instruction::Add:
+    printBinaryInstruction("add",left,right);
+    break;
+  case Instruction::Sub:
+    printBinaryInstruction("sub",left,right);
+    break;
+  case Instruction::Mul:
+    printBinaryInstruction("mul",left,right);
+    break;
+  case Instruction::UDiv:
+    printBinaryInstruction("div.un",left,right);
+    break;
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+    printBinaryInstruction("div",left,right);
+    break;
+  case Instruction::URem:
+    printBinaryInstruction("rem.un",left,right);
+    break;
+  case Instruction::SRem:
+  case Instruction::FRem:
+    printBinaryInstruction("rem",left,right);
+    break;
+  case Instruction::And:
+    printBinaryInstruction("and",left,right);
+    break;
+  case Instruction::Or:
+    printBinaryInstruction("or",left,right);
+    break;
+  case Instruction::Xor:
+    printBinaryInstruction("xor",left,right);
+    break;
+  case Instruction::Shl:
+    printBinaryInstruction("shl",left,right);
+    break;
+  case Instruction::LShr:
+    printBinaryInstruction("shr.un",left,right);
+    break;
+  case Instruction::AShr:
+    printBinaryInstruction("shr",left,right);
+    break;
+  default:
+    cerr << "Expression = " << *CE << "\n";
+    assert(0 && "Invalid constant expression");
+  }
+}
+
+
+void MSILWriter::printStaticInitializerList() {
+  // Iterate over global variables whose fields need runtime initialization.
+  for (std::map<const GlobalVariable*,std::vector<StaticInitializer> >::iterator
+       VarI = StaticInitList.begin(), VarE = StaticInitList.end(); VarI!=VarE;
+       ++VarI) {
+    const std::vector<StaticInitializer>& InitList = VarI->second;
+    if (InitList.empty()) continue;
+    // For each field that needs runtime initialization.
+    for (std::vector<StaticInitializer>::const_iterator I = InitList.begin(),
+         E = InitList.end(); I!=E; ++I) {
+      if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(I->constant)) {
+        Out << "\n// Init " << getValueName(VarI->first) << ", offset " <<
+          utostr(I->offset) << ", type "<< *I->constant->getType() << "\n\n";
+        // Load variable address
+        printValueLoad(VarI->first);
+        // Add offset
+        if (I->offset!=0) {
+          printPtrLoad(I->offset);
+          printSimpleInstruction("add");
+        }
+        // Load value
+        printConstantExpr(CE);
+        // Save result at offset
+        std::string postfix = getTypePostfix(CE->getType(),true);
+        if (*postfix.begin()=='u') *postfix.begin() = 'i';
+        postfix = "stind."+postfix;
+        printSimpleInstruction(postfix.c_str());
+      } else {
+        cerr << "Constant = " << *I->constant << '\n';
+        assert(0 && "Invalid static initializer");
+      }
+    }
+  }
+}
+
+
+void MSILWriter::printFunction(const Function& F) {
+  const FunctionType* FTy = F.getFunctionType();
+  const ParamAttrsList *Attrs = FTy->getParamAttrs();
+  bool isSigned = Attrs && Attrs->paramHasAttr(0, ParamAttr::SExt);
+  Out << "\n.method static ";
+  Out << (F.hasInternalLinkage() ? "private " : "public ");
+  if (F.isVarArg()) Out << "vararg ";
+  Out << getTypeName(F.getReturnType(),isSigned) << 
+    getConvModopt(F.getCallingConv()) << getValueName(&F) << '\n';
+  // Arguments
+  Out << "\t(";
+  unsigned ArgIdx = 1;
+  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I!=E;
+       ++I, ++ArgIdx) {
+    isSigned = Attrs && Attrs->paramHasAttr(ArgIdx, ParamAttr::SExt);
+    if (I!=F.arg_begin()) Out << ", ";
+    Out << getTypeName(I->getType(),isSigned) << getValueName(I);
+  }
+  Out << ") cil managed\n";
+  // Body
+  Out << "{\n";
+  printLocalVariables(F);
+  printFunctionBody(F);
+  Out << "}\n";
+}
+
+
+void MSILWriter::printDeclarations(const TypeSymbolTable& ST) {
+  std::string Name;
+  std::set<const Type*> Printed;
+  for (std::set<const Type*>::const_iterator
+       UI = UsedTypes->begin(), UE = UsedTypes->end(); UI!=UE; ++UI) {
+    const Type* Ty = *UI;
+    if (isa<ArrayType>(Ty) || isa<VectorType>(Ty) || isa<StructType>(Ty))
+      Name = getTypeName(Ty, false, true);
+    // Other types do not need a declaration.
+    else continue;
+    // Print each type only once.
+    if (Printed.insert(Ty).second) {
+      Out << ".class value explicit ansi sealed '" << Name << "'";
+      Out << " { .pack " << 1 << " .size " << TD->getTypeSize(Ty) << " }\n\n";
+    }
+  }
+}
+
+
+unsigned int MSILWriter::getBitWidth(const Type* Ty) {
+  unsigned int N = Ty->getPrimitiveSizeInBits();
+  assert(N!=0 && "Invalid type in getBitWidth()");
+  switch (N) {
+  case 1:
+  case 8:
+  case 16:
+  case 32:
+  case 64:
+    return N;
+  default:
+    cerr << "Bits = " << N << '\n';
+    assert(0 && "Unsupported integer width");
+  }
+}
+
+
+void MSILWriter::printStaticConstant(const Constant* C, uint64_t& Offset) {
+  uint64_t TySize = 0;
+  const Type* Ty = C->getType();
+  // Print zero initialized constant.
+  if (isa<ConstantAggregateZero>(C) || C->isNullValue()) {
+    TySize = TD->getTypeSize(C->getType());
+    Offset += TySize;
+    Out << "int8 (0) [" << TySize << "]";
+    return;
+  }
+  // Print constant initializer
+  switch (Ty->getTypeID()) {
+  case Type::IntegerTyID: {
+    TySize = TD->getTypeSize(Ty);
+    const ConstantInt* Int = cast<ConstantInt>(C);
+    Out << getPrimitiveTypeName(Ty,true) << "(" << Int->getSExtValue() << ")";
+    break;
+  }
+  case Type::FloatTyID:
+  case Type::DoubleTyID: {
+    TySize = TD->getTypeSize(Ty);
+    const ConstantFP* FP = cast<ConstantFP>(C);
+    if (Ty->getTypeID() == Type::FloatTyID)
+      Out << "int32 (" << FloatToBits(FP->getValue()) << ')';
+    else
+      Out << "int64 (" << DoubleToBits(FP->getValue()) << ')';
+    break;
+  }
+  case Type::ArrayTyID:
+  case Type::VectorTyID:
+  case Type::StructTyID:
+    for (unsigned I = 0, E = C->getNumOperands(); I<E; I++) {
+      if (I!=0) Out << ",\n";
+      printStaticConstant(C->getOperand(I),Offset);
+    }
+    break;
+  case Type::PointerTyID:
+    TySize = TD->getTypeSize(C->getType());
+    // Initialize with global variable address
+    if (const GlobalVariable *G = dyn_cast<GlobalVariable>(C)) {
+      std::string name = getValueName(G);
+      Out << "&(" << name.insert(name.length()-1,"$data") << ")";
+    } else {
+      // Dynamic initialization
+      if (!isa<ConstantPointerNull>(C) && !C->isNullValue())
+        InitListPtr->push_back(StaticInitializer(C,Offset));
+      // Null pointer initialization
+      if (TySize==4) Out << "int32 (0)";
+      else if (TySize==8) Out << "int64 (0)";
+      else assert(0 && "Invalid pointer size");
+    }
+    break;
+  default:
+    cerr << "TypeID = " << Ty->getTypeID() << '\n';
+    assert(0 && "Invalid type in printStaticConstant()");
+  }
+  // Increase offset.
+  Offset += TySize;
+}
+
+
+void MSILWriter::printStaticInitializer(const Constant* C,
+                                        const std::string& Name) {
+  switch (C->getType()->getTypeID()) {
+  case Type::IntegerTyID:
+  case Type::FloatTyID:
+  case Type::DoubleTyID: 
+    Out << getPrimitiveTypeName(C->getType(), false);
+    break;
+  case Type::ArrayTyID:
+  case Type::VectorTyID:
+  case Type::StructTyID:
+  case Type::PointerTyID:
+    Out << getTypeName(C->getType());
+    break;
+  default:
+    cerr << "Type = " << *C << "\n";
+    assert(0 && "Invalid constant type");
+  }
+  // Print initializer
+  std::string label = Name;
+  label.insert(label.length()-1,"$data");
+  Out << Name << " at " << label << '\n';
+  Out << ".data " << label << " = {\n";
+  uint64_t offset = 0;
+  printStaticConstant(C,offset);
+  Out << "\n}\n\n";
+}
+
+
+void MSILWriter::printVariableDefinition(const GlobalVariable* G) {
+  const Constant* C = G->getInitializer();
+  if (C->isNullValue() || isa<ConstantAggregateZero>(C) || isa<UndefValue>(C))
+    InitListPtr = 0;
+  else
+    InitListPtr = &StaticInitList[G];
+  printStaticInitializer(C,getValueName(G));
+}
+
+
+void MSILWriter::printGlobalVariables() {
+  if (ModulePtr->global_empty()) return;
+  Module::global_iterator I,E;
+  for (I = ModulePtr->global_begin(), E = ModulePtr->global_end(); I!=E; ++I) {
+    // Variable definition
+    Out << ".field static " << (I->isDeclaration() ? "public " :
+                                                     "private ");
+    if (I->isDeclaration()) {
+      Out << getTypeName(I->getType()) << getValueName(&*I) << "\n\n";
+    } else
+      printVariableDefinition(&*I);
+  }
+}
+
+
+const char* MSILWriter::getLibraryName(const Function* F) {
+  return getLibraryForSymbol(F->getName().c_str(), true, F->getCallingConv());
+}
+
+
+const char* MSILWriter::getLibraryName(const GlobalVariable* GV) {
+  return getLibraryForSymbol(Mang->getValueName(GV).c_str(), false, 0);
+}
+
+
+const char* MSILWriter::getLibraryForSymbol(const char* Name, bool isFunction,
+                                           unsigned CallingConv) {
+  // TODO: Read a *.def file with function and library definitions.
+  return "MSVCRT.DLL";  
+}
+
+
+void MSILWriter::printExternals() {
+  Module::const_iterator I,E;
+  // Functions.
+  for (I=ModulePtr->begin(),E=ModulePtr->end(); I!=E; ++I) {
+    // Skip intrinsics.
+    if (I->getIntrinsicID()) continue;
+    if (I->isDeclaration()) {
+      const Function* F = I; 
+      std::string Name = getConvModopt(F->getCallingConv())+getValueName(F);
+      std::string Sig = 
+        getCallSignature(cast<FunctionType>(F->getFunctionType()), NULL, Name);
+      Out << ".method static hidebysig pinvokeimpl(\""
+          << getLibraryName(F) << "\")\n\t" << Sig << " preservesig {}\n\n";
+    }
+  }
+  // External variables and static initialization.
+  Out <<
+  ".method public hidebysig static pinvokeimpl(\"KERNEL32.DLL\" ansi winapi)"
+  "  native int LoadLibrary(string) preservesig {}\n"
+  ".method public hidebysig static pinvokeimpl(\"KERNEL32.DLL\" ansi winapi)"
+  "  native int GetProcAddress(native int, string) preservesig {}\n";
+  Out <<
+  ".method private static void* $MSIL_Import(string lib,string sym)\n"
+  " managed cil\n{\n"
+  "\tldarg\tlib\n"
+  "\tcall\tnative int LoadLibrary(string)\n"
+  "\tldarg\tsym\n"
+  "\tcall\tnative int GetProcAddress(native int,string)\n"
+  "\tdup\n"
+  "\tbrtrue\tL_01\n"
+  "\tldstr\t\"Can no import variable\"\n"
+  "\tnewobj\tinstance void [mscorlib]System.Exception::.ctor(string)\n"
+  "\tthrow\n"
+  "L_01:\n"
+  "\tret\n"
+  "}\n\n"
+  ".method static private void $MSIL_Init() managed cil\n{\n";
+  printStaticInitializerList();
+  // For each global variable.
+  for (Module::global_iterator I = ModulePtr->global_begin(),
+       E = ModulePtr->global_end(); I!=E; ++I) {
+    if (!I->isDeclaration() || !I->hasDLLImportLinkage()) continue;
+    // Use "LoadLibrary"/"GetProcAddress" to recive variable address.
+    std::string Label = "not_null$_"+utostr(getUniqID());
+    std::string Tmp = getTypeName(I->getType())+getValueName(&*I);
+    printSimpleInstruction("ldsflda",Tmp.c_str());
+    Out << "\tldstr\t\"" << getLibraryName(&*I) << "\"\n";
+    Out << "\tldstr\t\"" << Mang->getValueName(&*I) << "\"\n";
+    printSimpleInstruction("call","void* $MSIL_Import(string,string)");
+    printIndirectSave(I->getType());
+  }
+  printSimpleInstruction("ret");
+  Out << "}\n\n";
+}
+
+
+//===----------------------------------------------------------------------===//
+//			 External Interface declaration
+//===----------------------------------------------------------------------===//
+
+bool MSILTarget::addPassesToEmitWholeFile(PassManager &PM, std::ostream &o,
+                                          CodeGenFileType FileType, bool Fast)
+{
+  if (FileType != TargetMachine::AssemblyFile) return true;
+  MSILWriter* Writer = new MSILWriter(o);
+  PM.add(createLowerGCPass());
+  PM.add(createLowerAllocationsPass(true));
+  // FIXME: Handle switch through the native IL "switch" instruction.
+  PM.add(createLowerSwitchPass());
+  PM.add(createCFGSimplificationPass());
+  PM.add(new MSILModule(Writer->UsedTypes,Writer->TD));
+  PM.add(Writer);
+  return false;
+}
diff --git a/lib/Target/MSIL/MSILWriter.h b/lib/Target/MSIL/MSILWriter.h
new file mode 100644
index 0000000..c2cd0ab
--- /dev/null
+++ b/lib/Target/MSIL/MSILWriter.h
@@ -0,0 +1,255 @@
+//===-- MSILWriter.h - TargetMachine for the MSIL ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Roman Samoilov and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSILWriter pass used by the MSIL backend.
+//
+//===----------------------------------------------------------------------===//
+#ifndef MSILWRITER_H
+#define MSILWRITER_H
+
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/Analysis/FindUsedTypes.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Support/Mangler.h"
+#include <algorithm>
+#include <ios>
+using namespace llvm;
+
+namespace {
+
+  class MSILModule : public ModulePass {
+    Module *ModulePtr;
+    const std::set<const Type *>*& UsedTypes;
+    const TargetData*& TD;
+
+  public:
+    static char ID;
+    MSILModule(const std::set<const Type *>*& _UsedTypes,
+               const TargetData*& _TD)
+      : ModulePass((intptr_t)&ID), UsedTypes(_UsedTypes), TD(_TD) {}
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<FindUsedTypes>();
+      AU.addRequired<TargetData>();
+    }
+
+    virtual const char *getPassName() const {
+      return "MSIL backend definitions";
+    }
+
+    virtual bool runOnModule(Module &M);
+
+  };
+
+  class MSILWriter  : public FunctionPass {
+    struct StaticInitializer {
+      const Constant* constant;
+      uint64_t offset;
+      
+      StaticInitializer()
+        : constant(0), offset(0) {}
+
+      StaticInitializer(const Constant* _constant, uint64_t _offset)
+        : constant(_constant), offset(_offset) {} 
+    };
+
+    uint64_t UniqID;
+
+    uint64_t getUniqID() {
+      return ++UniqID;
+    }
+
+  public:
+    std::ostream &Out;
+    Module* ModulePtr;
+    const TargetData* TD;
+    Mangler* Mang;
+    LoopInfo *LInfo;
+    std::vector<StaticInitializer>* InitListPtr;
+    std::map<const GlobalVariable*,std::vector<StaticInitializer> >
+      StaticInitList;
+    const std::set<const Type *>* UsedTypes;
+    static char ID;
+    MSILWriter(std::ostream &o) : FunctionPass((intptr_t)&ID), Out(o) {
+      UniqID = 0;
+    }
+
+    enum ValueType {
+      UndefVT,
+      GlobalVT,
+      InternalVT,
+      ArgumentVT,
+      LocalVT,
+      ConstVT,
+      ConstExprVT
+    };
+
+    bool isVariable(ValueType V) {
+      return V==GlobalVT || V==InternalVT || V==ArgumentVT || V==LocalVT;
+    }
+
+    bool isConstValue(ValueType V) {
+      return V==ConstVT || V==ConstExprVT;
+    }
+
+    virtual const char *getPassName() const { return "MSIL backend"; }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<LoopInfo>();
+      AU.setPreservesAll();
+    }
+
+    bool runOnFunction(Function &F);
+
+    virtual bool doInitialization(Module &M);
+
+    virtual bool doFinalization(Module &M);
+
+    void printModuleStartup();
+
+    bool isZeroValue(const Value* V);
+
+    std::string getValueName(const Value* V);
+
+    std::string getLabelName(const Value* V);
+
+    std::string getLabelName(const std::string& Name);
+
+    std::string getConvModopt(unsigned CallingConvID);
+
+    std::string getArrayTypeName(Type::TypeID TyID, const Type* Ty);
+
+    std::string getPrimitiveTypeName(const Type* Ty, bool isSigned);
+
+    std::string getFunctionTypeName(const Type* Ty);
+
+    std::string getPointerTypeName(const Type* Ty);
+
+    std::string getTypeName(const Type* Ty, bool isSigned = false,
+                            bool isNested = false);
+
+    ValueType getValueLocation(const Value* V);
+
+    std::string getTypePostfix(const Type* Ty, bool Expand,
+                               bool isSigned = false);
+
+    void printConvToPtr();
+
+    void printPtrLoad(uint64_t N);
+
+    void printValuePtrLoad(const Value* V);
+
+    void printConstLoad(const Constant* C);
+
+    void printValueLoad(const Value* V);
+
+    void printValueSave(const Value* V);
+
+    void printBinaryInstruction(const char* Name, const Value* Left,
+                                const Value* Right);
+
+    void printSimpleInstruction(const char* Inst, const char* Operand = NULL);
+
+    void printPHICopy(const BasicBlock* Src, const BasicBlock* Dst);
+
+    void printBranchToBlock(const BasicBlock* CurrBB,
+                            const BasicBlock* TrueBB,
+                            const BasicBlock* FalseBB);
+
+    void printBranchInstruction(const BranchInst* Inst);
+
+    void printSelectInstruction(const Value* Cond, const Value* VTrue,
+                                const Value* VFalse);
+
+    void printIndirectLoad(const Value* V);
+
+    void printIndirectSave(const Value* Ptr, const Value* Val);
+
+    void printIndirectSave(const Type* Ty);
+
+    void printCastInstruction(unsigned int Op, const Value* V,
+                              const Type* Ty);
+
+    void printGepInstruction(const Value* V, gep_type_iterator I,
+                             gep_type_iterator E);
+
+    std::string getCallSignature(const FunctionType* Ty,
+                                 const Instruction* Inst,
+                                 std::string Name);
+
+    void printFunctionCall(const Value* FnVal, const Instruction* Inst);
+
+    void printIntrinsicCall(const IntrinsicInst* Inst);
+
+    void printCallInstruction(const Instruction* Inst);
+
+    void printICmpInstruction(unsigned Predicate, const Value* Left,
+                              const Value* Right);
+
+    void printFCmpInstruction(unsigned Predicate, const Value* Left,
+                              const Value* Right);
+
+    void printInvokeInstruction(const InvokeInst* Inst);
+
+    void printSwitchInstruction(const SwitchInst* Inst);
+
+    void printVAArgInstruction(const VAArgInst* Inst);
+
+    void printAllocaInstruction(const AllocaInst* Inst);
+
+    void printInstruction(const Instruction* Inst);
+
+    void printLoop(const Loop* L);
+
+    void printBasicBlock(const BasicBlock* BB);
+    
+    void printLocalVariables(const Function& F);
+
+    void printFunctionBody(const Function& F);
+
+    void printConstantExpr(const ConstantExpr* CE);
+
+    void printStaticInitializerList();
+
+    void printFunction(const Function& F);
+
+    void printDeclarations(const TypeSymbolTable& ST);
+
+    unsigned int getBitWidth(const Type* Ty);
+
+    void printStaticConstant(const Constant* C, uint64_t& Offset);
+
+    void printStaticInitializer(const Constant* C, const std::string& Name);
+
+    void printVariableDefinition(const GlobalVariable* G);
+
+    void printGlobalVariables();
+
+    const char* getLibraryName(const Function* F);
+
+    const char* getLibraryName(const GlobalVariable* GV); 
+    
+    const char* getLibraryForSymbol(const char* Name, bool isFunction,
+                                    unsigned CallingConv);
+
+    void printExternals();
+  };
+}
+
+#endif
+
diff --git a/lib/Target/MSIL/Makefile b/lib/Target/MSIL/Makefile
new file mode 100644
index 0000000..17f7247
--- /dev/null
+++ b/lib/Target/MSIL/Makefile
@@ -0,0 +1,14 @@
+##===- lib/Target/MSIL/Makefile ----------------------------*- Makefile -*-===##
+#
+#		      The LLVM Compiler Infrastructure
+#
+# This file was developed by Roman Samoilov and is distributed under
+# the University of Illinois Open Source License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMMSIL
+include $(LEVEL)/Makefile.common
+
+CompileCommonOpts := $(CompileCommonOpts) -Wno-format
diff --git a/lib/Target/MSIL/README.TXT b/lib/Target/MSIL/README.TXT
new file mode 100644
index 0000000..2b9a569
--- /dev/null
+++ b/lib/Target/MSIL/README.TXT
@@ -0,0 +1,26 @@
+//===---------------------------------------------------------------------===// 
+
+Vector instruction support.
+
+ShuffleVector
+ExtractElement
+InsertElement
+
+//===---------------------------------------------------------------------===// 
+
+Add "OpaqueType" type.
+
+//===---------------------------------------------------------------------===// 
+
+"switch" instruction emulation with CLI "switch" instruction.
+
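+A rough, untested sketch of how MSILWriter::printSwitchInstruction could emit
+the native CLI "switch" instead of the current if/else chain.  It assumes the
+case values are exactly 0..N-1 in case order and that the successor blocks
+have no PHI nodes; otherwise per-case trampolines for the PHI copies are
+still needed:
+
+void MSILWriter::printSwitchInstruction(const SwitchInst* Inst) {
+  // Load the selector; CLI "switch" jumps by zero-based index.
+  printValueLoad(Inst->getCondition());
+  // Build the "(label1, label2, ...)" target list from the case successors.
+  std::string Targets;
+  for (unsigned I = 1, E = Inst->getNumCases(); I!=E; ++I) {
+    if (I!=1) Targets += ", ";
+    Targets += getLabelName(Inst->getSuccessor(I));
+  }
+  printSimpleInstruction("switch",("("+Targets+")").c_str());
+  // Out-of-range selectors fall through to the default block.
+  printBranchToBlock(Inst->getParent(),NULL,Inst->getDefaultDest());
+}
+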
+//===---------------------------------------------------------------------===// 
+
+Write a linker for external functions, because binding an external function
+requires knowing which dynamic library it is located in.
+
+.method static hidebysig pinvokeimpl("msvcrt.dll" cdecl)
+	void free(void*) preservesig {}
+
+
+
diff --git a/lib/Target/Makefile b/lib/Target/Makefile
new file mode 100644
index 0000000..59f50fe
--- /dev/null
+++ b/lib/Target/Makefile
@@ -0,0 +1,20 @@
+#===- lib/Target/Makefile ----------------------------------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file was developed by the LLVM research group and is distributed under
+# the University of Illinois Open Source License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+LIBRARYNAME = LLVMTarget
+BUILD_ARCHIVE = 1
+
+# We include this early so we can access the value of TARGETS_TO_BUILD as the
+# value for PARALLEL_DIRS which must be set before Makefile.rules is included
+include $(LEVEL)/Makefile.config
+
+PARALLEL_DIRS := $(TARGETS_TO_BUILD)
+
+include $(LLVM_SRC_ROOT)/Makefile.rules
diff --git a/lib/Target/Mips/Makefile b/lib/Target/Mips/Makefile
new file mode 100644
index 0000000..6ebffc7
--- /dev/null
+++ b/lib/Target/Mips/Makefile
@@ -0,0 +1,21 @@
+##===- lib/Target/Mips/Makefile ----------------------------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file was developed by Bruno Cardoso Lopes and is distributed under the 
+# University of Illinois Open Source License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMMips
+TARGET = Mips
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = MipsGenRegisterInfo.h.inc MipsGenRegisterNames.inc \
+								MipsGenRegisterInfo.inc MipsGenInstrNames.inc \
+								MipsGenInstrInfo.inc MipsGenAsmWriter.inc \
+								MipsGenDAGISel.inc MipsGenCallingConv.inc \
+								MipsGenSubtarget.inc
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
new file mode 100644
index 0000000..48b08ea
--- /dev/null
+++ b/lib/Target/Mips/Mips.h
@@ -0,0 +1,38 @@
+//===-- Mips.h - Top-level interface for Mips representation ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in 
+// the LLVM Mips back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_MIPS_H
+#define TARGET_MIPS_H
+
+#include <iosfwd>
+
+namespace llvm {
+  class MipsTargetMachine;
+  class FunctionPassManager;
+  class FunctionPass;
+  class MachineCodeEmitter;
+
+  FunctionPass *createMipsCodePrinterPass(std::ostream &OS, 
+                                          MipsTargetMachine &TM);
+  FunctionPass *createMipsISelDag(MipsTargetMachine &TM);
+} // end namespace llvm;
+
+// Defines symbolic names for Mips registers.  This defines a mapping from
+// register name to register number.
+#include "MipsGenRegisterNames.inc"
+
+// Defines symbolic names for the Mips instructions.
+#include "MipsGenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
new file mode 100644
index 0000000..662bc3b
--- /dev/null
+++ b/lib/Target/Mips/Mips.td
@@ -0,0 +1,63 @@
+//===- Mips.td - Describe the Mips Target Machine ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "../Target.td"
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "MipsRegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget features
+//===----------------------------------------------------------------------===//
+
+// TODO: dummy, needed to compile
+def FeatureCIX : SubtargetFeature<"r3000", "isR3000", "true",
+                                  "Enable r3000 extensions">;
+
+//===----------------------------------------------------------------------===//
+// Instruction Description
+//===----------------------------------------------------------------------===//
+
+include "MipsInstrInfo.td"
+
+def MipsInstrInfo : InstrInfo {
+  // Define how we want to layout our target-specific information field.
+  let TSFlagsFields = [];
+  let TSFlagsShifts = [];
+}
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "MipsCallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Mips processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic", []>;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def Mips : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = MipsInstrInfo;
+}
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
new file mode 100644
index 0000000..1df1291
--- /dev/null
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -0,0 +1,433 @@
+//===-- MipsAsmPrinter.cpp - Mips LLVM assembly writer --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format MIPS assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-asm-printer"
+
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "MipsTargetMachine.h"
+#include "MipsMachineFunction.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include <cctype>
+
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+namespace {
+  struct VISIBILITY_HIDDEN MipsAsmPrinter : public AsmPrinter {
+    MipsAsmPrinter(std::ostream &O, MipsTargetMachine &TM, 
+                   const TargetAsmInfo *T): 
+                   AsmPrinter(O, TM, T) {}
+
+    virtual const char *getPassName() const {
+      return "Mips Assembly Printer";
+    }
+
+    enum SetDirectiveFlags {
+      REORDER,      // enables instruction reordering.
+      NOREORDER,    // disables instruction reordering.
+      MACRO,        // enables GAS macros.
+      NOMACRO       // disables GAS macros.
+    };
+
+    void printOperand(const MachineInstr *MI, int opNum);
+    void printMemOperand(const MachineInstr *MI, int opNum, 
+                         const char *Modifier = 0);
+
+    void printHex32(unsigned int Value);
+    void emitFunctionStart(MachineFunction &MF);
+    void emitFunctionEnd();
+    void emitFrameDirective(MachineFunction &MF);
+    void emitMaskDirective(MachineFunction &MF);
+    void emitFMaskDirective();
+    void emitSetDirective(SetDirectiveFlags Flag);
+
+    bool printInstruction(const MachineInstr *MI);  // autogenerated.
+    bool runOnMachineFunction(MachineFunction &F);
+    bool doInitialization(Module &M);
+    bool doFinalization(Module &M);
+  };
+} // end of anonymous namespace
+
+#include "MipsGenAsmWriter.inc"
+
+/// createMipsCodePrinterPass - Returns a pass that prints the MIPS
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description.  This should work
+/// regardless of whether the function is in SSA form.
+FunctionPass *llvm::createMipsCodePrinterPass(std::ostream &o,
+                                              MipsTargetMachine &tm) 
+{
+  return new MipsAsmPrinter(o, tm, tm.getTargetAsmInfo());
+}
+
+/// This pattern will be emitted :
+///   .frame reg1, size, reg2
+/// It describes the stack frame. 
+/// reg1 - stack pointer
+/// size - stack size allocated for the function
+/// reg2 - return address register
+void MipsAsmPrinter::
+emitFrameDirective(MachineFunction &MF)
+{
+  const MRegisterInfo &RI = *TM.getRegisterInfo();
+
+  unsigned stackReg  = RI.getFrameRegister(MF);
+  unsigned returnReg = RI.getRARegister();
+  unsigned stackSize = MF.getFrameInfo()->getStackSize();
+
+
+  O << "\t.frame\t" << "$" << LowercaseString(RI.get(stackReg).Name) 
+                    << "," << stackSize << ","
+                    << "$" << LowercaseString(RI.get(returnReg).Name) 
+                    << "\n";
+}
+
+/// This pattern will be emitted :
+///   .mask bitmask, offset
+/// Tells the assembler (and possibly linker) which registers are saved and where. 
+/// bitmask - mask of all GPRs (little endian) 
+/// offset  - negative value. offset+stackSize should give where on the stack
+///           the first GPR is saved.
+/// TODO: consider callee-saved GPR regs here; do not hardcode register numbers.
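+/// For example (illustrative values): a function that saves both $fp and $ra,
+/// with a 24 byte frame whose top saved-register offset is 20, would emit
+///   .mask 0xC0000000,-4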
+void MipsAsmPrinter::
+emitMaskDirective(MachineFunction &MF)
+{
+  const MRegisterInfo &RI  = *TM.getRegisterInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  bool hasFP  = RI.hasFP(MF);
+  bool saveRA = MF.getFrameInfo()->hasCalls();
+
+  int offset;
+
+  if (!MipsFI->getTopSavedRegOffset())
+    offset = 0;
+  else
+    offset = -(MF.getFrameInfo()->getStackSize()
+               -MipsFI->getTopSavedRegOffset());
+
+  #ifndef NDEBUG
+  DOUT << "<--ASM PRINTER--emitMaskDirective-->" << "\n";
+  DOUT << "StackSize :  " << MF.getFrameInfo()->getStackSize() << "\n";
+  DOUT << "getTopSavedRegOffset() : " << MipsFI->getTopSavedRegOffset() << "\n";
+  DOUT << "offset : " << offset << "\n\n";
+  #endif
+
+  unsigned int bitmask = 0;
+
+  if (hasFP) 
+    bitmask |= (1 << 30);
+  
+  if (saveRA) 
+    bitmask |= (1 << 31);
+
+  O << "\t.mask\t"; 
+  printHex32(bitmask);
+  O << "," << offset << "\n";
+}
+
+/// This pattern will be emitted :
+///   .fmask bitmask, offset
+/// Tells the assembler (and possibly linker) which float registers are saved.
+/// bitmask - mask of all Floating Point registers (little endian)
+/// offset  - negative value. offset+stackSize should give where on the stack
+///           the first Floating Point register is saved.
+/// TODO: implement this, dummy for now
+void MipsAsmPrinter::
+emitFMaskDirective()
+{
+  O << "\t.fmask\t0x00000000,0" << "\n";
+}
+
+/// Print a 32 bit hex number.
+/// TODO: pad with leading zeros using setfill and setw.
+void MipsAsmPrinter::
+printHex32(unsigned int Value) {
+  O << "0x" << std::hex << Value << std::dec;  
+}
+
+/// Emit Set directives.
+void MipsAsmPrinter::
+emitSetDirective(SetDirectiveFlags Flag) {
+  
+  O << "\t.set\t";
+  switch(Flag) {
+      case REORDER:   O << "reorder" << "\n"; break;
+      case NOREORDER: O << "noreorder" << "\n"; break;
+      case MACRO:     O << "macro" << "\n"; break;
+      case NOMACRO:   O << "nomacro" << "\n"; break;
+      default: break;
+  }
+}
+
+/// Emit the directives used by GAS on the start of functions
+void MipsAsmPrinter::
+emitFunctionStart(MachineFunction &MF)
+{
+  // Print out the label for the function.
+  const Function *F = MF.getFunction();
+  SwitchToTextSection(getSectionForFunction(*F).c_str(), F);
+
+  // In Mips GAS, ".align n" means that the low n bits of the address
+  // must be cleared. So, for 4 byte alignment we must emit ".align 2".
+  EmitAlignment(1, F);
+
+  O << "\t.globl\t"  << CurrentFnName << "\n";
+  O << "\t.ent\t"    << CurrentFnName << "\n";
+  O << "\t.type\t"   << CurrentFnName << ", @function\n";
+  O << CurrentFnName << ":\n";
+
+  emitFrameDirective(MF);
+  emitMaskDirective(MF);
+  emitFMaskDirective();
+
+  emitSetDirective(NOREORDER);
+  emitSetDirective(NOMACRO);
+}
+
+/// Emit the directives used by GAS on the end of functions
+void MipsAsmPrinter::
+emitFunctionEnd() {
+  emitSetDirective(MACRO);
+  emitSetDirective(REORDER);
+  O << "\t.end\t" << CurrentFnName << "\n";
+}
+
+/// runOnMachineFunction - This uses the printInstruction()
+/// method to print assembly for each instruction.
+bool MipsAsmPrinter::
+runOnMachineFunction(MachineFunction &MF) 
+{
+  SetupMachineFunction(MF);
+
+  // Print out constants referenced by the function
+  EmitConstantPool(MF.getConstantPool());
+
+  O << "\n\n";
+
+  // What's my mangled name?
+  CurrentFnName = Mang->getValueName(MF.getFunction());
+
+  // Emit the function start directives
+  emitFunctionStart(MF);
+
+  // Print out code for the function.
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+
+    // Print a label for the basic block.
+    if (I != MF.begin()) {
+      printBasicBlockLabel(I, true);
+      O << '\n';
+    }
+
+    for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+         II != E; ++II) {
+      // Print the assembly for the instruction.
+      O << "\t";
+      printInstruction(II);
+      ++EmittedInsts;
+    }
+  }
+
+  // Emit function end directives
+  emitFunctionEnd();
+
+  // We didn't modify anything.
+  return false;
+}
+
+void MipsAsmPrinter::
+printOperand(const MachineInstr *MI, int opNum) 
+{
+  const MachineOperand &MO = MI->getOperand(opNum);
+  const MRegisterInfo  &RI = *TM.getRegisterInfo();
+  bool  closeP=false;
+
+  // %hi and %lo are used by MIPS GAS to split large constants
+  if (MI->getOpcode() == Mips::LUi && !MO.isRegister() 
+      && !MO.isImmediate()) {
+    O << "%hi(";
+    closeP = true;
+  } else if ((MI->getOpcode() == Mips::ADDiu) && !MO.isRegister() 
+             && !MO.isImmediate()) {
+    O << "%lo(";
+    closeP = true;
+  }
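+  // For example, a global address is typically materialized as
+  //   lui   $2, %hi(sym)
+  //   addiu $2, $2, %lo(sym)
+  // (register choice illustrative).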
+ 
+  switch (MO.getType()) 
+  {
+    case MachineOperand::MO_Register:
+      if (MRegisterInfo::isPhysicalRegister(MO.getReg()))
+        O << "$" << LowercaseString (RI.get(MO.getReg()).Name);
+      else
+        O << "$" << MO.getReg();
+      break;
+
+    case MachineOperand::MO_Immediate:
+      if ((MI->getOpcode() == Mips::SLTiu) || (MI->getOpcode() == Mips::ORi) || 
+          (MI->getOpcode() == Mips::LUi)   || (MI->getOpcode() == Mips::ANDi))
+        O << (unsigned short int)MO.getImmedValue();
+      else
+        O << (short int)MO.getImmedValue();
+      break;
+
+    case MachineOperand::MO_MachineBasicBlock:
+      printBasicBlockLabel(MO.getMachineBasicBlock());
+      return;
+
+    case MachineOperand::MO_GlobalAddress:
+      O << Mang->getValueName(MO.getGlobal());
+      break;
+
+    case MachineOperand::MO_ExternalSymbol:
+      O << MO.getSymbolName();
+      break;
+
+    case MachineOperand::MO_ConstantPoolIndex:
+      O << TAI->getPrivateGlobalPrefix() << "CPI"
+        << getFunctionNumber() << "_" << MO.getConstantPoolIndex();
+      break;
+  
+    default:
+      O << "<unknown operand type>"; abort (); break;
+  }
+
+  if (closeP) O << ")";
+}
+
+void MipsAsmPrinter::
+printMemOperand(const MachineInstr *MI, int opNum, const char *Modifier) 
+{
+  // lw/sw $reg, MemOperand
+  // will turn into :
+  // lw/sw $reg, imm($reg)
+  printOperand(MI, opNum);
+  O << "(";
+  printOperand(MI, opNum+1);
+  O << ")";
+}
+
+bool MipsAsmPrinter::
+doInitialization(Module &M) 
+{
+  Mang = new Mangler(M);
+  return false; // success
+}
+
+bool MipsAsmPrinter::
+doFinalization(Module &M) 
+{
+  const TargetData *TD = TM.getTargetData();
+
+  // Print out module-level global variables here.
+  for (Module::const_global_iterator I = M.global_begin(),
+         E = M.global_end(); I != E; ++I)
+
+    // External globals require no code
+    if (I->hasInitializer()) {
+
+      // Check to see if this is a special global 
+      // used by LLVM, if so, emit it.
+      if (EmitSpecialLLVMGlobal(I))
+        continue;
+      
+      O << "\n\n";
+      std::string name = Mang->getValueName(I);
+      Constant *C      = I->getInitializer();
+      unsigned Size    = TD->getTypeSize(C->getType());
+      unsigned Align   = TD->getPrefTypeAlignment(C->getType());
+
+      if (C->isNullValue() && (I->hasLinkOnceLinkage() || 
+        I->hasInternalLinkage() || I->hasWeakLinkage() 
+        /* FIXME: Verify correct */)) {
+
+        SwitchToDataSection(".data", I);
+        if (I->hasInternalLinkage())
+          O << "\t.local " << name << "\n";
+
+        O << "\t.comm " << name << "," 
+          << TD->getTypeSize(C->getType()) 
+          << "," << Align << "\n";
+
+      } else {
+
+        switch (I->getLinkage()) 
+        {
+          case GlobalValue::LinkOnceLinkage:
+          case GlobalValue::WeakLinkage:   
+            // FIXME: Verify correct for weak.
+            // Nonnull linkonce -> weak
+            O << "\t.weak " << name << "\n";
+            SwitchToDataSection("", I);
+            O << "\t.section\t\".llvm.linkonce.d." << name
+                          << "\",\"aw\",@progbits\n";
+            break;
+          case GlobalValue::AppendingLinkage:
+            // FIXME: appending linkage variables 
+            // should go into a section of their name or 
+            // something.  For now, just emit them as external.
+          case GlobalValue::ExternalLinkage:
+            // If external or appending, declare as a global symbol
+            O << "\t.globl " << name << "\n";
+          case GlobalValue::InternalLinkage:
+            if (C->isNullValue())
+              SwitchToDataSection(".bss", I);
+            else
+              SwitchToDataSection(".data", I);
+            break;
+          case GlobalValue::GhostLinkage:
+            cerr << "Should not have any "
+                 << "unmaterialized functions!\n";
+            abort();
+          case GlobalValue::DLLImportLinkage:
+            cerr << "DLLImport linkage is "
+                 << "not supported by this target!\n";
+            abort();
+          case GlobalValue::DLLExportLinkage:
+            cerr << "DLLExport linkage is "
+                 << "not supported by this target!\n";
+            abort();
+          default:
+            assert(0 && "Unknown linkage type!");          
+        }
+        O << "\t.align " << Align << "\n";
+        O << "\t.type " << name << ",@object\n";
+        O << "\t.size " << name << "," << Size << "\n";
+        O << name << ":\n";
+        EmitGlobalConstant(C);
+      }
+    }
+
+  AsmPrinter::doFinalization(M);
+  return false; // success
+}
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
new file mode 100644
index 0000000..23ef850
--- /dev/null
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -0,0 +1,39 @@
+//===- MipsCallingConv.td - Calling Conventions for Mips --------*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for the Mips architecture.
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>: 
+  CCIf<!strconcat("State.getTarget().getSubtarget<MipsSubtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Mips Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+def RetCC_Mips : CallingConv<[
+  // i32 are returned in registers V0, V1
+  CCIfType<[i32], CCAssignToReg<[V0, V1]>>
+]>;
+
+
+//===----------------------------------------------------------------------===//
+// Mips Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+def CC_Mips : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The first 4 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[A0, A1, A2, A3]>>,
+
+  // Integer values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32], CCAssignToStack<4, 4>>
+]>;
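+// For example, under CC_Mips the first four i32 arguments land in A0..A3, and
+// any further i32 argument is given a 4 byte, 4 byte aligned stack slot
+// (illustrative summary of the rules above).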
+
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
new file mode 100644
index 0000000..d6e3830
--- /dev/null
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -0,0 +1,272 @@
+//===-- MipsISelDAGToDAG.cpp - A dag to dag inst selector for Mips --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MIPS target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-isel"
+
+#include "Mips.h"
+#include "MipsISelLowering.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include <queue>
+#include <set>
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MipsDAGToDAGISel - MIPS specific code to select MIPS machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+namespace {
+
+class VISIBILITY_HIDDEN MipsDAGToDAGISel : public SelectionDAGISel {
+
+  /// TM - Keep a reference to MipsTargetMachine.
+  MipsTargetMachine &TM;
+
+  /// MipsLowering - This object fully describes how to lower LLVM code to an
+  /// Mips-specific SelectionDAG.
+  MipsTargetLowering MipsLowering;
+
+  /// Subtarget - Keep a pointer to the MipsSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  //TODO: add initialization on constructor
+  //const MipsSubtarget *Subtarget;
+ 
+public:
+  MipsDAGToDAGISel(MipsTargetMachine &tm) : 
+        SelectionDAGISel(MipsLowering),
+        TM(tm), MipsLowering(*TM.getTargetLowering()) {}
+  
+  virtual void InstructionSelectBasicBlock(SelectionDAG &SD);
+
+  // Pass Name
+  virtual const char *getPassName() const {
+    return "MIPS DAG->DAG Pattern Instruction Selection";
+  } 
+  
+
+private:  
+  // Include the pieces autogenerated from the target description.
+  #include "MipsGenDAGISel.inc"
+
+  SDNode *Select(SDOperand N);
+
+  // Complex Pattern.
+  bool SelectAddr(SDOperand Op, SDOperand N, 
+                  SDOperand &Base, SDOperand &Offset);
+
+
+  // getI32Imm - Return a target constant with the specified
+  // value, of type i32.
+  inline SDOperand getI32Imm(unsigned Imm) {
+    return CurDAG->getTargetConstant(Imm, MVT::i32);
+  }
+
+
+  #ifndef NDEBUG
+  unsigned Indent;
+  #endif
+};
+
+}
+
+/// InstructionSelectBasicBlock - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void MipsDAGToDAGISel::
+InstructionSelectBasicBlock(SelectionDAG &SD) 
+{
+  DEBUG(BB->dump());
+  // Codegen the basic block.
+  #ifndef NDEBUG
+  DOUT << "===== Instruction selection begins:\n";
+  Indent = 0;
+  #endif
+
+  // Select target instructions for the DAG.
+  SD.setRoot(SelectRoot(SD.getRoot()));
+
+  #ifndef NDEBUG
+  DOUT << "===== Instruction selection ends:\n";
+  #endif
+
+  SD.RemoveDeadNodes();
+  
+  // Emit machine code to BB. 
+  ScheduleAndEmitDAG(SD);
+}
+
+/// ComplexPattern used in MipsInstrInfo.td
+/// Used on Mips load/store instructions
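+/// For example (illustrative): for a bare frame index the code below sets
+/// Base = TargetFrameIndex(FI) and Offset = 0; for (add x, imm16) it sets
+/// Base = x (or the TargetFrameIndex if x is one) and Offset = imm16.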
+bool MipsDAGToDAGISel::
+SelectAddr(SDOperand Op, SDOperand Addr, SDOperand &Offset, SDOperand &Base)
+{
+  // if Address is FI, get the TargetFrameIndex.
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base   = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+    
+  // TargetExternalSymbol and TargetGlobalAddress are
+  // lowered and their addresses go into registers, so
+  // they should not be touched here.
+  if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+       Addr.getOpcode() == ISD::TargetGlobalAddress))
+    return false;
+  
+  // Operand is a result of an ADD.
+  if (Addr.getOpcode() == ISD::ADD) 
+  {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) 
+    {
+      if (Predicate_immSExt16(CN)) 
+      {
+        // If the first operand is a FI, get the TargetFI Node
+        if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
+                                    (Addr.getOperand(0))) {
+          Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+        } else {
+          Base = Addr.getOperand(0);
+        }
+
+        Offset = CurDAG->getTargetConstant(CN->getValue(), MVT::i32);
+        return true;
+      }
+    }
+  }
+
+  Base   = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+/// Select instructions that were not custom lowered. Used for
+/// expanded, promoted and normal instructions.
+SDNode* MipsDAGToDAGISel::
+Select(SDOperand N) 
+{
+  SDNode *Node = N.Val;
+  unsigned Opcode = Node->getOpcode();
+
+  // Dump information about the Node being selected
+  #ifndef NDEBUG
+  DOUT << std::string(Indent, ' ') << "Selecting: ";
+  DEBUG(Node->dump(CurDAG));
+  DOUT << "\n";
+  Indent += 2;
+  #endif
+
+  // If we have a custom node, we already have selected!
+  if (Opcode >= ISD::BUILTIN_OP_END && Opcode < MipsISD::FIRST_NUMBER) {
+    #ifndef NDEBUG
+    DOUT << std::string(Indent-2, ' ') << "== ";
+    DEBUG(Node->dump(CurDAG));
+    DOUT << "\n";
+    Indent -= 2;
+    #endif
+    return NULL;
+  }
+
+  ///
+  // Instruction selection not handled by the custom lowering code or by the
+  // auto-generated tablegen selection should be handled here.
+  /// 
+  switch(Opcode) {
+
+    default: break;
+
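+    // MULT/MULTu and DIV/DIVu write their results to the HI/LO register pair;
+    // the MFHI/MFLO nodes created below move the needed half (HI for mulh and
+    // rem, LO for div) into a general purpose register.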
+    /// Special Mul operations
+    case ISD::MULHS:
+    case ISD::MULHU: {
+      SDOperand MulOp1 = Node->getOperand(0);
+      SDOperand MulOp2 = Node->getOperand(1);
+      AddToISelQueue(MulOp1);
+      AddToISelQueue(MulOp2);
+
+      unsigned MulOp  = (Opcode == ISD::MULHU ? Mips::MULTu : Mips::MULT);
+      SDNode *MulNode = CurDAG->getTargetNode(MulOp, MVT::Flag, MulOp1, MulOp2);
+
+      SDOperand MFInFlag = SDOperand(MulNode, 0);
+      return CurDAG->getTargetNode(Mips::MFHI, MVT::i32, MFInFlag);
+    }
+
+    /// Div operations
+    case ISD::SDIV: 
+    case ISD::UDIV: {
+      SDOperand DivOp1 = Node->getOperand(0);
+      SDOperand DivOp2 = Node->getOperand(1);
+      AddToISelQueue(DivOp1);
+      AddToISelQueue(DivOp2);
+
+      unsigned DivOp  = (Opcode == ISD::SDIV ? Mips::DIV : Mips::DIVu);
+      SDNode *DivNode = CurDAG->getTargetNode(DivOp, MVT::Flag, DivOp1, DivOp2);
+
+      SDOperand MFInFlag = SDOperand(DivNode, 0);
+      return CurDAG->getTargetNode(Mips::MFLO, MVT::i32, MFInFlag);
+    }
+
+    /// Rem operations
+    case ISD::SREM: 
+    case ISD::UREM: {
+      SDOperand RemOp1 = Node->getOperand(0);
+      SDOperand RemOp2 = Node->getOperand(1);
+      AddToISelQueue(RemOp1);
+      AddToISelQueue(RemOp2);
+      
+      unsigned RemOp  = (Opcode == ISD::SREM ? Mips::DIV : Mips::DIVu);
+      SDNode *RemNode = CurDAG->getTargetNode(RemOp, MVT::Flag, RemOp1, RemOp2);
+
+      SDOperand MFInFlag = SDOperand(RemNode, 0);
+      return CurDAG->getTargetNode(Mips::MFHI, MVT::i32, MFInFlag);
+    }
+  }
+
+  // Select the default instruction
+  SDNode *ResNode = SelectCode(N);
+
+  #ifndef NDEBUG
+  DOUT << std::string(Indent-2, ' ') << "=> ";
+  if (ResNode == NULL || ResNode == N.Val)
+    DEBUG(N.Val->dump(CurDAG));
+  else
+    DEBUG(ResNode->dump(CurDAG));
+  DOUT << "\n";
+  Indent -= 2;
+  #endif
+
+  return ResNode;
+}
+
+/// createMipsISelDag - This pass converts a legalized DAG into a 
+/// MIPS-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createMipsISelDag(MipsTargetMachine &TM) {
+  return new MipsDAGToDAGISel(TM);
+}
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
new file mode 100644
index 0000000..790cdaf
--- /dev/null
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -0,0 +1,557 @@
+//===-- MipsISelLowering.cpp - Mips DAG Lowering Implementation -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Mips uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-lower"
+
+#include "MipsISelLowering.h"
+#include "MipsTargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CallingConv.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/Debug.h"
+#include <queue>
+#include <set>
+
+using namespace llvm;
+
+const char *MipsTargetLowering::
+getTargetNodeName(unsigned Opcode) const 
+{
+  switch (Opcode) 
+  {
+    case MipsISD::JmpLink   : return "MipsISD::JmpLink";
+    case MipsISD::Hi        : return "MipsISD::Hi";
+    case MipsISD::Lo        : return "MipsISD::Lo";
+    case MipsISD::Ret       : return "MipsISD::Ret";
+    default                 : return NULL;
+  }
+}
+
+MipsTargetLowering::
+MipsTargetLowering(MipsTargetMachine &TM): TargetLowering(TM) 
+{
+  // Mips does not have i1 type, so use i32 for
+  // setcc operation results (slt, sgt, ...). 
+  setSetCCResultType(MVT::i32);
+  setSetCCResultContents(ZeroOrOneSetCCResult);
+
+  // Set up the register classes
+  addRegisterClass(MVT::i32, Mips::CPURegsRegisterClass);
+
+  // Custom
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+  setOperationAction(ISD::RET, MVT::Other, Custom);
+
+  // Load extended operations for i1 types must be promoted 
+  setLoadXAction(ISD::EXTLOAD,  MVT::i1,  Promote);
+  setLoadXAction(ISD::ZEXTLOAD, MVT::i1,  Promote);
+  setLoadXAction(ISD::SEXTLOAD, MVT::i1,  Promote);
+
+  // Store operations for i1 types must be promoted
+  setStoreXAction(MVT::i1, Promote);
+
+  // Mips does not have these NodeTypes below.
+  setOperationAction(ISD::BR_JT,     MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,     MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+  setOperationAction(ISD::SELECT, MVT::i32, Expand);
+
+  // Intrinsics not supported by Mips.
+  setOperationAction(ISD::MEMMOVE, MVT::Other, Expand);
+  setOperationAction(ISD::MEMSET, MVT::Other, Expand);
+  setOperationAction(ISD::MEMCPY, MVT::Other, Expand);
+
+  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ , MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ , MVT::i32, Expand);
+  setOperationAction(ISD::ROTL , MVT::i32, Expand);
+  setOperationAction(ISD::ROTR , MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+  // We don't have line number support yet.
+  setOperationAction(ISD::LOCATION, MVT::Other, Expand);
+  setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+  setOperationAction(ISD::LABEL, MVT::Other, Expand);
+
+  // Use the default for now
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+
+  setStackPointerRegisterToSaveRestore(Mips::SP);
+  computeRegisterProperties();
+}
+
+
+SDOperand MipsTargetLowering::
+LowerOperation(SDOperand Op, SelectionDAG &DAG) 
+{
+  switch (Op.getOpcode()) 
+  {
+    case ISD::CALL:             return LowerCALL(Op, DAG);
+    case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG);
+    case ISD::RET:              return LowerRET(Op, DAG);
+    case ISD::GlobalAddress:    return LowerGlobalAddress(Op, DAG);
+    case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+    case ISD::RETURNADDR:       return LowerRETURNADDR(Op, DAG);
+  }
+  return SDOperand();
+}
+
+//===----------------------------------------------------------------------===//
+//  Lower helper functions
+//===----------------------------------------------------------------------===//
+
+// AddLiveIn - This helper function adds the specified physical register to the
+// MachineFunction as a live in value.  It also creates a corresponding
+// virtual register for it.
+static unsigned
+AddLiveIn(MachineFunction &MF, unsigned PReg, TargetRegisterClass *RC) 
+{
+  assert(RC->contains(PReg) && "Not the correct regclass!");
+  unsigned VReg = MF.getSSARegMap()->createVirtualRegister(RC);
+  MF.addLiveIn(PReg, VReg);
+  return VReg;
+}
+
+// Set up a frame object for the return address.
+//SDOperand MipsTargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
+//  if (ReturnAddrIndex == 0) {
+//    MachineFunction &MF = DAG.getMachineFunction();
+//    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(4, 0);
+//  }
+//
+//  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+//}
+
+
+//===----------------------------------------------------------------------===//
+//  Misc Lower Operation implementation
+//===----------------------------------------------------------------------===//
+SDOperand MipsTargetLowering::
+LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG) 
+{
+  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+
+  SDOperand GA = DAG.getTargetGlobalAddress(GV, MVT::i32);
+  SDOperand Hi = DAG.getNode(MipsISD::Hi, MVT::i32, GA);
+  SDOperand Lo = DAG.getNode(MipsISD::Lo, MVT::i32, GA);
+
+  return DAG.getNode(ISD::ADD, MVT::i32, Lo, Hi);
+}
+
+SDOperand MipsTargetLowering::
+LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG)
+{
+  assert(0 && "TLS not implemented for MIPS.");
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//
+//  The calling convention lowering operations are performed in this order:
+//      LowerCALL (virt regs --> phys regs, virt regs --> stack) 
+//      LowerFORMAL_ARGUMENTS (phys --> virt regs, stack --> virt regs)
+//      LowerRET (virt regs --> phys regs)
+//      LowerCALL (phys regs --> virt regs)
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+//                  CALL Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// Mips custom CALL implementation
+SDOperand MipsTargetLowering::
+LowerCALL(SDOperand Op, SelectionDAG &DAG)
+{
+  unsigned CallingConv= cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+
+  // For now, only CallingConv::C is implemented
+  switch (CallingConv) 
+  {
+    default:
+      assert(0 && "Unsupported calling convention");
+    case CallingConv::Fast:
+    case CallingConv::C:
+      return LowerCCCCallTo(Op, DAG, CallingConv);
+  }
+}
+
+/// LowerCCCCallTo - function arguments are copied from virtual
+/// regs to (physical regs)/(stack frame), and CALLSEQ_START and
+/// CALLSEQ_END are emitted.
+/// TODO: isVarArg, isTailCall, sret, GOT, linkage types.
+SDOperand MipsTargetLowering::
+LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG, unsigned CC) 
+{
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned StackReg   = MF.getTarget().getRegisterInfo()->getFrameRegister(MF);
+
+  SDOperand Chain  = Op.getOperand(0);
+  SDOperand Callee = Op.getOperand(4);
+  bool isVarArg    = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+
+  // To meet the ABI, Mips must always allocate 16 bytes on
+  // the stack (even if fewer than 4 words of arguments are passed)
+  int VTsize = MVT::getSizeInBits(MVT::i32)/8;
+  MFI->CreateFixedObject(VTsize, -(VTsize*3));
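+  // (This is the o32 "home area": the caller reserves four words for $a0-$a3
+  //  even when fewer argument registers are actually used.)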
+
+  CCInfo.AnalyzeCallOperands(Op.Val, CC_Mips);
+  
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, 
+                                 getPointerTy()));
+
+  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+  SmallVector<SDOperand, 8> MemOpChains;
+
+  SDOperand StackPtr;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    // Arguments start after the 5 first operands of ISD::CALL
+    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+    
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+      default: assert(0 && "Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::AExt:
+        Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+        break;
+    }
+    
+    // Arguments that can be passed in a register
+    // must be kept in the RegsToPass vector.
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+
+      assert(VA.isMemLoc());
+
+      if (StackPtr.Val == 0)
+        StackPtr = DAG.getRegister(StackReg, getPointerTy());
+     
+      // Create the frame index object for this outgoing parameter.
+      // This guarantees that the space reserved for it will not be
+      // overwritten when the local area is allocated.
+      int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
+                                      -(16 + VA.getLocMemOffset()) );
+
+      SDOperand PtrOff = DAG.getFrameIndex(FI,getPointerTy());
+
+      // Emit an ISD::STORE which stores the
+      // parameter value to a stack location.
+      MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
+    }
+  }
+
+  // Transform all store nodes into one single node because
+  // all store nodes are independent of each other.
+  if (!MemOpChains.empty())     
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, 
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token 
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag is necessary since all emitted instructions must be
+  // stuck together.
+  SDOperand InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, 
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct 
+  // call is) turn it into a TargetGlobalAddress node so that legalize 
+  // doesn't hack it.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+  } else 
+  if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+
+  // MipsJmpLink = #chain, #target_address, #opt_in_flags...
+  //             = Chain, Callee, Reg#1, Reg#2, ...  
+  //
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDOperand, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are 
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.Val)
+    Ops.push_back(InFlag);
+
+  Chain  = DAG.getNode(MipsISD::JmpLink, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  Ops.clear();
+  Ops.push_back(Chain);
+  Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
+  Ops.push_back(InFlag);
+  Chain  = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
+}
+
+/// LowerCallResult - Lower the result values of an ISD::CALL into the
+/// appropriate copies out of appropriate physical registers.  This assumes that
+/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// being lowered. Returns a SDNode with the same number of values as the 
+/// ISD::CALL.
+SDNode *MipsTargetLowering::
+LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall, 
+        unsigned CallingConv, SelectionDAG &DAG) {
+  
+  bool isVarArg = cast<ConstantSDNode>(TheCall->getOperand(2))->getValue() != 0;
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
+
+  CCInfo.AnalyzeCallResult(TheCall, RetCC_Mips);
+  SmallVector<SDOperand, 8> ResultVals;
+
+  // Returns void
+  if (!RVLocs.size())
+    return Chain.Val;
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    Chain = DAG.getCopyFromReg(Chain, RVLocs[i].getLocReg(),
+                                 RVLocs[i].getValVT(), InFlag).getValue(1);
+    InFlag = Chain.getValue(2);
+    ResultVals.push_back(Chain.getValue(0));
+  }
+  
+  // Merge everything together with a MERGE_VALUES node.
+  ResultVals.push_back(Chain);
+  return DAG.getNode(ISD::MERGE_VALUES, TheCall->getVTList(),
+                       &ResultVals[0], ResultVals.size()).Val;
+}
+
+//===----------------------------------------------------------------------===//
+//             FORMAL_ARGUMENTS Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// Mips custom FORMAL_ARGUMENTS implementation
+SDOperand MipsTargetLowering::
+LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG) 
+{
+  unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+  switch(CC) 
+  {
+    default:
+      assert(0 && "Unsupported calling convention");
+    case CallingConv::C:
+      return LowerCCCArguments(Op, DAG);
+  }
+}
+
+/// LowerCCCArguments - transform physical registers into
+/// virtual registers and generate load operations for
+/// arguments placed on the stack.
+/// TODO: isVarArg, sret
+SDOperand MipsTargetLowering::
+LowerCCCArguments(SDOperand Op, SelectionDAG &DAG) 
+{
+  SDOperand Root        = Op.getOperand(0);
+  MachineFunction &MF   = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+  unsigned CC   = DAG.getMachineFunction().getFunction()->getCallingConv();
+
+  unsigned StackReg = MF.getTarget().getRegisterInfo()->getFrameRegister(MF);
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+
+  CCInfo.AnalyzeFormalArguments(Op.Val, CC_Mips);
+  SmallVector<SDOperand, 8> ArgValues;
+  SDOperand StackPtr;
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+
+    CCValAssign &VA = ArgLocs[i];
+
+    // Arguments stored on registers
+    if (VA.isRegLoc()) {
+      MVT::ValueType RegVT = VA.getLocVT();
+      TargetRegisterClass *RC;
+            
+      if (RegVT == MVT::i32)
+        RC = Mips::CPURegsRegisterClass;
+      else
+        assert(0 && "support only Mips::CPURegsRegisterClass");
+      
+
+      // Transform the arguments stored on 
+      // physical registers into virtual ones
+      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
+      SDOperand ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);
+      
+      // If this is an 8 or 16-bit value, it is really passed promoted 
+      // to 32 bits.  Insert an assert[sz]ext to capture this, then 
+      // truncate to the right size.
+      if (VA.getLocInfo() == CCValAssign::SExt)
+        ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::ZExt)
+        ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      
+      if (VA.getLocInfo() != CCValAssign::Full)
+        ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);
+
+      ArgValues.push_back(ArgValue);
+
+      // To meet the ABI, when VARARGS are passed in registers, the registers'
+      // contents must be written to their always-reserved home locations
+      // on the stack.
+      if (isVarArg) {
+
+        if (StackPtr.Val == 0)
+          StackPtr = DAG.getRegister(StackReg, getPointerTy());
+     
+        // Create the frame index object for this incoming parameter
+        // The first 16 bytes are reserved.
+        int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
+                                        i*4);
+        SDOperand PtrOff = DAG.getFrameIndex(FI, getPointerTy());
+      
+        // Emit an ISD::STORE which stores the
+        // parameter value to a stack location.
+        ArgValues.push_back(DAG.getStore(Root, ArgValue, PtrOff, NULL, 0));
+      }
+
+    } else {
+      // sanity check
+      assert(VA.isMemLoc());
+      
+      // Create the frame index object for this incoming parameter...
+      int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
+                                      (16 + VA.getLocMemOffset()));
+
+      // Create load nodes to retrieve arguments from the stack
+      SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
+      ArgValues.push_back(DAG.getLoad(VA.getValVT(), Root, FIN, NULL, 0));
+    }
+  }
+  ArgValues.push_back(Root);
+
+  // Return the new list of results.
+  return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
+                     &ArgValues[0], ArgValues.size()).getValue(Op.ResNo);
+}
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+SDOperand MipsTargetLowering::
+LowerRET(SDOperand Op, SelectionDAG &DAG)
+{
+  // CCValAssign - represent the assignment of
+  // the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+  unsigned CC   = DAG.getMachineFunction().getFunction()->getCallingConv();
+  bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
+
+  // Analyze return values of ISD::RET
+  CCInfo.AnalyzeReturn(Op.Val, RetCC_Mips);
+
+  // If this is the first return lowered for this function, add 
+  // the regs to the liveout set for the function.
+  if (DAG.getMachineFunction().liveout_empty()) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i)
+      if (RVLocs[i].isRegLoc())
+        DAG.getMachineFunction().addLiveOut(RVLocs[i].getLocReg());
+  }
+
+  // The chain is always operand #0
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand Flag;
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    // ISD::RET => ret chain, (regnum1,val1), ...
+    // So i*2+1 index only the regnums
+    Chain = DAG.getCopyToReg(Chain, VA.getLocReg(), 
+                                 Op.getOperand(i*2+1), Flag);
+
+    // Guarantee that all emitted copies are
+    // stuck together, so nothing is scheduled in between.
+    Flag = Chain.getValue(1);
+  }
+
+  // Return on Mips is always a "jr $ra"
+  if (Flag.Val)
+    return DAG.getNode(MipsISD::Ret, MVT::Other, 
+                           Chain, DAG.getRegister(Mips::RA, MVT::i32), Flag);
+  else // Return Void
+    return DAG.getNode(MipsISD::Ret, MVT::Other, 
+                           Chain, DAG.getRegister(Mips::RA, MVT::i32));
+}
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
new file mode 100644
index 0000000..0199175
--- /dev/null
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -0,0 +1,84 @@
+//===-- MipsISelLowering.h - Mips DAG Lowering Interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Mips uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MipsISELLOWERING_H
+#define MipsISELLOWERING_H
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include "Mips.h"
+#include "MipsSubtarget.h"
+
+namespace llvm {
+  namespace MipsISD {
+    enum NodeType {
+      // Start the numbering from where ISD NodeType finishes.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END+Mips::INSTRUCTION_LIST_END,
+
+      // Jump and link (call)
+      JmpLink,
+
+      // Get the High 16 bits from a 32-bit immediate
+      // No relation with the Mips HI register
+      Hi, 
+
+      // Get the Low 16 bits from a 32-bit immediate
+      // No relation with the Mips LO register
+      Lo, 
+
+      // Return 
+      Ret
+    };
+  }
+
+  //===--------------------------------------------------------------------===//
+  // TargetLowering Implementation
+  //===--------------------------------------------------------------------===//
+  class MipsTargetLowering : public TargetLowering 
+  {
+    // FrameIndex for return slot.
+    int ReturnAddrIndex;
+
+    // const MipsSubtarget &MipsSubTarget;
+  public:
+
+    MipsTargetLowering(MipsTargetMachine &TM);
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
+
+    /// getTargetNodeName - This method returns the name of a target specific 
+    //  DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+  private:
+    // Lower Operand helpers
+    SDOperand LowerCCCArguments(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG, unsigned CC);
+    SDNode *LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode*TheCall,
+                            unsigned CallingConv, SelectionDAG &DAG);
+    SDOperand getReturnAddressFrameIndex(SelectionDAG &DAG);
+
+    // Lower Operand specifics
+    SDOperand LowerRET(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerCALL(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerRETURNADDR(SDOperand Op, SelectionDAG &DAG);
+
+  };
+}
+
+#endif // MipsISELLOWERING_H
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
new file mode 100644
index 0000000..b88fa90
--- /dev/null
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -0,0 +1,96 @@
+//===- MipsInstrFormats.td - Mips Instruction Formats -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe MIPS instructions format
+//
+//  All the possible Mips fields are:
+//
+//  opcode  - operation code.
+//  rs      - src reg.
+//  rt      - dst reg (in a 2-register instr) or src reg (in a 3-register instr).
+//  rd      - dst reg, only used in 3-register instrs.
+//  shamt   - only used on shift instructions, contains the shift amount.
+//  funct   - combined with the opcode field, gives the operation code.
+//
+//===----------------------------------------------------------------------===//
+
+// Generic Mips Format
+class MipsInst<dag ops, string asmstr, list<dag> pattern>: 
+      Instruction 
+{
+  field bits<32> Inst;
+
+  let Namespace = "Mips";
+
+  bits<6> opcode;
+
+  // Top 6 bits are the 'opcode' field
+  let Inst{31-26} = opcode;   
+  
+  dag OperandList = ops;
+  let AsmString   = asmstr;
+  let Pattern     = pattern;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Format R instruction class in Mips : <|opcode|rs|rt|rd|shamt|funct|>
+//===----------------------------------------------------------------------===//
+
+class FR<bits<6> op, bits<6> _funct, dag ops, string asmstr, list<dag> pattern>:
+      MipsInst<ops, asmstr, pattern> 
+{
+  bits<5>  rd;
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<5>  shamt;
+  bits<6>  funct;
+
+  let opcode = op;
+  let funct  = _funct;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt; 
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = shamt;
+  let Inst{5-0}   = funct;
+}
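+// For reference (illustrative encoding): "addu $1, $2, $3" is an FR
+// instruction with opcode=0 and funct=0x21; rs=2, rt=3, rd=1, shamt=0
+// give Inst = 0x00430821.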
+
+//===----------------------------------------------------------------------===//
+// Format I instruction class in Mips : <|opcode|rs|rt|immediate|>
+//===----------------------------------------------------------------------===//
+
+class FI<bits<6> op, dag ops, string asmstr, list<dag> pattern>: 
+      MipsInst<ops, asmstr, pattern> 
+{
+  bits<5>  rt;
+  bits<5>  rs;
+  bits<16> imm16;
+
+  let opcode = op;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt; 
+  let Inst{15-0}  = imm16;
+}
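+// For reference (illustrative encoding): "addiu $1, $2, 4" is an FI
+// instruction with opcode=0x09, rs=2, rt=1, imm16=4, giving Inst = 0x24410004.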
+
+//===----------------------------------------------------------------------===//
+// Format J instruction class in Mips : <|opcode|address|>
+//===----------------------------------------------------------------------===//
+
+class FJ<bits<6> op, dag ops, string asmstr, list<dag> pattern>: 
+      MipsInst<ops, asmstr, pattern> 
+{
+  bits<26> addr;
+
+  let opcode = op;
+  
+  let Inst{25-0} = addr;
+}
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
new file mode 100644
index 0000000..8084030
--- /dev/null
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -0,0 +1,114 @@
+//===- MipsInstrInfo.cpp - Mips Instruction Information ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "MipsGenInstrInfo.inc"
+
+using namespace llvm;
+
+// TODO: Add the subtarget support on this constructor
+MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm)
+  : TargetInstrInfo(MipsInsts, sizeof(MipsInsts)/sizeof(MipsInsts[0])),
+    TM(tm), RI(*this) {}
+
+static bool isZeroImm(const MachineOperand &op) {
+  return op.isImmediate() && op.getImmedValue() == 0;
+}
+
+/// Return true if the instruction is a register to register move and
+/// leave the source and dest operands in the passed parameters.
+bool MipsInstrInfo::
+isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg) const 
+{
+  //  addu  $dst, $src, $zero || addu  $dst, $zero, $src
+  //  or    $dst, $src, $zero || or    $dst, $zero, $src
+  if ((MI.getOpcode() == Mips::ADDu) || (MI.getOpcode() == Mips::OR))
+  {
+    if (MI.getOperand(1).getReg() == Mips::ZERO) {
+      DstReg = MI.getOperand(0).getReg();
+      SrcReg = MI.getOperand(2).getReg();
+      return true;
+    } else if (MI.getOperand(2).getReg() == Mips::ZERO) {
+      DstReg = MI.getOperand(0).getReg();
+      SrcReg = MI.getOperand(1).getReg();
+      return true;
+    }
+  }
+
+  //  addiu $dst, $src, 0
+  if (MI.getOpcode() == Mips::ADDiu) 
+  {
+    if ((MI.getOperand(1).isRegister()) && (isZeroImm(MI.getOperand(2)))) {
+      DstReg = MI.getOperand(0).getReg();
+      SrcReg = MI.getOperand(1).getReg();
+      return true;
+    }
+  }
+  return false;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned MipsInstrInfo::
+isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const 
+{
+  // TODO: add lhu, lbu ???
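+  // As used below, LW is "lw $dst, imm($base)": operand 0 is $dst, operand 1
+  // the immediate offset, and operand 2 the base address (a frame index here).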
+  if (MI->getOpcode() == Mips::LW) 
+  {
+    if ((MI->getOperand(2).isFrameIndex()) && // is a stack slot
+        (MI->getOperand(1).isImmediate()) &&  // the imm is zero
+        (isZeroImm(MI->getOperand(1)))) 
+    {
+      FrameIndex = MI->getOperand(2).getFrameIndex();
+      return MI->getOperand(0).getReg();
+    }
+  }
+
+  return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned MipsInstrInfo::
+isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const 
+{
+  // TODO: add sb, sh ???
+  if (MI->getOpcode() == Mips::SW) {
+    if ((MI->getOperand(0).isFrameIndex()) && // is a stack slot
+        (MI->getOperand(1).isImmediate()) &&  // the imm is zero
+        (isZeroImm(MI->getOperand(1)))) 
+    {
+      FrameIndex = MI->getOperand(0).getFrameIndex();
+      return MI->getOperand(2).getReg();
+    }
+  }
+  return 0;
+}
+
+unsigned MipsInstrInfo::
+InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, 
+             MachineBasicBlock *FBB, const std::vector<MachineOperand> &Cond)
+             const
+{
+  // TODO: add Mips::J here.
+  assert(0 && "Cant handle any kind of branches!");
+  return 1;
+}
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
new file mode 100644
index 0000000..356cf3d
--- /dev/null
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -0,0 +1,63 @@
+//===- MipsInstrInfo.h - Mips Instruction Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSINSTRUCTIONINFO_H
+#define MIPSINSTRUCTIONINFO_H
+
+#include "Mips.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+
+class MipsInstrInfo : public TargetInstrInfo 
+{
+  MipsTargetMachine &TM;
+  const MipsRegisterInfo RI;
+public:
+  MipsInstrInfo(MipsTargetMachine &TM);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const MRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// Return true if the instruction is a register to register move and
+  /// leave the source and dest operands in the passed parameters.
+  ///
+  virtual bool isMoveInstr(const MachineInstr &MI,
+                           unsigned &SrcReg, unsigned &DstReg) const;
+  
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  virtual unsigned isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const;
+  
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the stored stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  virtual unsigned isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const;
+  
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB, 
+                                const std::vector<MachineOperand> &Cond) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
new file mode 100644
index 0000000..1f5d152
--- /dev/null
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -0,0 +1,469 @@
+//===- MipsInstrInfo.td - Mips Instruction defs -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "MipsInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Mips profiles and nodes
+//===----------------------------------------------------------------------===//
+
+// Call
+def SDT_MipsJmpLink : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
+def MipsJmpLink     : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink, [SDNPHasChain,
+                             SDNPOutFlag]>;
+
+// Hi and Lo nodes are created to allow easy manipulation of 16-bit pieces 
+// when handling 32-bit immediates. They are used in MipsISelLowering to 
+// lower nodes such as GlobalAddress, ExternalSymbol, ...
+// These two nodes have nothing to do with the Mips registers Hi and Lo.
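+// For illustration only (a sketch; the actual lowering lives in
+// MipsISelLowering): a global address @g is conceptually split into
+//   lui   $reg, %hi(g)          # MipsHi piece
+//   addiu $reg, $reg, %lo(g)    # MipsLo piece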
+def MipsHi : SDNode<"MipsISD::Hi", SDTIntUnaryOp>;
+def MipsLo : SDNode<"MipsISD::Lo", SDTIntUnaryOp>; 
+
+// Return 
+def SDT_MipsRet : SDTypeProfile<0, 1, [SDTCisInt<0>]>; 
+def MipsRet     : SDNode<"MipsISD::Ret", SDT_MipsRet, [SDNPHasChain, 
+                             SDNPOptInFlag]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_MipsCallSeq : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def callseq_start   : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeq, 
+                             [SDNPHasChain, SDNPOutFlag]>;
+def callseq_end     : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeq, 
+                             [SDNPHasChain, SDNPOutFlag]>;
+
+// Instruction operand types
+def brtarget    : Operand<OtherVT>;
+def calltarget  : Operand<i32>;
+def uimm16      : Operand<i32>;
+def simm16      : Operand<i32>;
+def shamt       : Operand<i32>; 
+
+// Address operand
+def mem : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops simm16, CPURegs);
+}
+
+//===----------------------------------------------------------------------===//
+// Mips Patterns and Transformations
+//===----------------------------------------------------------------------===//
+
+// Transformation Function - get the lower 16 bits.
+def LO16 : SDNodeXForm<imm, [{
+  return getI32Imm((unsigned)N->getValue() & 0xFFFF);
+}]>;
+
+// Transformation Function - get the higher 16 bits.
+def HI16 : SDNodeXForm<imm, [{
+  return getI32Imm((unsigned)N->getValue() >> 16);
+}]>;
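+
+// For example (illustrative value): applied to 0x12345678, LO16 yields
+// 0x5678 and HI16 yields 0x1234.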
+
+// Node immediate fits as 16-bit sign extended on target immediate.
+// e.g. addi, andi
+def immSExt16  : PatLeaf<(imm), [{
+  if (N->getValueType(0) == MVT::i32)
+    return (int32_t)N->getValue() == (short)N->getValue();
+  else    
+    return (int64_t)N->getValue() == (short)N->getValue();
+}]>;
+
+// Node immediate fits as 16-bit zero extended on target immediate.
+// The LO16 param means that only the lower 16 bits of the node
+// immediate are used.
+// e.g. addiu, sltiu
+def immZExt16  : PatLeaf<(imm), [{
+  if (N->getValueType(0) == MVT::i32)
+    return (uint32_t)N->getValue() == (unsigned short)N->getValue();
+  else    
+    return (uint64_t)N->getValue() == (unsigned short)N->getValue();
+}], LO16>;
+
+// shamt field must fit in 5 bits.
+def immZExt5 : PatLeaf<(imm), [{
+  return N->getValue() == ((N->getValue()) & 0x1f) ;
+}]>;
+
+// Mips Addressing Mode. The SDNode frameindex can possibly be matched, 
+// since load and store instructions that access the stack use it.
+def addr : ComplexPattern<i32, 2, "SelectAddr", [frameindex], []>;
+
+//===----------------------------------------------------------------------===//
+// Instructions specific format
+//===----------------------------------------------------------------------===//
+
+// Arithmetic 3 register operands
+let isCommutable = 1 in 
+class ArithR< bits<6> op, bits<6> func, string instr_asm, SDNode OpNode>: 
+  FR< op, 
+      func, 
+      (ops CPURegs:$dst, CPURegs:$b, CPURegs:$c), 
+      !strconcat(instr_asm, " $dst, $b, $c"), 
+      [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))] >;
+
+let isCommutable = 1 in 
+class ArithOverflowR< bits<6> op, bits<6> func, string instr_asm>: 
+  FR< op, 
+      func, 
+      (ops CPURegs:$dst, CPURegs:$b, CPURegs:$c), 
+      !strconcat(instr_asm, " $dst, $b, $c"), 
+      []>;
+
+// Arithmetic 2 register operands
+let isCommutable = 1 in
+class ArithI<bits<6> op, string instr_asm, SDNode OpNode, 
+             Operand Od, PatLeaf imm_type> : 
+  FI< op, 
+      (ops CPURegs:$dst, CPURegs:$b, Od:$c), 
+      !strconcat(instr_asm, " $dst, $b, $c"), 
+      [(set CPURegs:$dst, (OpNode CPURegs:$b, imm_type:$c))] >;
+
+// Arithmetic Multiply ADD/SUB
+let rd=0 in
+class MArithR<bits<6> func, string instr_asm> : 
+  FR< 0x1c, 
+      func,
+      (ops CPURegs:$rs, CPURegs:$rt), 
+      !strconcat(instr_asm, " $rs, $rt"), 
+      []>;
+
+//  Logical
+class LogicR<bits<6> func, string instr_asm, SDNode OpNode>:
+  FR< 0x00, 
+      func, 
+      (ops CPURegs:$dst, CPURegs:$b, CPURegs:$c), 
+      !strconcat(instr_asm, " $dst, $b, $c"), 
+      [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))] >;
+
+class LogicI<bits<6> op, string instr_asm, SDNode OpNode>:
+  FI< op,
+      (ops CPURegs:$dst, CPURegs:$b, uimm16:$c),
+      !strconcat(instr_asm, " $dst, $b, $c"),
+      [(set CPURegs:$dst, (OpNode CPURegs:$b, immSExt16:$c))]>;
+
+class LogicNOR<bits<6> op, bits<6> func, string instr_asm>:
+  FR< op, 
+      func, 
+      (ops CPURegs:$dst, CPURegs:$b, CPURegs:$c), 
+      !strconcat(instr_asm, " $dst, $b, $c"), 
+      [(set CPURegs:$dst, (not (or CPURegs:$b, CPURegs:$c)))] >;
+
+// Shifts
+let rt = 0 in
+class LogicR_shift_imm<bits<6> func, string instr_asm, SDNode OpNode>:
+  FR< 0x00, 
+      func, 
+      (ops CPURegs:$dst, CPURegs:$b, shamt:$c),
+      !strconcat(instr_asm, " $dst, $b, $c"), 
+      [(set CPURegs:$dst, (OpNode CPURegs:$b, immZExt5:$c))] >;
+
+class LogicR_shift_reg<bits<6> func, string instr_asm, SDNode OpNode>:
+  FR< 0x00, 
+      func, 
+      (ops CPURegs:$dst, CPURegs:$b, CPURegs:$c),
+      !strconcat(instr_asm, " $dst, $b, $c"), 
+      [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))] >;
+
+// Load Upper Immediate
+class LoadUpper<bits<6> op, string instr_asm>:
+  FI< op,
+      (ops CPURegs:$dst, uimm16:$imm),
+      !strconcat(instr_asm, " $dst, $imm"),
+      []>;
+
+// Memory Load/Store 
+let isLoad = 1 in
+class LoadM<bits<6> op, string instr_asm, PatFrag OpNode>:
+  FI< op,
+      (ops CPURegs:$dst, mem:$addr),
+      !strconcat(instr_asm, " $dst, $addr"),
+      [(set CPURegs:$dst, (OpNode addr:$addr))]>;
+
+let isStore = 1 in
+class StoreM<bits<6> op, string instr_asm, PatFrag OpNode>:
+  FI< op,
+      (ops CPURegs:$dst, mem:$addr),
+      !strconcat(instr_asm, " $dst, $addr"),
+      [(OpNode CPURegs:$dst, addr:$addr)]>;
+
+// Conditional Branch
+let isBranch = 1, noResults=1, isTerminator=1 in
+class CBranch<bits<6> op, string instr_asm, PatFrag cond_op>:
+  FI< op,
+      (ops CPURegs:$a, CPURegs:$b, brtarget:$offset),
+      !strconcat(instr_asm, " $a, $b, $offset"),
+      [(brcond (cond_op CPURegs:$a, CPURegs:$b), bb:$offset)]>;
+
+class SetCC_R<bits<6> op, bits<6> func, string instr_asm,
+      PatFrag cond_op>:
+  FR< op,
+      func,
+      (ops CPURegs:$dst, CPURegs:$b, CPURegs:$c),
+      !strconcat(instr_asm, " $dst, $b, $c"),
+      [(set CPURegs:$dst, (cond_op CPURegs:$b, CPURegs:$c))]>;
+
+class SetCC_I<bits<6> op, string instr_asm, PatFrag cond_op,
+      Operand Od, PatLeaf imm_type>:
+  FI< op,
+      (ops CPURegs:$dst, CPURegs:$b, Od:$c),
+      !strconcat(instr_asm, " $dst, $b, $c"),
+      [(set CPURegs:$dst, (cond_op CPURegs:$b, imm_type:$c))]>;
+
+// Unconditional branch
+let hasCtrlDep=1, noResults=1, isTerminator=1 in
+class JumpFJ<bits<6> op, string instr_asm>:
+  FJ< op,
+      (ops brtarget:$target),
+      !strconcat(instr_asm, " $target"),
+      [(br bb:$target)]>;
+
+let hasCtrlDep=1, noResults=1, isTerminator=1, rd=0 in
+class JumpFR<bits<6> op, bits<6> func, string instr_asm>:
+  FR< op,
+      func,
+      (ops CPURegs:$target),
+      !strconcat(instr_asm, " $target"),
+      []>;
+
+// Jump and Link (Call)
+let isCall=1 in
+class JumpLink<bits<6> op, string instr_asm>: 
+  FJ< op,
+      (ops calltarget:$target),
+      !strconcat(instr_asm, " $target"),
+      [(MipsJmpLink imm:$target)]>;
+
+let isCall=1 in
+class JumpLinkReg<bits<6> op, bits<6> func, string instr_asm>:
+  FR< op,
+      func,
+      (ops CPURegs:$rd, CPURegs:$rs),
+      !strconcat(instr_asm, " $rs, $rd"),
+      []>;
+
+// Mul, Div 
+class MulDiv<bits<6> func, string instr_asm>: 
+  FR< 0x00, 
+      func, 
+      (ops CPURegs:$a, CPURegs:$b), 
+      !strconcat(instr_asm, " $a, $b"), 
+      []>;
+
+// Move from Hi/Lo 
+class MoveFromTo<bits<6> func, string instr_asm>:
+  FR< 0x00, 
+      func, 
+      (ops CPURegs:$dst), 
+      !strconcat(instr_asm, " $dst"), 
+      []>;
+
+// Count Leading Ones/Zeros in Word
+class CountLeading<bits<6> func, string instr_asm>:
+  FR< 0x1c, 
+      func, 
+      (ops CPURegs:$dst, CPURegs:$src), 
+      !strconcat(instr_asm, " $dst, $src"), 
+      []>;
+
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+class Pseudo<dag ops, string asmstr, list<dag> pattern>: 
+      MipsInst<ops, asmstr, pattern>;
+
+// As stack alignment is always done with addiu, we need a 16-bit immediate
+def ADJCALLSTACKDOWN : Pseudo<(ops uimm16:$amt),
+                              "!ADJCALLSTACKDOWN $amt",
+                              [(callseq_start imm:$amt)]>, Imp<[SP],[SP]>;
+def ADJCALLSTACKUP   : Pseudo<(ops uimm16:$amt),
+                              "!ADJCALLSTACKUP $amt",
+                              [(callseq_end imm:$amt)]>, Imp<[SP],[SP]>;
+
+def IMPLICIT_DEF_CPURegs : Pseudo<(ops CPURegs:$dst),
+                                  "!IMPLICIT_DEF $dst",
+                                  [(set CPURegs:$dst, (undef))]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction definition
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Mips32 I
+//===----------------------------------------------------------------------===//
+
+// Arithmetic
+def ADDiu   : ArithI<0x09, "addiu", add, uimm16, immSExt16>;
+def ADDi    : ArithI<0x08, "addi",  add, simm16, immZExt16>;
+def MUL     : ArithR<0x1c, 0x02, "mul", mul>;
+def ADDu    : ArithR<0x00, 0x21, "addu", add>;
+def SUBu    : ArithR<0x00, 0x23, "subu", sub>;
+def ADD     : ArithOverflowR<0x00, 0x20, "add">;
+def SUB     : ArithOverflowR<0x00, 0x22, "sub">;
+def MADD    : MArithR<0x00, "madd">;
+def MADDU   : MArithR<0x01, "maddu">;
+def MSUB    : MArithR<0x04, "msub">;
+def MSUBU   : MArithR<0x05, "msubu">;
+
+// Logical
+def AND     : LogicR<0x24, "and", and>;
+def OR      : LogicR<0x25, "or",  or>;
+def XOR     : LogicR<0x26, "xor", xor>;
+def ANDi    : LogicI<0x0c, "andi", and>;
+def ORi     : LogicI<0x0d, "ori",  or>;
+def XORi    : LogicI<0x0e, "xori",  xor>;
+def NOR     : LogicNOR<0x00, 0x27, "nor">;
+
+// Shifts 
+def SLL     : LogicR_shift_imm<0x00, "sll", shl>;
+def SRL     : LogicR_shift_imm<0x02, "srl", srl>;
+def SRA     : LogicR_shift_imm<0x03, "sra", sra>;
+def SLLV    : LogicR_shift_reg<0x04, "sllv", shl>;
+def SRLV    : LogicR_shift_reg<0x06, "srlv", srl>;
+def SRAV    : LogicR_shift_reg<0x07, "srav", sra>;
+
+// Load Upper Immediate
+def LUi     : LoadUpper<0x0f, "lui">;
+
+// Load/Store
+def LB      : LoadM<0x20, "lb",  sextloadi8>;
+def LBu     : LoadM<0x24, "lbu", zextloadi8>;
+def LH      : LoadM<0x21, "lh",  sextloadi16>;
+def LHu     : LoadM<0x25, "lhu", zextloadi16>;
+def LW      : LoadM<0x23, "lw",  load>;
+def SB      : StoreM<0x28, "sb", truncstorei8>;
+def SH      : StoreM<0x29, "sh", truncstorei16>;
+def SW      : StoreM<0x2b, "sw", store>;
+
+// Conditional Branch
+def BEQ     : CBranch<0x04, "beq", seteq>;
+def BNE     : CBranch<0x05, "bne", setne>;
+def SLT     : SetCC_R<0x00, 0x2a, "slt", setlt>;
+def SLTu    : SetCC_R<0x00, 0x2b, "sltu", setult>;
+def SLTi    : SetCC_I<0x0a, "slti", setlt, simm16, immSExt16>;
+def SLTiu   : SetCC_I<0x0b, "sltiu", setult, uimm16, immZExt16>;
+
+// Unconditional jump
+def J       : JumpFJ<0x02, "j">;
+def JR      : JumpFR<0x00, 0x08, "jr">;
+
+// Jump and Link (Call)
+def JAL     : JumpLink<0x03, "jal">;
+def JALR    : JumpLinkReg<0x00, 0x09, "jalr">;
+
+// Mul/Div and Move From Hi/Lo operations have
+// their corresponding SDNodes created during instruction selection (ISelDAG).
+// Special Mul, Div operations
+def MULT    : MulDiv<0x18, "mult">;
+def MULTu   : MulDiv<0x19, "multu">;
+def DIV     : MulDiv<0x1a, "div">;
+def DIVu    : MulDiv<0x1b, "divu">;
+
+// Move From Hi/Lo 
+def MFHI    : MoveFromTo<0x10, "mfhi">;
+def MFLO    : MoveFromTo<0x12, "mflo">;
+def MTHI    : MoveFromTo<0x11, "mthi">;
+def MTLO    : MoveFromTo<0x13, "mtlo">;
+
+// Count Leading
+def CLO     : CountLeading<0x21, "clo">;
+def CLZ     : CountLeading<0x20, "clz">;
+
+// No operation
+let addr=0 in
+def NOOP :  FJ<0, (ops), "nop", []>;
+
+// Ret instruction - since MIPS does not have a "ret" instruction, 
+// a "jr $ra" must be generated.
+let isReturn=1, isTerminator=1, hasDelaySlot=1, noResults=1,
+    isBarrier=1, hasCtrlDep=1, rs=0, rt=0, shamt=0 in 
+{
+  def RET : FR <0x00, 0x02, (ops CPURegs:$target),
+                "jr $target", [(MipsRet CPURegs:$target)]>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+// Small immediates
+def : Pat<(i32 immSExt16:$in), 
+          (ADDiu ZERO, imm:$in)>;
+def : Pat<(i32 immZExt16:$in), 
+          (ORi ZERO, imm:$in)>;
+
+// Arbitrary immediates
+def : Pat<(i32 imm:$imm),
+          (ORi (LUi (HI16 imm:$imm)), (LO16 imm:$imm))>;
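+
+// For example (an illustrative expansion; the register name is arbitrary),
+// materializing the constant 0x12345678 becomes:
+//   lui $reg, 0x1234
+//   ori $reg, $reg, 0x5678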
+
+// Call
+def : Pat<(MipsJmpLink (i32 tglobaladdr:$dst)),
+          (JAL tglobaladdr:$dst)>;
+def : Pat<(MipsJmpLink (i32 texternalsym:$dst)),
+          (JAL texternalsym:$dst)>;
+
+// GlobalAddress, Constant Pool, ExternalSymbol, and JumpTable
+def : Pat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
+def : Pat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>;
+
+// When extracting the address from GlobalAddress we
+// need something of the form "addiu $reg, %lo(addr)"
+def : Pat<(add CPURegs:$a, (MipsLo tglobaladdr:$in)),
+          (ADDiu CPURegs:$a, tglobaladdr:$in)>;
+
+// MIPS does not have a "not" instruction, so we expand it using NOR.
+def : Pat<(not CPURegs:$in),
+          (NOR CPURegs:$in, CPURegs:$in)>;
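+
+// Illustrative expansion: (not $a) is selected as "nor $dst, $a, $a",
+// since nor computes ~($a | $a) == ~$a.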
+
+// Extended loads and stores.
+def : Pat<(i32 (extloadi8  addr:$src)), (LBu addr:$src)>;
+def : Pat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>;
+def : Pat<(truncstorei1 CPURegs:$src, addr:$addr), 
+           (SB CPURegs:$src, addr:$addr)>;
+
+def : Pat<(brcond (setne CPURegs:$lhs, (add ZERO, 0)), bb:$dst),
+          (BNE CPURegs:$lhs, ZERO, bb:$dst)>;
+
+
+// Conditional branch patterns.
+// Two register operands, signed comparisons.
+def : Pat<(brcond (setlt CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
+          (BNE (SLT CPURegs:$lhs, CPURegs:$rhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (setle CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
+          (BEQ (SLT CPURegs:$rhs, CPURegs:$lhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (setgt CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
+          (BNE (SLT CPURegs:$rhs, CPURegs:$lhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (setge CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
+          (BEQ (SLT CPURegs:$lhs, CPURegs:$rhs), ZERO, bb:$dst)>;
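+
+// Illustrative expansion (register names are arbitrary):
+// (brcond (setlt $a, $b), L) is selected as
+//   slt $tmp, $a, $b
+//   bne $tmp, $zero, L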
+
+// Two register operands, unsigned comparisons.
+def : Pat<(brcond (setult CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
+          (BNE (SLTu CPURegs:$lhs, CPURegs:$rhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (setule CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
+          (BEQ (SLTu CPURegs:$rhs, CPURegs:$lhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (setugt CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
+          (BNE (SLTu CPURegs:$rhs, CPURegs:$lhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (setuge CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
+          (BEQ (SLTu CPURegs:$lhs, CPURegs:$rhs), ZERO, bb:$dst)>;
+
+// Register/immediate operands, signed immediate.
+def : Pat<(brcond (setult CPURegs:$lhs, immSExt16:$rhs), bb:$dst),
+          (BNE (SLTi CPURegs:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (setuge CPURegs:$lhs, immSExt16:$rhs), bb:$dst),
+          (BEQ (SLTi CPURegs:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+
+// Register/immediate operands, unsigned immediate.
+def : Pat<(brcond (setult CPURegs:$lhs, immZExt16:$rhs), bb:$dst),
+          (BNE (SLTiu CPURegs:$lhs, immZExt16:$rhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (setuge CPURegs:$lhs, immZExt16:$rhs), bb:$dst),
+          (BEQ (SLTiu CPURegs:$lhs, immZExt16:$rhs), ZERO, bb:$dst)>;
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
new file mode 100644
index 0000000..b362dc3
--- /dev/null
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -0,0 +1,54 @@
+//===-- MipsMachineFunctionInfo.h - Private data used for Mips ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS_MACHINE_FUNCTION_INFO_H
+#define MIPS_MACHINE_FUNCTION_INFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// MipsFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private Mips target-specific information for each MachineFunction.
+class MipsFunctionInfo : public MachineFunctionInfo {
+
+private:
+  /// Holds, for each function, the stack offset 
+  /// where the Frame Pointer must be saved.
+  int FPStackOffset;
+
+  /// Holds, for each function, the stack offset 
+  /// where the Return Address must be saved.
+  int RAStackOffset;
+
+public:
+  MipsFunctionInfo(MachineFunction& MF) 
+  : FPStackOffset(0), RAStackOffset(0)
+  {}
+
+  int getFPStackOffset() const { return FPStackOffset; }
+  void setFPStackOffset(int Off) { FPStackOffset = Off; }
+
+  int getRAStackOffset() const { return RAStackOffset; }
+  void setRAStackOffset(int Off) { RAStackOffset = Off; }
+
+  int getTopSavedRegOffset() const { 
+    return (RAStackOffset > FPStackOffset) ? 
+           (RAStackOffset) : (FPStackOffset);
+  }
+};
+
+} // end of namespace llvm
+
+
+#endif
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
new file mode 100644
index 0000000..c7a87ca
--- /dev/null
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -0,0 +1,422 @@
+//===- MipsRegisterInfo.cpp - MIPS Register Information ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-reg-info"
+
+#include "Mips.h"
+#include "MipsRegisterInfo.h"
+#include "MipsMachineFunction.h"
+#include "llvm/Constants.h"
+#include "llvm/Type.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+//#include "MipsSubtarget.h"
+
+using namespace llvm;
+
+// TODO: add subtarget support
+MipsRegisterInfo::MipsRegisterInfo(const TargetInstrInfo &tii)
+  : MipsGenRegisterInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
+  TII(tii) {}
+
+void MipsRegisterInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+          unsigned SrcReg, int FI, 
+          const TargetRegisterClass *RC) const 
+{
+  if (RC == Mips::CPURegsRegisterClass)
+    BuildMI(MBB, I, TII.get(Mips::SW)).addReg(SrcReg, false, false, true)
+          .addImm(0).addFrameIndex(FI);
+  else
+    assert(0 && "Can't store this register to stack slot");
+}
+
+void MipsRegisterInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC) const 
+{
+  if (RC == Mips::CPURegsRegisterClass)
+    BuildMI(MBB, I, TII.get(Mips::LW), DestReg).addImm(0).addFrameIndex(FI);
+  else
+    assert(0 && "Can't load this register from stack slot");
+}
+
+void MipsRegisterInfo::
+copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+             unsigned DestReg, unsigned SrcReg,
+             const TargetRegisterClass *RC) const 
+{
+  if (RC == Mips::CPURegsRegisterClass)
+    BuildMI(MBB, I, TII.get(Mips::ADDu), DestReg).addReg(Mips::ZERO)
+      .addReg(SrcReg);
+  else
+    assert (0 && "Can't copy this register");
+}
+
+void MipsRegisterInfo::reMaterialize(MachineBasicBlock &MBB, 
+                                      MachineBasicBlock::iterator I,
+                                      unsigned DestReg, 
+                                      const MachineInstr *Orig) const 
+{
+    MachineInstr *MI = Orig->clone();
+    MI->getOperand(0).setReg(DestReg);
+    MBB.insert(I, MI);
+}
+
+MachineInstr *MipsRegisterInfo::
+foldMemoryOperand(MachineInstr* MI, unsigned OpNum, int FI) const 
+{
+  MachineInstr *NewMI = NULL;
+
+  switch (MI->getOpcode()) 
+  {
+    case Mips::ADDu:
+      if ((MI->getOperand(0).isRegister()) &&
+        (MI->getOperand(1).isRegister()) && 
+        (MI->getOperand(1).getReg() == Mips::ZERO) &&
+        (MI->getOperand(2).isRegister())) 
+      {
+        if (OpNum == 0)    // COPY -> STORE
+          NewMI = BuildMI(TII.get(Mips::SW)).addFrameIndex(FI)
+                  .addImm(0).addReg(MI->getOperand(2).getReg());
+        else               // COPY -> LOAD
+          NewMI = BuildMI(TII.get(Mips::LW), MI->getOperand(0)
+                  .getReg()).addImm(0).addFrameIndex(FI);
+      }
+      break;
+  }
+
+  if (NewMI)
+    NewMI->copyKillDeadInfo(MI);
+  return NewMI;
+}
+
+/// Mips Callee Saved Registers
+const unsigned* MipsRegisterInfo::
+getCalleeSavedRegs() const 
+{
+  // The Mips callee-saved register range is $16-$23 (s0-s7)
+  static const unsigned CalleeSavedRegs[] = {  
+    Mips::S0, Mips::S1, Mips::S2, Mips::S3, 
+    Mips::S4, Mips::S5, Mips::S6, Mips::S7, 0
+  };
+  return CalleeSavedRegs;
+}
+
+/// Mips Callee Saved Register Classes
+const TargetRegisterClass* const* 
+MipsRegisterInfo::getCalleeSavedRegClasses() const 
+{
+  static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
+    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
+    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
+    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass, 0 
+  };
+  return CalleeSavedRegClasses;
+}
+
+BitVector MipsRegisterInfo::
+getReservedRegs(const MachineFunction &MF) const
+{
+  BitVector Reserved(getNumRegs());
+  Reserved.set(Mips::ZERO);
+  Reserved.set(Mips::AT);
+  Reserved.set(Mips::K0);
+  Reserved.set(Mips::K1);
+  Reserved.set(Mips::GP);
+  Reserved.set(Mips::SP);
+  Reserved.set(Mips::FP);
+  Reserved.set(Mips::RA);
+  return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Stack Frame Processing methods
+// +----------------------------+
+//
+// To meet the ABI requirements, we construct the frame in the reverse
+// of the natural order.
+//
+// The LLVM Frame will look like this:
+//
+// As the stack grows down, we start at 0, and the reference
+// is decremented.
+//
+//  0          ----------
+// -4          Args to pass
+//  .          saved "Callee Saved" Registers
+//  .          Local Area
+//  .          saved FP
+//  .          saved RA
+// -StackSize  -----------
+//
+// In eliminateFrameIndex we just negate the offset above
+// and we get the stack frame required by the ABI, which is:
+//
+// sp + stacksize  -------------
+//                 saved $RA  (only on non-leaf functions)
+//                 saved $FP  (only with frame pointer)
+//                 saved "Callee Saved" Registers
+//                 Local Area
+//                 saved $GP  (used in PIC - not supported yet)
+//                 Args to pass area
+// sp              -------------
+//
+// sp is the stack pointer after the stack size has been subtracted
+// (Prologue) or added back (Epilogue).
+//
+// References to the previous stack (to obtain arguments) are done
+// with fixed location stack frames using positive stack offsets.
+//
+// Examples:
+// - reference to the actual stack frame
+//   for any local area variable there is something like: FI >= 0, StackOffset: -4
+//     sw REGX, 4(REGY)
+//
+// - reference to previous stack frame
+//   suppose there's a store to the 5th argument: FI < 0, StackOffset: 16.
+//   The emitted instruction will be something like:
+//     sw REGX, 16+StackSize (REGY)
+//
+//===----------------------------------------------------------------------===//
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+bool MipsRegisterInfo::
+hasFP(const MachineFunction &MF) const {
+  return (NoFramePointerElim || MF.getFrameInfo()->hasVarSizedObjects());
+}
+
+// This function eliminates the ADJCALLSTACKDOWN and 
+// ADJCALLSTACKUP pseudo instructions.
+void MipsRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+  MBB.erase(I);
+}
+
+// FrameIndex represents objects inside an abstract stack.
+// We must replace FrameIndex with a direct stack/frame
+// pointer reference.
+void MipsRegisterInfo::
+eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, 
+                    RegScavenger *RS) const 
+{
+  MachineInstr &MI    = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+
+  unsigned i = 0;
+  while (!MI.getOperand(i).isFrameIndex()) {
+    ++i;
+    assert(i < MI.getNumOperands() && 
+           "Instr doesn't have FrameIndex operand!");
+  }
+
+  int FrameIndex = MI.getOperand(i).getFrameIndex();
+  int stackSize  = MF.getFrameInfo()->getStackSize();
+  int spOffset   = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+
+  #ifndef NDEBUG
+  DOUT << "\nFunction : " << MF.getFunction()->getName() << "\n";
+  DOUT << "<--------->\n";
+  MI.print(DOUT);
+  DOUT << "FrameIndex : " << FrameIndex << "\n";
+  DOUT << "spOffset   : " << spOffset << "\n";
+  DOUT << "stackSize  : " << stackSize << "\n";
+  #endif
+
+  int Offset = ( (spOffset >= 0) ? (stackSize + spOffset) : (-spOffset));
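+
+  // For example (illustrative values): a local object with spOffset = -4
+  // yields Offset = 4, while a reference into the caller's frame with
+  // spOffset = 16 and stackSize = 24 yields Offset = 40.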
+
+  #ifndef NDEBUG
+  DOUT << "Offset     : " << Offset << "\n";
+  DOUT << "<--------->\n";
+  #endif
+
+  MI.getOperand(i-1).ChangeToImmediate(Offset);
+  MI.getOperand(i).ChangeToRegister(getFrameRegister(MF),false);
+}
+
+void MipsRegisterInfo::
+emitPrologue(MachineFunction &MF) const 
+{
+  MachineBasicBlock &MBB   = MF.front();
+  MachineFrameInfo *MFI    = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+
+  // Get the number of bytes to allocate from the FrameInfo
+  int NumBytes = (int) MFI->getStackSize();
+
+  #ifndef NDEBUG
+  DOUT << "\n<--- EMIT PROLOGUE --->";
+  DOUT << "Stack size :" << NumBytes << "\n";
+  #endif
+
+  // Do we need to allocate space on the stack?
+  if (NumBytes == 0) return;
+
+  int FPOffset = 0, RAOffset = 0;
+  
+  // Always allocate space for saved RA and FP,
+  // even if FramePointer is not used. When not
+  // using FP, the last stack slot becomes empty
+  // and RA is saved before it.
+  if ((hasFP(MF)) && (MFI->hasCalls())) {
+    FPOffset = NumBytes;
+    RAOffset = (NumBytes+4);
+  } else if ((!hasFP(MF)) && (MFI->hasCalls())) {
+    FPOffset = 0;
+    RAOffset = NumBytes;
+  } else if ((hasFP(MF)) && (!MFI->hasCalls())) {
+    FPOffset = NumBytes;
+    RAOffset = 0;
+  }
+
+  MFI->setObjectOffset(MFI->CreateStackObject(4,4), -FPOffset);
+  MFI->setObjectOffset(MFI->CreateStackObject(4,4), -RAOffset);
+  MipsFI->setFPStackOffset(FPOffset);
+  MipsFI->setRAStackOffset(RAOffset);
+
+  #ifndef NDEBUG
+  DOUT << "FPOffset :" << FPOffset << "\n";
+  DOUT << "RAOffset :" << RAOffset << "\n";
+  #endif
+
+  // Align stack. 
+  NumBytes += 8;
+  unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+  NumBytes = ((NumBytes+Align-1)/Align*Align);
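+  // e.g. (illustrative) with Align = 8, NumBytes = 20 is rounded up to 24.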
+
+  #ifndef NDEBUG
+  DOUT << "New stack size :" << NumBytes << "\n\n";
+  #endif
+
+  // Update frame info
+  MFI->setStackSize(NumBytes);
+
+  // Adjust stack : addi sp, sp, (-imm)
+  BuildMI(MBB, MBBI, TII.get(Mips::ADDiu), Mips::SP)
+      .addReg(Mips::SP).addImm(-NumBytes);
+
+  // Save the return address only if the function isn't a leaf one.
+  // sw  $ra, stack_loc($sp)
+  if (MFI->hasCalls()) { 
+    BuildMI(MBB, MBBI, TII.get(Mips::SW))
+        .addReg(Mips::RA).addImm(RAOffset).addReg(Mips::SP);
+  }
+
+  // if framepointer enabled, save it and set it
+  // to point to the stack pointer
+  if (hasFP(MF)) {
+    // sw  $fp,stack_loc($sp)
+    BuildMI(MBB, MBBI, TII.get(Mips::SW))
+      .addReg(Mips::FP).addImm(FPOffset).addReg(Mips::SP);
+
+    // move $fp, $sp
+    BuildMI(MBB, MBBI, TII.get(Mips::ADDu), Mips::FP)
+      .addReg(Mips::SP).addReg(Mips::ZERO);
+  }
+}
+
+void MipsRegisterInfo::
+emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const 
+{
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  MachineFrameInfo *MFI            = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI         = MF.getInfo<MipsFunctionInfo>();
+
+  // Get the number of bytes from FrameInfo
+  int NumBytes = (int) MFI->getStackSize();
+
+  // Get the FI's where RA and FP are saved.
+  int FPOffset = MipsFI->getFPStackOffset();
+  int RAOffset = MipsFI->getRAStackOffset();
+
+  #ifndef NDEBUG
+  DOUT << "\n<--- EMIT EPILOGUE --->" << "\n";
+  DOUT << "Stack size :" << NumBytes << "\n";
+  DOUT << "FPOffset :" << FPOffset << "\n";
+  DOUT << "RAOffset :" << RAOffset << "\n\n";
+  #endif
+
+  // if framepointer enabled, restore it and restore the
+  // stack pointer
+  if (hasFP(MF)) {
+    // move $sp, $fp
+    BuildMI(MBB, MBBI, TII.get(Mips::ADDu), Mips::SP)
+      .addReg(Mips::FP).addReg(Mips::ZERO);
+
+    // lw  $fp,stack_loc($sp)
+    BuildMI(MBB, MBBI, TII.get(Mips::LW))
+      .addReg(Mips::FP).addImm(FPOffset).addReg(Mips::SP);
+  }
+
+  // Restore the return address only if the function isn't a leaf one.
+  // lw  $ra, stack_loc($sp)
+  if (MFI->hasCalls()) { 
+    BuildMI(MBB, MBBI, TII.get(Mips::LW))
+        .addReg(Mips::RA).addImm(RAOffset).addReg(Mips::SP);
+  }
+
+  // adjust stack  : insert addi sp, sp, (imm)
+  if (NumBytes) {
+    BuildMI(MBB, MBBI, TII.get(Mips::ADDiu), Mips::SP)
+      .addReg(Mips::SP).addImm(NumBytes);
+  }
+}
+
+void MipsRegisterInfo::
+processFunctionBeforeFrameFinalized(MachineFunction &MF) const {
+}
+
+unsigned MipsRegisterInfo::
+getRARegister() const {
+  return Mips::RA;
+}
+
+unsigned MipsRegisterInfo::
+getFrameRegister(MachineFunction &MF) const {
+  return hasFP(MF) ? Mips::FP : Mips::SP;
+}
+
+unsigned MipsRegisterInfo::
+getEHExceptionRegister() const {
+  assert(0 && "What is the exception register");
+  return 0;
+}
+
+unsigned MipsRegisterInfo::
+getEHHandlerRegister() const {
+  assert(0 && "What is the exception handler register");
+  return 0;
+}
+
+#include "MipsGenRegisterInfo.inc"
+
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
new file mode 100644
index 0000000..d84194f
--- /dev/null
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -0,0 +1,83 @@
+//===- MipsRegisterInfo.h - Mips Register Information Impl ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSREGISTERINFO_H
+#define MIPSREGISTERINFO_H
+
+#include "llvm/Target/MRegisterInfo.h"
+#include "MipsGenRegisterInfo.h.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+class Type;
+
+struct MipsRegisterInfo : public MipsGenRegisterInfo {
+  const TargetInstrInfo &TII;
+  
+  MipsRegisterInfo(const TargetInstrInfo &tii);
+
+  /// Code Generation virtual methods...
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, int FrameIndex,
+                           const TargetRegisterClass *RC) const;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+              MachineBasicBlock::iterator MBBI,
+              unsigned DestReg, int FrameIndex,
+              const TargetRegisterClass *RC) const;
+
+  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                     unsigned DestReg, const MachineInstr *Orig) const;
+
+  MachineInstr* foldMemoryOperand(MachineInstr* MI, unsigned OpNum,
+                                  int FrameIndex) const;
+
+  void copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+          unsigned DestReg, unsigned SrcReg,
+          const TargetRegisterClass *RC) const;
+  
+
+  const unsigned *getCalleeSavedRegs() const;
+
+  const TargetRegisterClass* const* getCalleeSavedRegClasses() const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, RegScavenger *RS = NULL) const;
+
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(MachineFunction &MF) const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
new file mode 100644
index 0000000..2b7d15f
--- /dev/null
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -0,0 +1,80 @@
+//===- MipsRegisterInfo.td - Mips Register defs -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the MIPS register file
+//===----------------------------------------------------------------------===//
+
+// We have banks of 32 registers each.
+class MipsReg<string n> : Register<n> {
+  field bits<5> Num;
+  let Namespace = "Mips";
+}
+
+// Mips CPU Registers
+class MipsGPRReg<bits<5> num, string n> : MipsReg<n> {
+  let Num = num;
+}
+
+// CPU GPR Registers
+def ZERO : MipsGPRReg< 0, "ZERO">, DwarfRegNum<0>;
+def AT : MipsGPRReg< 1, "AT">, DwarfRegNum<1>;
+def V0 : MipsGPRReg< 2, "2">, DwarfRegNum<2>;
+def V1 : MipsGPRReg< 3, "3">, DwarfRegNum<3>;
+def A0 : MipsGPRReg< 4, "4">, DwarfRegNum<4>;
+def A1 : MipsGPRReg< 5, "5">, DwarfRegNum<5>;
+def A2 : MipsGPRReg< 6, "6">, DwarfRegNum<6>;
+def A3 : MipsGPRReg< 7, "7">, DwarfRegNum<7>;
+def T0 : MipsGPRReg< 8, "8">, DwarfRegNum<8>;
+def T1 : MipsGPRReg< 9, "9">, DwarfRegNum<9>;
+def T2 : MipsGPRReg< 10, "10">, DwarfRegNum<10>;
+def T3 : MipsGPRReg< 11, "11">, DwarfRegNum<11>;
+def T4 : MipsGPRReg< 12, "12">, DwarfRegNum<12>;
+def T5 : MipsGPRReg< 13, "13">, DwarfRegNum<13>;
+def T6 : MipsGPRReg< 14, "14">, DwarfRegNum<14>;
+def T7 : MipsGPRReg< 15, "15">, DwarfRegNum<15>;
+def S0 : MipsGPRReg< 16, "16">, DwarfRegNum<16>;
+def S1 : MipsGPRReg< 17, "17">, DwarfRegNum<17>;
+def S2 : MipsGPRReg< 18, "18">, DwarfRegNum<18>;
+def S3 : MipsGPRReg< 19, "19">, DwarfRegNum<19>;
+def S4 : MipsGPRReg< 20, "20">, DwarfRegNum<20>;
+def S5 : MipsGPRReg< 21, "21">, DwarfRegNum<21>;
+def S6 : MipsGPRReg< 22, "22">, DwarfRegNum<22>;
+def S7 : MipsGPRReg< 23, "23">, DwarfRegNum<23>;
+def T8 : MipsGPRReg< 24, "24">, DwarfRegNum<24>;
+def T9 : MipsGPRReg< 25, "25">, DwarfRegNum<25>;
+def K0 : MipsGPRReg< 26, "26">, DwarfRegNum<26>;
+def K1 : MipsGPRReg< 27, "27">, DwarfRegNum<27>;
+def GP : MipsGPRReg< 28, "GP">, DwarfRegNum<28>;
+def SP : MipsGPRReg< 29, "SP">, DwarfRegNum<29>;
+def FP : MipsGPRReg< 30, "FP">, DwarfRegNum<30>;
+def RA : MipsGPRReg< 31, "RA">, DwarfRegNum<31>;
+
+// CPU Registers Class
+def CPURegs : RegisterClass<"Mips", [i32], 32, 
+  // Return Values and Arguments
+  [V0, V1, A0, A1, A2, A3,
+  // Not preserved across procedure calls
+  T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, 
+  // Callee save
+  S0, S1, S2, S3, S4, S5, S6, S7,
+  // Reserved
+  ZERO, AT, K0, K1, GP, SP, FP, RA]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    CPURegsClass::iterator
+    CPURegsClass::allocation_order_end(const MachineFunction &MF) const {
+      // The last 8 registers on the list above are reserved
+      return end()-8;
+    }
+  }];
+}
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
new file mode 100644
index 0000000..a394f77
--- /dev/null
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -0,0 +1,26 @@
+//===- MipsSubtarget.cpp - Mips Subtarget Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Mips specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSubtarget.h"
+#include "Mips.h"
+#include "MipsGenSubtarget.inc"
+using namespace llvm;
+
+MipsSubtarget::MipsSubtarget(const TargetMachine &TM, const Module &M, 
+                             const std::string &FS) : isR3000(false) 
+{
+  std::string CPU = "generic";
+
+  // Parse features string.
+  ParseSubtargetFeatures(FS, CPU);
+}
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
new file mode 100644
index 0000000..7ec61ca
--- /dev/null
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -0,0 +1,43 @@
+//=====-- MipsSubtarget.h - Define Subtarget for the Mips -----*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSUBTARGET_H
+#define MIPSSUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include <string>
+
+namespace llvm {
+class Module;
+
+class MipsSubtarget : public TargetSubtarget {
+protected:
+  bool isR3000;
+public:
+  /// This constructor initializes the data members to match those
+  /// of the specified module.
+  ///
+  MipsSubtarget(const TargetMachine &TM, const Module &M, 
+                const std::string &FS);
+  
+  /// ParseSubtargetFeatures - Parses the feature string, setting the specified 
+  /// subtarget options.  The definition of this function is auto-generated by tblgen.
+  void ParseSubtargetFeatures(const std::string &FS, const std::string &CPU);
+
+  bool IsR3000() const { return isR3000; }
+};
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/Mips/MipsTargetAsmInfo.cpp b/lib/Target/Mips/MipsTargetAsmInfo.cpp
new file mode 100644
index 0000000..08166f6
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetAsmInfo.cpp
@@ -0,0 +1,22 @@
+//===-- MipsTargetAsmInfo.cpp - Mips asm properties -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MipsTargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetAsmInfo.h"
+
+using namespace llvm;
+
+MipsTargetAsmInfo::MipsTargetAsmInfo(const MipsTargetMachine &TM) {
+  Data16bitsDirective = "\t.half\t";
+  Data32bitsDirective = "\t.word\t";
+  CommentString = "#";
+}
diff --git a/lib/Target/Mips/MipsTargetAsmInfo.h b/lib/Target/Mips/MipsTargetAsmInfo.h
new file mode 100644
index 0000000..908f036
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetAsmInfo.h
@@ -0,0 +1,30 @@
+//=====-- MipsTargetAsmInfo.h - Mips asm properties -----------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MipsTargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSTARGETASMINFO_H
+#define MIPSTARGETASMINFO_H
+
+#include "llvm/Target/TargetAsmInfo.h"
+
+namespace llvm {
+
+  // Forward declaration.
+  class MipsTargetMachine;
+
+  struct MipsTargetAsmInfo : public TargetAsmInfo {
+    MipsTargetAsmInfo(const MipsTargetMachine &TM);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
new file mode 100644
index 0000000..7fdba30
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -0,0 +1,83 @@
+//===-- MipsTargetMachine.cpp - Define TargetMachine for Mips -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about Mips target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsTargetAsmInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+using namespace llvm;
+
+namespace {
+  // Register the target.
+  RegisterTarget<MipsTargetMachine> X("mips", "  Mips");
+}
+
+const TargetAsmInfo *MipsTargetMachine::
+createTargetAsmInfo() const 
+{
+  return new MipsTargetAsmInfo(*this);
+}
+
+// DataLayout --> Big-endian, 32-bit pointer/ABI/alignment
+//
+// FrameInfo  --> StackGrowsDown, 8-byte aligned, 
+//                local area offset (LOA): 0
+MipsTargetMachine::
+MipsTargetMachine(const Module &M, const std::string &FS): 
+  Subtarget(*this, M, FS), DataLayout("E-p:32:32:32"), 
+  InstrInfo(*this), FrameInfo(TargetFrameInfo::StackGrowsDown, 8, 0),
+  TLInfo(*this) {}
+
+// Return 20 for a "mips-*" triple; otherwise return 0, so -march must be
+// specified to generate MIPS code.
+unsigned MipsTargetMachine::
+getModuleMatchQuality(const Module &M) 
+{
+  // We strongly match "mips-*".
+  std::string TT = M.getTargetTriple();
+  if (TT.size() >= 5 && std::string(TT.begin(), TT.begin()+5) == "mips-")
+    return 20;
+  
+  return 0;
+}
+
+// Install an instruction selector pass using 
+// the ISelDag to generate Mips code.
+bool MipsTargetMachine::
+addInstSelector(FunctionPassManager &PM, bool Fast) 
+{
+  PM.add(createMipsISelDag(*this));
+  return false;
+}
+
+// Implemented by targets that want to run passes immediately before 
+// machine code is emitted. Return true if -print-machineinstrs should 
+// print out the code after the passes.
+// TODO: Delay slot must be implemented here.
+bool MipsTargetMachine::
+addPreEmitPass(FunctionPassManager &PM, bool Fast) 
+{
+  return false;
+}
+
+// Implements the AssemblyEmitter for the target. Must return
+// true if the AssemblyEmitter is supported.
+bool MipsTargetMachine::
+addAssemblyEmitter(FunctionPassManager &PM, bool Fast, 
+                   std::ostream &Out) 
+{
+  // Output assembly language.
+  PM.add(createMipsCodePrinterPass(Out, *this));
+  return false;
+}
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
new file mode 100644
index 0000000..9e1ccc3
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -0,0 +1,65 @@
+//===-- MipsTargetMachine.h - Define TargetMachine for Mips -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bruno Cardoso Lopes and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSTARGETMACHINE_H
+#define MIPSTARGETMACHINE_H
+
+#include "MipsSubtarget.h"
+#include "MipsInstrInfo.h"
+#include "MipsISelLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+
+namespace llvm {
+  class MipsTargetMachine : public LLVMTargetMachine {
+    MipsSubtarget       Subtarget;
+    const TargetData    DataLayout; // Calculates type size & alignment
+    MipsInstrInfo       InstrInfo;
+    TargetFrameInfo     FrameInfo;
+    MipsTargetLowering  TLInfo;
+  
+  protected:
+    virtual const TargetAsmInfo *createTargetAsmInfo() const;
+  
+  public:
+    MipsTargetMachine(const Module &M, const std::string &FS);
+
+    virtual const MipsInstrInfo   *getInstrInfo()     const 
+    { return &InstrInfo; }
+    virtual const TargetFrameInfo *getFrameInfo()     const 
+    { return &FrameInfo; }
+    virtual const TargetSubtarget *getSubtargetImpl() const 
+    { return &Subtarget; }
+    virtual const TargetData      *getTargetData()    const 
+    { return &DataLayout;}
+
+    virtual const MRegisterInfo   *getRegisterInfo()  const {
+      return &InstrInfo.getRegisterInfo();
+    }
+
+    virtual MipsTargetLowering   *getTargetLowering() const { 
+      return const_cast<MipsTargetLowering*>(&TLInfo); 
+    }
+
+    static unsigned getModuleMatchQuality(const Module &M);
+
+    // Pass Pipeline Configuration
+    virtual bool addInstSelector(FunctionPassManager &PM, bool Fast);
+    virtual bool addPreEmitPass(FunctionPassManager &PM, bool Fast);
+    virtual bool addAssemblyEmitter(FunctionPassManager &PM, bool Fast, 
+                                    std::ostream &Out);
+  };
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/PowerPC/Makefile b/lib/Target/PowerPC/Makefile
new file mode 100644
index 0000000..77288ed
--- /dev/null
+++ b/lib/Target/PowerPC/Makefile
@@ -0,0 +1,20 @@
+##===- lib/Target/PowerPC/Makefile -------------------------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file was developed by the LLVM research group and is distributed under
+# the University of Illinois Open Source License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMPowerPC
+TARGET = PPC
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = PPCGenInstrNames.inc PPCGenRegisterNames.inc \
+                PPCGenAsmWriter.inc  PPCGenCodeEmitter.inc \
+                PPCGenRegisterInfo.h.inc PPCGenRegisterInfo.inc \
+                PPCGenInstrInfo.inc PPCGenDAGISel.inc \
+                PPCGenSubtarget.inc PPCGenCallingConv.inc
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
new file mode 100644
index 0000000..9327f30
--- /dev/null
+++ b/lib/Target/PowerPC/PPC.h
@@ -0,0 +1,47 @@
+//===-- PPC.h - Top-level interface for PowerPC Target ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// PowerPC back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_H
+#define LLVM_TARGET_POWERPC_H
+
+#include <iosfwd>
+
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+namespace llvm {
+  class PPCTargetMachine;
+  class FunctionPassManager;
+  class FunctionPass;
+  class MachineCodeEmitter;
+  
+FunctionPass *createPPCBranchSelectionPass();
+FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
+FunctionPass *createPPCAsmPrinterPass(std::ostream &OS,
+                                      PPCTargetMachine &TM);
+FunctionPass *createPPCCodeEmitterPass(PPCTargetMachine &TM,
+                                       MachineCodeEmitter &MCE);
+} // end namespace llvm;
+
+// Defines symbolic names for PowerPC registers.  This defines a mapping from
+// register name to register number.
+//
+#include "PPCGenRegisterNames.inc"
+
+// Defines symbolic names for the PowerPC instructions.
+//
+#include "PPCGenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
new file mode 100644
index 0000000..76f8ac4
--- /dev/null
+++ b/lib/Target/PowerPC/PPC.td
@@ -0,0 +1,114 @@
+//===- PPC.td - Describe the PowerPC Target Machine --------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing.
+//
+include "../Target.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC Subtarget features.
+//
+ 
+//===----------------------------------------------------------------------===//
+// CPU Directives                                                             //
+//===----------------------------------------------------------------------===//
+
+def Directive601 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_601", "">;
+def Directive602 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_602", "">;
+def Directive603 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive604 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive620 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive7400: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_7400", "">;
+def Directive750 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_750", "">;
+def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">;
+def Directive32  : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">;
+def Directive64  : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">;
+
+def Feature64Bit     : SubtargetFeature<"64bit","Has64BitSupport", "true",
+                                        "Enable 64-bit instructions">;
+def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true",
+                              "Enable 64-bit registers usage for ppc32 [beta]">;
+def FeatureAltivec   : SubtargetFeature<"altivec","HasAltivec", "true",
+                                        "Enable Altivec instructions">;
+def FeatureGPUL      : SubtargetFeature<"gpul","IsGigaProcessor", "true",
+                                        "Enable GPUL instructions">;
+def FeatureFSqrt     : SubtargetFeature<"fsqrt","HasFSQRT", "true",
+                                        "Enable the fsqrt instruction">; 
+def FeatureSTFIWX    : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
+                                        "Enable the stfiwx instruction">; 
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "PPCRegisterInfo.td"
+include "PPCSchedule.td"
+include "PPCInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC processors supported.
+//
+
+def : Processor<"generic", G3Itineraries, [Directive32]>;
+def : Processor<"601", G3Itineraries, [Directive601]>;
+def : Processor<"602", G3Itineraries, [Directive602]>;
+def : Processor<"603", G3Itineraries, [Directive603]>;
+def : Processor<"603e", G3Itineraries, [Directive603]>;
+def : Processor<"603ev", G3Itineraries, [Directive603]>;
+def : Processor<"604", G3Itineraries, [Directive604]>;
+def : Processor<"604e", G3Itineraries, [Directive604]>;
+def : Processor<"620", G3Itineraries, [Directive620]>;
+def : Processor<"g3", G3Itineraries, [Directive7400]>;
+def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec]>;
+def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec]>;
+def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec]>;
+def : Processor<"g4+", G4PlusItineraries, [Directive750, FeatureAltivec]>;
+def : Processor<"750", G4Itineraries, [Directive750, FeatureAltivec]>;
+def : Processor<"970", G5Itineraries,
+                  [Directive970, FeatureAltivec,
+                   FeatureGPUL, FeatureFSqrt, FeatureSTFIWX,
+                   Feature64Bit /*, Feature64BitRegs */]>;
+def : Processor<"g5", G5Itineraries,
+                  [Directive970, FeatureAltivec,
+                   FeatureGPUL, FeatureFSqrt, FeatureSTFIWX,
+                   Feature64Bit /*, Feature64BitRegs */]>;
+def : Processor<"ppc", G3Itineraries, [Directive32]>;
+def : Processor<"ppc64", G5Itineraries,
+                  [Directive64, FeatureAltivec,
+                   FeatureGPUL, FeatureFSqrt, FeatureSTFIWX,
+                   Feature64Bit /*, Feature64BitRegs */]>;
+
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "PPCCallingConv.td"
+
+def PPCInstrInfo : InstrInfo {
+  // Define how we want to layout our TargetSpecific information field... This
+  // should be kept up-to-date with the fields in the PPCInstrInfo.h file.
+  let TSFlagsFields = ["PPC970_First",
+                       "PPC970_Single",
+                       "PPC970_Cracked",
+                       "PPC970_Unit"];
+  let TSFlagsShifts = [0, 1, 2, 3];
+
+  let isLittleEndianEncoding = 1;
+}
+
+
+def PPC : Target {
+  // Information about the instructions.
+  let InstructionSet = PPCInstrInfo;
+}
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
new file mode 100644
index 0000000..2880196
--- /dev/null
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -0,0 +1,1101 @@
+//===-- PPCAsmPrinter.cpp - Print machine instrs to PowerPC assembly --------=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to PowerPC assembly language. This printer is
+// the output mechanism used by `llc'.
+//
+// Documentation at http://developer.apple.com/documentation/DeveloperTools/
+// Reference/Assembler/ASMIntroduction/chapter_1_section_1.html
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asmprinter"
+#include "PPC.h"
+#include "PPCPredicates.h"
+#include "PPCTargetMachine.h"
+#include "PPCSubtarget.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/MRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+namespace {
+  struct VISIBILITY_HIDDEN PPCAsmPrinter : public AsmPrinter {
+    std::set<std::string> FnStubs, GVStubs;
+    const PPCSubtarget &Subtarget;
+
+    PPCAsmPrinter(std::ostream &O, TargetMachine &TM, const TargetAsmInfo *T)
+      : AsmPrinter(O, TM, T), Subtarget(TM.getSubtarget<PPCSubtarget>()) {
+    }
+
+    virtual const char *getPassName() const {
+      return "PowerPC Assembly Printer";
+    }
+
+    PPCTargetMachine &getTM() {
+      return static_cast<PPCTargetMachine&>(TM);
+    }
+
+    unsigned enumRegToMachineReg(unsigned enumReg) {
+      switch (enumReg) {
+      default: assert(0 && "Unhandled register!"); break;
+      case PPC::CR0:  return  0;
+      case PPC::CR1:  return  1;
+      case PPC::CR2:  return  2;
+      case PPC::CR3:  return  3;
+      case PPC::CR4:  return  4;
+      case PPC::CR5:  return  5;
+      case PPC::CR6:  return  6;
+      case PPC::CR7:  return  7;
+      }
+      abort();
+    }
+
+    /// printInstruction - This method is automatically generated by tablegen
+    /// from the instruction set description.  This method returns true if the
+    /// machine instruction was sufficiently described to print it, otherwise it
+    /// returns false.
+    bool printInstruction(const MachineInstr *MI);
+
+    void printMachineInstruction(const MachineInstr *MI);
+    void printOp(const MachineOperand &MO);
+    
+    /// stripRegisterPrefix - This method strips the character prefix from a
+    /// register name so that only the number is left.  Used for Linux asm.
+    const char *stripRegisterPrefix(const char *RegName) {
+      switch (RegName[0]) {
+      case 'r':
+      case 'f':
+      case 'v': return RegName + 1;
+      case 'c': if (RegName[1] == 'r') return RegName + 2;
+      }
+       
+      return RegName;
+    }
+    
+    /// printRegister - Print register according to target requirements.
+    ///
+    void printRegister(const MachineOperand &MO, bool R0AsZero) {
+      unsigned RegNo = MO.getReg();
+      assert(MRegisterInfo::isPhysicalRegister(RegNo) && "Not physreg??");
+      
+      // If we should use 0 for R0.
+      if (R0AsZero && RegNo == PPC::R0) {
+        O << "0";
+        return;
+      }
+      
+      const char *RegName = TM.getRegisterInfo()->get(RegNo).Name;
+      // Linux assembler (Others?) does not take register mnemonics.
+      // FIXME - What about special registers used in mfspr/mtspr?
+      if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName);
+      O << RegName;
+    }
+
+    void printOperand(const MachineInstr *MI, unsigned OpNo) {
+      const MachineOperand &MO = MI->getOperand(OpNo);
+      if (MO.isRegister()) {
+        printRegister(MO, false);
+      } else if (MO.isImmediate()) {
+        O << MO.getImmedValue();
+      } else {
+        printOp(MO);
+      }
+    }
+    
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode);
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                               unsigned AsmVariant, const char *ExtraCode);
+    
+    
+    void printS5ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+      char value = MI->getOperand(OpNo).getImmedValue();
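+      // Sign-extend the low 5 bits so negative immediates print correctly.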
+      value = (value << (32-5)) >> (32-5);
+      O << (int)value;
+    }
+    void printU5ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+      unsigned char value = MI->getOperand(OpNo).getImmedValue();
+      assert(value <= 31 && "Invalid u5imm argument!");
+      O << (unsigned int)value;
+    }
+    void printU6ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+      unsigned char value = MI->getOperand(OpNo).getImmedValue();
+      assert(value <= 63 && "Invalid u6imm argument!");
+      O << (unsigned int)value;
+    }
+    void printS16ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+      O << (short)MI->getOperand(OpNo).getImmedValue();
+    }
+    void printU16ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+      O << (unsigned short)MI->getOperand(OpNo).getImmedValue();
+    }
+    void printS16X4ImmOperand(const MachineInstr *MI, unsigned OpNo) {
+      if (MI->getOperand(OpNo).isImmediate()) {
+        O << (short)(MI->getOperand(OpNo).getImmedValue()*4);
+      } else {
+        O << "lo16(";
+        printOp(MI->getOperand(OpNo));
+        if (TM.getRelocationModel() == Reloc::PIC_)
+          O << "-\"L" << getFunctionNumber() << "$pb\")";
+        else
+          O << ')';
+      }
+    }
+    void printBranchOperand(const MachineInstr *MI, unsigned OpNo) {
+      // Branches can take an immediate operand.  This is used by the branch
+      // selection pass to print $+8, an eight byte displacement from the PC.
+      if (MI->getOperand(OpNo).isImmediate()) {
+        O << "$+" << MI->getOperand(OpNo).getImmedValue()*4;
+      } else {
+        printOp(MI->getOperand(OpNo));
+      }
+    }
+    void printCallOperand(const MachineInstr *MI, unsigned OpNo) {
+      const MachineOperand &MO = MI->getOperand(OpNo);
+      if (TM.getRelocationModel() != Reloc::Static) {
+        if (MO.getType() == MachineOperand::MO_GlobalAddress) {
+          GlobalValue *GV = MO.getGlobal();
+          if (((GV->isDeclaration() || GV->hasWeakLinkage() ||
+                GV->hasLinkOnceLinkage()))) {
+            // Dynamically-resolved functions need a stub for the function.
+            std::string Name = Mang->getValueName(GV);
+            FnStubs.insert(Name);
+            O << "L" << Name << "$stub";
+            if (GV->hasExternalWeakLinkage())
+              ExtWeakSymbols.insert(GV);
+            return;
+          }
+        }
+        if (MO.getType() == MachineOperand::MO_ExternalSymbol) {
+          std::string Name(TAI->getGlobalPrefix()); Name += MO.getSymbolName();
+          FnStubs.insert(Name);
+          O << "L" << Name << "$stub";
+          return;
+        }
+      }
+      
+      printOp(MI->getOperand(OpNo));
+    }
+    void printAbsAddrOperand(const MachineInstr *MI, unsigned OpNo) {
+     O << (int)MI->getOperand(OpNo).getImmedValue()*4;
+    }
+    void printPICLabel(const MachineInstr *MI, unsigned OpNo) {
+      O << "\"L" << getFunctionNumber() << "$pb\"\n";
+      O << "\"L" << getFunctionNumber() << "$pb\":";
+    }
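+    // Darwin writes the high/low halves as ha16()/lo16(); ELF uses @ha/@l.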
+    void printSymbolHi(const MachineInstr *MI, unsigned OpNo) {
+      if (MI->getOperand(OpNo).isImmediate()) {
+        printS16ImmOperand(MI, OpNo);
+      } else {
+        if (Subtarget.isDarwin()) O << "ha16(";
+        printOp(MI->getOperand(OpNo));
+        if (TM.getRelocationModel() == Reloc::PIC_)
+          O << "-\"L" << getFunctionNumber() << "$pb\"";
+        if (Subtarget.isDarwin())
+          O << ')';
+        else
+          O << "@ha";
+      }
+    }
+    void printSymbolLo(const MachineInstr *MI, unsigned OpNo) {
+      if (MI->getOperand(OpNo).isImmediate()) {
+        printS16ImmOperand(MI, OpNo);
+      } else {
+        if (Subtarget.isDarwin()) O << "lo16(";
+        printOp(MI->getOperand(OpNo));
+        if (TM.getRelocationModel() == Reloc::PIC_)
+          O << "-\"L" << getFunctionNumber() << "$pb\"";
+        if (Subtarget.isDarwin())
+          O << ')';
+        else
+          O << "@l";
+      }
+    }
+    void printcrbitm(const MachineInstr *MI, unsigned OpNo) {
+      unsigned CCReg = MI->getOperand(OpNo).getReg();
+      unsigned RegNo = enumRegToMachineReg(CCReg);
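+      // Emit a one-hot field mask for this CR field; CR0 maps to the MSB (0x80).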
+      O << (0x80 >> RegNo);
+    }
+    // The new addressing mode printers.
+    void printMemRegImm(const MachineInstr *MI, unsigned OpNo) {
+      printSymbolLo(MI, OpNo);
+      O << '(';
+      if (MI->getOperand(OpNo+1).isRegister() && 
+          MI->getOperand(OpNo+1).getReg() == PPC::R0)
+        O << "0";
+      else
+        printOperand(MI, OpNo+1);
+      O << ')';
+    }
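+    // Like printMemRegImm, but the immediate is stored divided by 4 and must be
+    // printed scaled back up, as required for DS-form loads/stores (ld/std/lwa).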
+    void printMemRegImmShifted(const MachineInstr *MI, unsigned OpNo) {
+      if (MI->getOperand(OpNo).isImmediate())
+        printS16X4ImmOperand(MI, OpNo);
+      else 
+        printSymbolLo(MI, OpNo);
+      O << '(';
+      if (MI->getOperand(OpNo+1).isRegister() && 
+          MI->getOperand(OpNo+1).getReg() == PPC::R0)
+        O << "0";
+      else
+        printOperand(MI, OpNo+1);
+      O << ')';
+    }
+    
+    void printMemRegReg(const MachineInstr *MI, unsigned OpNo) {
+      // When used as the base register, r0 reads constant zero rather than
+      // the value contained in the register.  For this reason, the darwin
+      // assembler requires that we print r0 as 0 (no r) when used as the base.
+      const MachineOperand &MO = MI->getOperand(OpNo);
+      printRegister(MO, true);
+      O << ", ";
+      printOperand(MI, OpNo+1);
+    }
+    
+    void printPredicateOperand(const MachineInstr *MI, unsigned OpNo, 
+                               const char *Modifier);
+    
+    virtual bool runOnMachineFunction(MachineFunction &F) = 0;
+    virtual bool doFinalization(Module &M) = 0;
+
+    virtual void EmitExternalGlobal(const GlobalVariable *GV);
+  };
+
+  /// LinuxAsmPrinter - PowerPC assembly printer, customized for Linux
+  struct VISIBILITY_HIDDEN LinuxAsmPrinter : public PPCAsmPrinter {
+
+    DwarfWriter DW;
+
+    LinuxAsmPrinter(std::ostream &O, PPCTargetMachine &TM,
+                    const TargetAsmInfo *T)
+      : PPCAsmPrinter(O, TM, T), DW(O, this, T) {
+    }
+
+    virtual const char *getPassName() const {
+      return "Linux PPC Assembly Printer";
+    }
+
+    bool runOnMachineFunction(MachineFunction &F);
+    bool doInitialization(Module &M);
+    bool doFinalization(Module &M);
+    
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+      AU.addRequired<MachineModuleInfo>();
+      PPCAsmPrinter::getAnalysisUsage(AU);
+    }
+
+    /// getSectionForFunction - Return the section that we should emit the
+    /// specified function body into.
+    virtual std::string getSectionForFunction(const Function &F) const;
+  };
+
+  /// DarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac OS
+  /// X
+  struct VISIBILITY_HIDDEN DarwinAsmPrinter : public PPCAsmPrinter {
+  
+    DwarfWriter DW;
+
+    DarwinAsmPrinter(std::ostream &O, PPCTargetMachine &TM,
+                     const TargetAsmInfo *T)
+      : PPCAsmPrinter(O, TM, T), DW(O, this, T) {
+    }
+
+    virtual const char *getPassName() const {
+      return "Darwin PPC Assembly Printer";
+    }
+    
+    bool runOnMachineFunction(MachineFunction &F);
+    bool doInitialization(Module &M);
+    bool doFinalization(Module &M);
+    
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+      AU.addRequired<MachineModuleInfo>();
+      PPCAsmPrinter::getAnalysisUsage(AU);
+    }
+
+    /// getSectionForFunction - Return the section that we should emit the
+    /// specified function body into.
+    virtual std::string getSectionForFunction(const Function &F) const;
+  };
+} // end of anonymous namespace
+
+// Include the auto-generated portion of the assembly writer
+#include "PPCGenAsmWriter.inc"
+
+void PPCAsmPrinter::printOp(const MachineOperand &MO) {
+  switch (MO.getType()) {
+  case MachineOperand::MO_Immediate:
+    cerr << "printOp() does not handle immediate values\n";
+    abort();
+    return;
+
+  case MachineOperand::MO_MachineBasicBlock:
+    printBasicBlockLabel(MO.getMachineBasicBlock());
+    return;
+  case MachineOperand::MO_JumpTableIndex:
+    O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+      << '_' << MO.getJumpTableIndex();
+    // FIXME: PIC relocation model
+    return;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+      << '_' << MO.getConstantPoolIndex();
+    return;
+  case MachineOperand::MO_ExternalSymbol:
+    // Computing the address of an external symbol, not calling it.
+    if (TM.getRelocationModel() != Reloc::Static) {
+      std::string Name(TAI->getGlobalPrefix()); Name += MO.getSymbolName();
+      GVStubs.insert(Name);
+      O << "L" << Name << "$non_lazy_ptr";
+      return;
+    }
+    O << TAI->getGlobalPrefix() << MO.getSymbolName();
+    return;
+  case MachineOperand::MO_GlobalAddress: {
+    // Computing the address of a global symbol, not calling it.
+    GlobalValue *GV = MO.getGlobal();
+    std::string Name = Mang->getValueName(GV);
+
+    // External or weakly linked global variables need non-lazily-resolved stubs
+    if (TM.getRelocationModel() != Reloc::Static) {
+      if (((GV->isDeclaration() || GV->hasWeakLinkage() ||
+            GV->hasLinkOnceLinkage()))) {
+        GVStubs.insert(Name);
+        O << "L" << Name << "$non_lazy_ptr";
+        return;
+      }
+    }
+    O << Name;
+    
+    if (MO.getOffset() > 0)
+      O << "+" << MO.getOffset();
+    else if (MO.getOffset() < 0)
+      O << MO.getOffset();
+    
+    if (GV->hasExternalWeakLinkage())
+      ExtWeakSymbols.insert(GV);
+    return;
+  }
+
+  default:
+    O << "<unknown operand type: " << MO.getType() << ">";
+    return;
+  }
+}
+
+/// EmitExternalGlobal - In this case we need to use the indirect symbol.
+///
+void PPCAsmPrinter::EmitExternalGlobal(const GlobalVariable *GV) {
+  std::string Name = getGlobalLinkName(GV);
+  if (TM.getRelocationModel() != Reloc::Static) {
+    GVStubs.insert(Name);
+    O << "L" << Name << "$non_lazy_ptr";
+    return;
+  }
+  O << Name;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                    unsigned AsmVariant, 
+                                    const char *ExtraCode) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+    
+    switch (ExtraCode[0]) {
+    default: return true;  // Unknown modifier.
+    case 'c': // Don't print "$" before a global var name or constant.
+      // PPC never has a prefix.
+      printOperand(MI, OpNo);
+      return false;
+    case 'L': // Write second word of DImode reference.  
+      // Verify that this operand has two consecutive registers.
+      if (!MI->getOperand(OpNo).isRegister() ||
+          OpNo+1 == MI->getNumOperands() ||
+          !MI->getOperand(OpNo+1).isRegister())
+        return true;
+      ++OpNo;   // Return the high-part.
+      break;
+    case 'I':
+      // Write 'i' if an integer constant, otherwise nothing.  Used to print
+      // addi vs add, etc.
+      if (MI->getOperand(OpNo).isImm())
+        O << "i";
+      return false;
+    }
+  }
+  
+  printOperand(MI, OpNo);
+  return false;
+}
+
+bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                                          unsigned AsmVariant, 
+                                          const char *ExtraCode) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+  if (MI->getOperand(OpNo).isRegister())
+    printMemRegReg(MI, OpNo);
+  else
+    printMemRegImm(MI, OpNo);
+  return false;
+}
+
+void PPCAsmPrinter::printPredicateOperand(const MachineInstr *MI, unsigned OpNo, 
+                                          const char *Modifier) {
+  assert(Modifier && "Must specify 'cc' or 'reg' as predicate op modifier!");
+  unsigned Code = MI->getOperand(OpNo).getImm();
+  if (!strcmp(Modifier, "cc")) {
+    switch ((PPC::Predicate)Code) {
+    case PPC::PRED_ALWAYS: return; // Don't print anything for always.
+    case PPC::PRED_LT: O << "lt"; return;
+    case PPC::PRED_LE: O << "le"; return;
+    case PPC::PRED_EQ: O << "eq"; return;
+    case PPC::PRED_GE: O << "ge"; return;
+    case PPC::PRED_GT: O << "gt"; return;
+    case PPC::PRED_NE: O << "ne"; return;
+    case PPC::PRED_UN: O << "un"; return;
+    case PPC::PRED_NU: O << "nu"; return;
+    }
+      
+  } else {
+    assert(!strcmp(Modifier, "reg") &&
+           "Need to specify 'cc' or 'reg' as predicate op modifier!");
+    // Don't print the register for 'always'.
+    if (Code == PPC::PRED_ALWAYS) return;
+    printOperand(MI, OpNo+1);
+  }
+}
+
+
+/// printMachineInstruction -- Print out a single PowerPC MI in Darwin syntax to
+/// the current output stream.
+///
+void PPCAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+  ++EmittedInsts;
+
+  // Check for slwi/srwi mnemonics.
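+  //   rlwinm RA, RS, SH, 0, 31-SH == slwi RA, RS, SH
+  //   rlwinm RA, RS, 32-n, n, 31  == srwi RA, RS, n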
+  if (MI->getOpcode() == PPC::RLWINM) {
+    bool FoundMnemonic = false;
+    unsigned char SH = MI->getOperand(2).getImmedValue();
+    unsigned char MB = MI->getOperand(3).getImmedValue();
+    unsigned char ME = MI->getOperand(4).getImmedValue();
+    if (SH <= 31 && MB == 0 && ME == (31-SH)) {
+      O << "slwi "; FoundMnemonic = true;
+    }
+    if (SH <= 31 && MB == (32-SH) && ME == 31) {
+      O << "srwi "; FoundMnemonic = true;
+      SH = 32-SH;
+    }
+    if (FoundMnemonic) {
+      printOperand(MI, 0);
+      O << ", ";
+      printOperand(MI, 1);
+      O << ", " << (unsigned int)SH << "\n";
+      return;
+    }
+  } else if (MI->getOpcode() == PPC::OR || MI->getOpcode() == PPC::OR8) {
+    if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
+      O << "mr ";
+      printOperand(MI, 0);
+      O << ", ";
+      printOperand(MI, 1);
+      O << "\n";
+      return;
+    }
+  } else if (MI->getOpcode() == PPC::RLDICR) {
+    unsigned char SH = MI->getOperand(2).getImmedValue();
+    unsigned char ME = MI->getOperand(3).getImmedValue();
+    // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH
+    if (63-SH == ME) {
+      O << "sldi ";
+      printOperand(MI, 0);
+      O << ", ";
+      printOperand(MI, 1);
+      O << ", " << (unsigned int)SH << "\n";
+      return;
+    }
+  }
+
+  if (printInstruction(MI))
+    return; // Printer was automatically generated
+
+  assert(0 && "Unhandled instruction in asm writer!");
+  abort();
+  return;
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool LinuxAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  DW.SetModuleInfo(&getAnalysis<MachineModuleInfo>());
+
+  SetupMachineFunction(MF);
+  O << "\n\n";
+  
+  // Print out constants referenced by the function
+  EmitConstantPool(MF.getConstantPool());
+
+  // Print out labels for the function.
+  const Function *F = MF.getFunction();
+  SwitchToTextSection(getSectionForFunction(*F).c_str(), F);
+  
+  switch (F->getLinkage()) {
+  default: assert(0 && "Unknown linkage type!");
+  case Function::InternalLinkage:  // Symbols default to internal.
+    break;
+  case Function::ExternalLinkage:
+    O << "\t.global\t" << CurrentFnName << '\n'
+      << "\t.type\t" << CurrentFnName << ", @function\n";
+    break;
+  case Function::WeakLinkage:
+  case Function::LinkOnceLinkage:
+    O << "\t.global\t" << CurrentFnName << '\n';
+    O << "\t.weak\t" << CurrentFnName << '\n';
+    break;
+  }
+  
+  if (F->hasHiddenVisibility())
+    if (const char *Directive = TAI->getHiddenDirective())
+      O << Directive << CurrentFnName << "\n";
+  
+  EmitAlignment(2, F);
+  O << CurrentFnName << ":\n";
+
+  // Emit pre-function debug information.
+  DW.BeginFunction(&MF);
+
+  // Print out code for the function.
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    // Print a label for the basic block.
+    if (I != MF.begin()) {
+      printBasicBlockLabel(I, true);
+      O << '\n';
+    }
+    for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+         II != E; ++II) {
+      // Print the assembly for the instruction.
+      O << "\t";
+      printMachineInstruction(II);
+    }
+  }
+
+  O << "\t.size\t" << CurrentFnName << ",.-" << CurrentFnName << "\n";
+
+  // Print out jump tables referenced by the function.
+  EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+  
+  // Emit post-function debug information.
+  DW.EndFunction();
+  
+  // We didn't modify anything.
+  return false;
+}
+
+bool LinuxAsmPrinter::doInitialization(Module &M) {
+  AsmPrinter::doInitialization(M);
+  
+  // GNU as handles section names wrapped in quotes
+  Mang->setUseQuotes(true);
+
+  SwitchToTextSection(TAI->getTextSection());
+  
+  // Emit initial debug information.
+  DW.BeginModule(&M);
+  return false;
+}
+
+bool LinuxAsmPrinter::doFinalization(Module &M) {
+  const TargetData *TD = TM.getTargetData();
+
+  // Print out module-level global variables here.
+  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+    if (!I->hasInitializer()) continue;   // External globals require no code
+    
+    // Check to see if this is a special global used by LLVM, if so, emit it.
+    if (EmitSpecialLLVMGlobal(I))
+      continue;
+
+    std::string name = Mang->getValueName(I);
+
+    if (I->hasHiddenVisibility())
+      if (const char *Directive = TAI->getHiddenDirective())
+        O << Directive << name << "\n";
+    
+    Constant *C = I->getInitializer();
+    unsigned Size = TD->getTypeSize(C->getType());
+    unsigned Align = TD->getPreferredAlignmentLog(I);
+
+    if (C->isNullValue() && /* FIXME: Verify correct */
+        (I->hasInternalLinkage() || I->hasWeakLinkage() ||
+         I->hasLinkOnceLinkage() ||
+         (I->hasExternalLinkage() && !I->hasSection()))) {
+      if (Size == 0) Size = 1;   // .comm Foo, 0 is undefined, avoid it.
+      if (I->hasExternalLinkage()) {
+        O << "\t.global " << name << '\n';
+        O << "\t.type " << name << ", @object\n";
+        //O << "\t.zerofill __DATA, __common, " << name << ", "
+        //  << Size << ", " << Align;
+      } else if (I->hasInternalLinkage()) {
+        SwitchToDataSection("\t.data", I);
+        O << TAI->getLCOMMDirective() << name << "," << Size;
+      } else {
+        SwitchToDataSection("\t.data", I);
+        O << ".comm " << name << "," << Size;
+      }
+      O << "\t\t" << TAI->getCommentString() << " '" << I->getName() << "'\n";
+    } else {
+      switch (I->getLinkage()) {
+      case GlobalValue::LinkOnceLinkage:
+      case GlobalValue::WeakLinkage:
+        O << "\t.global " << name << '\n'
+          << "\t.type " << name << ", @object\n"
+          << "\t.weak " << name << '\n';
+        SwitchToDataSection("\t.data", I);
+        break;
+      case GlobalValue::AppendingLinkage:
+        // FIXME: appending linkage variables should go into a section of
+        // their name or something.  For now, just emit them as external.
+      case GlobalValue::ExternalLinkage:
+        // If external or appending, declare as a global symbol
+        O << "\t.global " << name << "\n"
+          << "\t.type " << name << ", @object\n";
+        // FALL THROUGH
+      case GlobalValue::InternalLinkage:
+        if (I->isConstant()) {
+          const ConstantArray *CVA = dyn_cast<ConstantArray>(C);
+          if (TAI->getCStringSection() && CVA && CVA->isCString()) {
+            SwitchToDataSection(TAI->getCStringSection(), I);
+            break;
+          }
+        }
+
+        // FIXME: special handling for ".ctors" & ".dtors" sections
+        if (I->hasSection() &&
+            (I->getSection() == ".ctors" ||
+             I->getSection() == ".dtors")) {
+          std::string SectionName = ".section " + I->getSection()
+                                                + ",\"aw\",@progbits";
+          SwitchToDataSection(SectionName.c_str());
+        } else {
+          if (I->isConstant() && TAI->getReadOnlySection())
+            SwitchToDataSection(TAI->getReadOnlySection(), I);
+          else
+            SwitchToDataSection(TAI->getDataSection(), I);
+        }
+        break;
+      default:
+        cerr << "Unknown linkage type!";
+        abort();
+      }
+
+      EmitAlignment(Align, I);
+      O << name << ":\t\t\t\t" << TAI->getCommentString() << " '"
+        << I->getName() << "'\n";
+
+      // If the initializer is an extern weak symbol, remember to emit the weak
+      // reference!
+      if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+        if (GV->hasExternalWeakLinkage())
+          ExtWeakSymbols.insert(GV);
+
+      EmitGlobalConstant(C);
+      O << '\n';
+    }
+  }
+
+  // TODO
+
+  // Emit initial debug information.
+  DW.EndModule();
+
+  AsmPrinter::doFinalization(M);
+  return false; // success
+}
+
+std::string LinuxAsmPrinter::getSectionForFunction(const Function &F) const {
+  switch (F.getLinkage()) {
+  default: assert(0 && "Unknown linkage type!");
+  case Function::ExternalLinkage:
+  case Function::InternalLinkage: return TAI->getTextSection();
+  case Function::WeakLinkage:
+  case Function::LinkOnceLinkage:
+    return ".text";
+  }
+}
+
+std::string DarwinAsmPrinter::getSectionForFunction(const Function &F) const {
+  switch (F.getLinkage()) {
+  default: assert(0 && "Unknown linkage type!");
+  case Function::ExternalLinkage:
+  case Function::InternalLinkage: return TAI->getTextSection();
+  case Function::WeakLinkage:
+  case Function::LinkOnceLinkage:
+    return ".section __TEXT,__textcoal_nt,coalesced,pure_instructions";
+  }
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool DarwinAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  DW.SetModuleInfo(&getAnalysis<MachineModuleInfo>());
+
+  SetupMachineFunction(MF);
+  O << "\n\n";
+  
+  // Print out constants referenced by the function
+  EmitConstantPool(MF.getConstantPool());
+
+  // Print out labels for the function.
+  const Function *F = MF.getFunction();
+  SwitchToTextSection(getSectionForFunction(*F).c_str(), F);
+  
+  switch (F->getLinkage()) {
+  default: assert(0 && "Unknown linkage type!");
+  case Function::InternalLinkage:  // Symbols default to internal.
+    break;
+  case Function::ExternalLinkage:
+    O << "\t.globl\t" << CurrentFnName << "\n";
+    break;
+  case Function::WeakLinkage:
+  case Function::LinkOnceLinkage:
+    O << "\t.globl\t" << CurrentFnName << "\n";
+    O << "\t.weak_definition\t" << CurrentFnName << "\n";
+    break;
+  }
+  
+  if (F->hasHiddenVisibility())
+    if (const char *Directive = TAI->getHiddenDirective())
+      O << Directive << CurrentFnName << "\n";
+  
+  EmitAlignment(4, F);
+  O << CurrentFnName << ":\n";
+
+  // Emit pre-function debug information.
+  DW.BeginFunction(&MF);
+
+  // Print out code for the function.
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    // Print a label for the basic block.
+    if (I != MF.begin()) {
+      printBasicBlockLabel(I, true);
+      O << '\n';
+    }
+    for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+         II != E; ++II) {
+      // Print the assembly for the instruction.
+      O << "\t";
+      printMachineInstruction(II);
+    }
+  }
+
+  // Print out jump tables referenced by the function.
+  EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+  
+  // Emit post-function debug information.
+  DW.EndFunction();
+  
+  // We didn't modify anything.
+  return false;
+}
+
+
+bool DarwinAsmPrinter::doInitialization(Module &M) {
+  static const char *CPUDirectives[] = {
+    "ppc",
+    "ppc601",
+    "ppc602",
+    "ppc603",
+    "ppc7400",
+    "ppc750",
+    "ppc970",
+    "ppc64"
+  };
+
+  unsigned Directive = Subtarget.getDarwinDirective();
+  if (Subtarget.isGigaProcessor() && Directive < PPC::DIR_970)
+    Directive = PPC::DIR_970;
+  if (Subtarget.hasAltivec() && Directive < PPC::DIR_7400)
+    Directive = PPC::DIR_7400;
+  if (Subtarget.isPPC64() && Directive < PPC::DIR_970)
+    Directive = PPC::DIR_64;
+  assert(Directive <= PPC::DIR_64 && "Directive out of range.");
+  O << "\t.machine " << CPUDirectives[Directive] << "\n";
+     
+  AsmPrinter::doInitialization(M);
+  
+  // Darwin wants symbols to be quoted if they have complex names.
+  Mang->setUseQuotes(true);
+  
+  // Prime text sections so they are adjacent.  This reduces the likelihood that
+  // a large data or debug section causes a branch to exceed the 16M limit.
+  SwitchToTextSection(".section __TEXT,__textcoal_nt,coalesced,"
+                      "pure_instructions");
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    SwitchToTextSection(".section __TEXT,__picsymbolstub1,symbol_stubs,"
+                          "pure_instructions,32");
+  } else if (TM.getRelocationModel() == Reloc::DynamicNoPIC) {
+    SwitchToTextSection(".section __TEXT,__symbol_stub1,symbol_stubs,"
+                        "pure_instructions,16");
+  }
+  SwitchToTextSection(TAI->getTextSection());
+  
+  // Emit initial debug information.
+  DW.BeginModule(&M);
+  return false;
+}
+
+bool DarwinAsmPrinter::doFinalization(Module &M) {
+  const TargetData *TD = TM.getTargetData();
+
+  // Print out module-level global variables here.
+  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+    if (!I->hasInitializer()) continue;   // External globals require no code
+    
+    // Check to see if this is a special global used by LLVM, if so, emit it.
+    if (EmitSpecialLLVMGlobal(I)) {
+      if (TM.getRelocationModel() == Reloc::Static) {
+        if (I->getName() == "llvm.global_ctors")
+          O << ".reference .constructors_used\n";
+        else if (I->getName() == "llvm.global_dtors")
+          O << ".reference .destructors_used\n";
+      }
+      continue;
+    }
+    
+    std::string name = Mang->getValueName(I);
+    
+    if (I->hasHiddenVisibility())
+      if (const char *Directive = TAI->getHiddenDirective())
+        O << Directive << name << "\n";
+    
+    Constant *C = I->getInitializer();
+    const Type *Type = C->getType();
+    unsigned Size = TD->getTypeSize(Type);
+    unsigned Align = TD->getPreferredAlignmentLog(I);
+
+    if (C->isNullValue() && /* FIXME: Verify correct */
+        (I->hasInternalLinkage() || I->hasWeakLinkage() ||
+         I->hasLinkOnceLinkage() ||
+         (I->hasExternalLinkage() && !I->hasSection()))) {
+      if (Size == 0) Size = 1;   // .comm Foo, 0 is undefined, avoid it.
+      if (I->hasExternalLinkage()) {
+        O << "\t.globl " << name << '\n';
+        O << "\t.zerofill __DATA, __common, " << name << ", "
+          << Size << ", " << Align;
+      } else if (I->hasInternalLinkage()) {
+        SwitchToDataSection("\t.data", I);
+        O << TAI->getLCOMMDirective() << name << "," << Size << "," << Align;
+      } else {
+        SwitchToDataSection("\t.data", I);
+        O << ".comm " << name << "," << Size;
+      }
+      O << "\t\t" << TAI->getCommentString() << " '" << I->getName() << "'\n";
+    } else {
+      switch (I->getLinkage()) {
+      case GlobalValue::LinkOnceLinkage:
+      case GlobalValue::WeakLinkage:
+        O << "\t.globl " << name << '\n'
+          << "\t.weak_definition " << name << '\n';
+        SwitchToDataSection(".section __DATA,__datacoal_nt,coalesced", I);
+        break;
+      case GlobalValue::AppendingLinkage:
+        // FIXME: appending linkage variables should go into a section of
+        // their name or something.  For now, just emit them as external.
+      case GlobalValue::ExternalLinkage:
+        // If external or appending, declare as a global symbol
+        O << "\t.globl " << name << "\n";
+        // FALL THROUGH
+      case GlobalValue::InternalLinkage:
+        if (I->isConstant()) {
+          const ConstantArray *CVA = dyn_cast<ConstantArray>(C);
+          if (TAI->getCStringSection() && CVA && CVA->isCString()) {
+            SwitchToDataSection(TAI->getCStringSection(), I);
+            break;
+          }
+        }
+
+        if (!I->isConstant())
+          SwitchToDataSection(TAI->getDataSection(), I);
+        else {
+          // Read-only data.
+          bool HasReloc = C->ContainsRelocations();
+          if (HasReloc &&
+              TM.getRelocationModel() != Reloc::Static)
+            SwitchToDataSection("\t.const_data\n");
+          else if (!HasReloc && Size == 4 &&
+                   TAI->getFourByteConstantSection())
+            SwitchToDataSection(TAI->getFourByteConstantSection(), I);
+          else if (!HasReloc && Size == 8 &&
+                   TAI->getEightByteConstantSection())
+            SwitchToDataSection(TAI->getEightByteConstantSection(), I);
+          else if (!HasReloc && Size == 16 &&
+                   TAI->getSixteenByteConstantSection())
+            SwitchToDataSection(TAI->getSixteenByteConstantSection(), I);
+          else if (TAI->getReadOnlySection())
+            SwitchToDataSection(TAI->getReadOnlySection(), I);
+          else
+            SwitchToDataSection(TAI->getDataSection(), I);
+        }
+        break;
+      default:
+        cerr << "Unknown linkage type!";
+        abort();
+      }
+
+      EmitAlignment(Align, I);
+      O << name << ":\t\t\t\t" << TAI->getCommentString() << " '"
+        << I->getName() << "'\n";
+
+      // If the initializer is an extern weak symbol, remember to emit the weak
+      // reference!
+      if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+        if (GV->hasExternalWeakLinkage())
+          ExtWeakSymbols.insert(GV);
+
+      EmitGlobalConstant(C);
+      O << '\n';
+    }
+  }
+
+  bool isPPC64 = TD->getPointerSizeInBits() == 64;
+
+  // Output stubs for dynamically-linked functions
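+  // Each stub materializes the address of its lazy pointer, loads the pointer,
+  // and branches through CTR; the pointer initially holds
+  // dyld_stub_binding_helper, which binds the real target on the first call.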
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    for (std::set<std::string>::iterator i = FnStubs.begin(), e = FnStubs.end();
+         i != e; ++i) {
+      SwitchToTextSection(".section __TEXT,__picsymbolstub1,symbol_stubs,"
+                          "pure_instructions,32");
+      EmitAlignment(4);
+      O << "L" << *i << "$stub:\n";
+      O << "\t.indirect_symbol " << *i << "\n";
+      O << "\tmflr r0\n";
+      O << "\tbcl 20,31,L0$" << *i << "\n";
+      O << "L0$" << *i << ":\n";
+      O << "\tmflr r11\n";
+      O << "\taddis r11,r11,ha16(L" << *i << "$lazy_ptr-L0$" << *i << ")\n";
+      O << "\tmtlr r0\n";
+      if (isPPC64)
+        O << "\tldu r12,lo16(L" << *i << "$lazy_ptr-L0$" << *i << ")(r11)\n";
+      else
+        O << "\tlwzu r12,lo16(L" << *i << "$lazy_ptr-L0$" << *i << ")(r11)\n";
+      O << "\tmtctr r12\n";
+      O << "\tbctr\n";
+      SwitchToDataSection(".lazy_symbol_pointer");
+      O << "L" << *i << "$lazy_ptr:\n";
+      O << "\t.indirect_symbol " << *i << "\n";
+      if (isPPC64)
+        O << "\t.quad dyld_stub_binding_helper\n";
+      else
+        O << "\t.long dyld_stub_binding_helper\n";
+    }
+  } else {
+    for (std::set<std::string>::iterator i = FnStubs.begin(), e = FnStubs.end();
+         i != e; ++i) {
+      SwitchToTextSection(".section __TEXT,__symbol_stub1,symbol_stubs,"
+                          "pure_instructions,16");
+      EmitAlignment(4);
+      O << "L" << *i << "$stub:\n";
+      O << "\t.indirect_symbol " << *i << "\n";
+      O << "\tlis r11,ha16(L" << *i << "$lazy_ptr)\n";
+      if (isPPC64)
+        O << "\tldu r12,lo16(L" << *i << "$lazy_ptr)(r11)\n";
+      else
+        O << "\tlwzu r12,lo16(L" << *i << "$lazy_ptr)(r11)\n";
+      O << "\tmtctr r12\n";
+      O << "\tbctr\n";
+      SwitchToDataSection(".lazy_symbol_pointer");
+      O << "L" << *i << "$lazy_ptr:\n";
+      O << "\t.indirect_symbol " << *i << "\n";
+      if (isPPC64)
+        O << "\t.quad dyld_stub_binding_helper\n";
+      else
+        O << "\t.long dyld_stub_binding_helper\n";
+    }
+  }
+
+  O << "\n";
+
+  // Output stubs for external and common global variables.
+  if (GVStubs.begin() != GVStubs.end()) {
+    SwitchToDataSection(".non_lazy_symbol_pointer");
+    for (std::set<std::string>::iterator I = GVStubs.begin(),
+         E = GVStubs.end(); I != E; ++I) {
+      O << "L" << *I << "$non_lazy_ptr:\n";
+      O << "\t.indirect_symbol " << *I << "\n";
+      if (isPPC64)
+        O << "\t.quad\t0\n";
+      else
+        O << "\t.long\t0\n";
+        
+    }
+  }
+
+  // Emit initial debug information.
+  DW.EndModule();
+
+  // Funny Darwin hack: This flag tells the linker that no global symbols
+  // contain code that falls through to other global symbols (e.g. the obvious
+  // implementation of multiple entry points).  If this doesn't occur, the
+  // linker can safely perform dead code stripping.  Since LLVM never generates
+  // code that does this, it is always safe to set.
+  O << "\t.subsections_via_symbols\n";
+
+  AsmPrinter::doFinalization(M);
+  return false; // success
+}
+
+
+
+/// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code
+/// for a MachineFunction to the given output stream, in a format that the
+/// Darwin assembler can deal with.
+///
+FunctionPass *llvm::createPPCAsmPrinterPass(std::ostream &o,
+                                            PPCTargetMachine &tm) {
+  const PPCSubtarget *Subtarget = &tm.getSubtarget<PPCSubtarget>();
+
+  if (Subtarget->isDarwin()) {
+    return new DarwinAsmPrinter(o, tm, tm.getTargetAsmInfo());
+  } else {
+    return new LinuxAsmPrinter(o, tm, tm.getTargetAsmInfo());
+  }
+}
+
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
new file mode 100644
index 0000000..4286f01
--- /dev/null
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -0,0 +1,199 @@
+//===-- PPCBranchSelector.cpp - Emit long conditional branches-----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Nate Begeman and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that scans a machine function to determine which
+// conditional branches need more than 16 bits of displacement to reach their
+// target basic block.  It does this in two passes: one that computes basic
+// block positions, and one that converts branch pseudo ops to machine branch
+// opcodes.  This pass should be run last, just before the assembly printer.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppc-branch-select"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "PPCPredicates.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+STATISTIC(NumExpanded, "Number of branches expanded to long format");
+
+namespace {
+  struct VISIBILITY_HIDDEN PPCBSel : public MachineFunctionPass {
+    static char ID;
+    PPCBSel() : MachineFunctionPass((intptr_t)&ID) {}
+
+    /// BlockSizes - The sizes of the basic blocks in the function.
+    std::vector<unsigned> BlockSizes;
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "PowerPC Branch Selector";
+    }
+  };
+  char PPCBSel::ID = 0;
+}
+
+/// createPPCBranchSelectionPass - returns an instance of the Branch Selection
+/// Pass
+///
+FunctionPass *llvm::createPPCBranchSelectionPass() {
+  return new PPCBSel();
+}
+
+/// getNumBytesForInstruction - Return the number of bytes of code the specified
+/// instruction may occupy.  This returns the maximum number of bytes.
+///
+static unsigned getNumBytesForInstruction(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case PPC::IMPLICIT_DEF_GPRC: // no asm emitted
+  case PPC::IMPLICIT_DEF_G8RC: // no asm emitted
+  case PPC::IMPLICIT_DEF_F4:   // no asm emitted
+  case PPC::IMPLICIT_DEF_F8:   // no asm emitted
+  case PPC::IMPLICIT_DEF_VRRC: // no asm emitted
+    return 0;
+  case PPC::INLINEASM: {       // Inline Asm: Variable size.
+    MachineFunction *MF = MI->getParent()->getParent();
+    const char *AsmStr = MI->getOperand(0).getSymbolName();
+    return MF->getTarget().getTargetAsmInfo()->getInlineAsmLength(AsmStr);
+  }
+  case PPC::LABEL: {
+    return 0;
+  }
+  default:
+    return 4; // PowerPC instructions are all 4 bytes
+  }
+}
+
+
+bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+  // Give the blocks of the function a dense, in-order, numbering.
+  Fn.RenumberBlocks();
+  BlockSizes.resize(Fn.getNumBlockIDs());
+
+  // Measure each MBB and compute a size for the entire function.
+  unsigned FuncSize = 0;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock *MBB = MFI;
+
+    unsigned BlockSize = 0;
+    for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end();
+         MBBI != EE; ++MBBI)
+      BlockSize += getNumBytesForInstruction(MBBI);
+    
+    BlockSizes[MBB->getNumber()] = BlockSize;
+    FuncSize += BlockSize;
+  }
+  
+  // If the entire function is smaller than the displacement of a branch field,
+  // we know we don't need to shrink any branches in this function.  This is a
+  // common case.
+  if (FuncSize < (1 << 15)) {
+    BlockSizes.clear();
+    return false;
+  }
+  
+  // For each conditional branch, if the offset to its destination is larger
+  // than the offset field allows, transform it into a long branch sequence
+  // like this:
+  //   short branch:
+  //     bCC MBB
+  //   long branch:
+  //     b!CC $PC+8
+  //     b MBB
+  //
+  bool MadeChange = true;
+  bool EverMadeChange = false;
+  while (MadeChange) {
+    // Iteratively expand branches until we reach a fixed point.
+    MadeChange = false;
+  
+    for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+         ++MFI) {
+      MachineBasicBlock &MBB = *MFI;
+      unsigned MBBStartOffset = 0;
+      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+           I != E; ++I) {
+        if (I->getOpcode() != PPC::BCC || I->getOperand(2).isImm()) {
+          MBBStartOffset += getNumBytesForInstruction(I);
+          continue;
+        }
+        
+        // Determine the offset from the current branch to the destination
+        // block.
+        MachineBasicBlock *Dest = I->getOperand(2).getMachineBasicBlock();
+        
+        int BranchSize;
+        if (Dest->getNumber() <= MBB.getNumber()) {
+          // If this is a backwards branch, the delta is the offset from the
+          // start of this block to this branch, plus the sizes of all blocks
+          // from this block to the dest.
+          BranchSize = MBBStartOffset;
+          
+          for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i];
+        } else {
+          // Otherwise, add the size of the blocks between this block and the
+          // dest to the number of bytes left in this block.
+          BranchSize = -MBBStartOffset;
+
+          for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i];
+        }
+
+        // If this branch is in range, ignore it.
+        if (isInt16(BranchSize)) {
+          MBBStartOffset += 4;
+          continue;
+        }
+        
+        // Otherwise, we have to expand it to a long branch.
+        // The BCC operands are:
+        // 0. PPC branch predicate
+        // 1. CR register
+        // 2. Target MBB
+        PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm();
+        unsigned CRReg = I->getOperand(1).getReg();
+        
+        MachineInstr *OldBranch = I;
+        
+        // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition.
+        BuildMI(MBB, I, TII->get(PPC::BCC))
+          .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2);
+        
+        // Uncond branch to the real destination.
+        I = BuildMI(MBB, I, TII->get(PPC::B)).addMBB(Dest);
+
+        // Remove the old branch from the function.
+        OldBranch->eraseFromParent();
+        
+        // This instruction is now 8 bytes; increase the size of the block by 4
+        // and remember to iterate.
+        BlockSizes[MBB.getNumber()] += 4;
+        MBBStartOffset += 8;
+        ++NumExpanded;
+        MadeChange = true;
+      }
+    }
+    EverMadeChange |= MadeChange;
+  }
+  
+  BlockSizes.clear();
+  return true;
+}
+
diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
new file mode 100644
index 0000000..9e31b5a
--- /dev/null
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -0,0 +1,65 @@
+//===- PPCCallingConv.td - Calling Conventions for PowerPC ------*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the PowerPC 32- and 64-bit
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<PPCSubtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+
+// Return-value convention for PowerPC
+def RetCC_PPC : CallingConv<[
+  CCIfType<[i32], CCAssignToReg<[R3, R4]>>,
+  CCIfType<[i64], CCAssignToReg<[X3, X4]>>,
+  
+  CCIfType<[f32, f64], CCAssignToReg<[F1]>>,
+  
+  // Vector types are always returned in V2.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>
+]>;
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+/*
+def CC_PPC : CallingConv<[
+  // The first 8 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
+  CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>,
+  
+  // Common sub-targets pass FP values in F1 - F13
+  CCIfType<[f32, f64], CCIfSubtarget<"isMachoABI()",
+           CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8,F9,F10,F11,F12,F13]>>>,
+  // The ELF32 sub-target passes FP values in F1 - F8.
+  CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+           
+  // The first 12 Vector arguments are passed in altivec registers.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32],
+              CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9, V10,V11,V12,V13]>>
+
+/*
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+  
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+              CCAssignToStack<16, 16>>*/
+]>;
+
+*/
+
diff --git a/lib/Target/PowerPC/PPCCodeEmitter.cpp b/lib/Target/PowerPC/PPCCodeEmitter.cpp
new file mode 100644
index 0000000..5dceffd
--- /dev/null
+++ b/lib/Target/PowerPC/PPCCodeEmitter.cpp
@@ -0,0 +1,237 @@
+//===-- PPCCodeEmitter.cpp - JIT Code Emitter for PowerPC32 -------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC 32-bit CodeEmitter and associated machinery to
+// JIT-compile bitcode to native PowerPC.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetMachine.h"
+#include "PPCRelocations.h"
+#include "PPC.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"                   
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+  class VISIBILITY_HIDDEN PPCCodeEmitter : public MachineFunctionPass {
+    TargetMachine &TM;
+    MachineCodeEmitter &MCE;
+
+    /// MovePCtoLROffset - When/if we see a MovePCtoLR instruction, we record
+    /// its address within the function in this pointer.
+    void *MovePCtoLROffset;
+    
+    /// getMachineOpValue - evaluates the MachineOperand of a given MachineInstr
+    ///
+    int getMachineOpValue(MachineInstr &MI, MachineOperand &MO);
+
+  public:
+    static char ID;
+    PPCCodeEmitter(TargetMachine &T, MachineCodeEmitter &M)
+      : MachineFunctionPass((intptr_t)&ID), TM(T), MCE(M) {}
+
+    const char *getPassName() const { return "PowerPC Machine Code Emitter"; }
+
+    /// runOnMachineFunction - emits the given MachineFunction to memory
+    ///
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    /// emitBasicBlock - emits the given MachineBasicBlock to memory
+    ///
+    void emitBasicBlock(MachineBasicBlock &MBB);
+
+    /// getValueBit - return the particular bit of Val
+    ///
+    unsigned getValueBit(int64_t Val, unsigned bit) { return (Val >> bit) & 1; }
+
+    /// getBinaryCodeForInstr - This function, generated by the
+    /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+    /// machine instructions.
+    ///
+    unsigned getBinaryCodeForInstr(MachineInstr &MI);
+  };
+  char PPCCodeEmitter::ID = 0;
+}
+
+/// createPPCCodeEmitterPass - Return a pass that emits the collected PPC code
+/// to the specified MCE object.
+FunctionPass *llvm::createPPCCodeEmitterPass(PPCTargetMachine &TM,
+                                             MachineCodeEmitter &MCE) {
+  return new PPCCodeEmitter(TM, MCE);
+}
+
+#ifdef __APPLE__ 
+extern "C" void sys_icache_invalidate(const void *Addr, size_t len);
+#endif
+
+bool PPCCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+  assert((MF.getTarget().getRelocationModel() == Reloc::Default ||
+          MF.getTarget().getRelocationModel() == Reloc::Static) &&
+         "JIT relocation model must be set to static or default!");
+  do {
+    MovePCtoLROffset = 0;
+    MCE.startFunction(MF);
+    for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+      emitBasicBlock(*BB);
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+void PPCCodeEmitter::emitBasicBlock(MachineBasicBlock &MBB) {
+  MCE.StartMachineBasicBlock(&MBB);
+  
+  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I){
+    MachineInstr &MI = *I;
+    switch (MI.getOpcode()) {
+    default:
+      MCE.emitWordBE(getBinaryCodeForInstr(*I));
+      break;
+    case PPC::IMPLICIT_DEF_GPRC:
+    case PPC::IMPLICIT_DEF_G8RC:
+    case PPC::IMPLICIT_DEF_F8:
+    case PPC::IMPLICIT_DEF_F4:
+    case PPC::IMPLICIT_DEF_VRRC:
+      break; // pseudo opcode, no side effects
+    case PPC::MovePCtoLR:
+    case PPC::MovePCtoLR8:
+      assert(TM.getRelocationModel() == Reloc::PIC_);
+      MovePCtoLROffset = (void*)MCE.getCurrentPCValue();
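+      // The 'bl' targets the very next instruction; its only purpose is to put
+      // that instruction's address into LR as a base for PIC addressing.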
+      MCE.emitWordBE(0x48000005);   // bl 1
+      break;
+    }
+  }
+}
+
+int PPCCodeEmitter::getMachineOpValue(MachineInstr &MI, MachineOperand &MO) {
+
+  intptr_t rv = 0; // Return value; defaults to 0 for unhandled cases
+                   // or things that get fixed up later by the JIT.
+  if (MO.isRegister()) {
+    rv = PPCRegisterInfo::getRegisterNumbering(MO.getReg());
+
+    // Special encoding for MTCRF and MFOCRF, which uses a bit mask for the
+    // register, not the register number directly.
+    if ((MI.getOpcode() == PPC::MTCRF || MI.getOpcode() == PPC::MFOCRF) &&
+        (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7)) {
+      rv = 0x80 >> rv;
+    }
+  } else if (MO.isImmediate()) {
+    rv = MO.getImmedValue();
+  } else if (MO.isGlobalAddress() || MO.isExternalSymbol() ||
+             MO.isConstantPoolIndex() || MO.isJumpTableIndex()) {
+    unsigned Reloc = 0;
+    if (MI.getOpcode() == PPC::BL_Macho || MI.getOpcode() == PPC::BL8_Macho ||
+        MI.getOpcode() == PPC::BL_ELF || MI.getOpcode() == PPC::BL8_ELF)
+      Reloc = PPC::reloc_pcrel_bx;
+    else {
+      if (TM.getRelocationModel() == Reloc::PIC_) {
+        assert(MovePCtoLROffset && "MovePCtoLR not seen yet?");
+      }
+      switch (MI.getOpcode()) {
+      default: MI.dump(); assert(0 && "Unknown instruction for relocation!");
+      case PPC::LIS:
+      case PPC::LIS8:
+      case PPC::ADDIS:
+      case PPC::ADDIS8:
+        Reloc = PPC::reloc_absolute_high;       // Pointer to symbol
+        break;
+      case PPC::LI:
+      case PPC::LI8:
+      case PPC::LA:
+      // Loads.
+      case PPC::LBZ:
+      case PPC::LBZ8:
+      case PPC::LHA:
+      case PPC::LHA8:
+      case PPC::LHZ:
+      case PPC::LHZ8:
+      case PPC::LWZ:
+      case PPC::LWZ8:
+      case PPC::LFS:
+      case PPC::LFD:
+      
+      // Stores.
+      case PPC::STB:
+      case PPC::STB8:
+      case PPC::STH:
+      case PPC::STH8:
+      case PPC::STW:
+      case PPC::STW8:
+      case PPC::STFS:
+      case PPC::STFD:
+        Reloc = PPC::reloc_absolute_low;
+        break;
+
+      case PPC::LWA:
+      case PPC::LD:
+      case PPC::STD:
+      case PPC::STD_32:
+        Reloc = PPC::reloc_absolute_low_ix;
+        break;
+      }
+    }
+    
+    MachineRelocation R;
+    if (MO.isGlobalAddress()) {
+      R = MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                                   MO.getGlobal(), 0);
+    } else if (MO.isExternalSymbol()) {
+      R = MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                       Reloc, MO.getSymbolName(), 0);
+    } else if (MO.isConstantPoolIndex()) {
+      R = MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                          Reloc, MO.getConstantPoolIndex(), 0);
+    } else {
+      assert(MO.isJumpTableIndex());
+      R = MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+                                          Reloc, MO.getJumpTableIndex(), 0);
+    }
+    
+    // If in PIC mode, we need to encode the negated address of the
+    // 'movepctolr' into the unrelocated field.  After relocation, we'll have
+    // &gv-&movepctolr-4 in the imm field.  Once &movepctolr is added to the imm
+    // field, we get &gv.  This doesn't happen for branch relocations, which are
+    // always implicitly pc relative.
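+    // Illustrative numbers: if the MovePCtoLR 'bl' sits at 0x1000, LR holds
+    // 0x1004 at run time.  We store -0x1004 here; relocation adds &gv, leaving
+    // &gv - 0x1004 in the field, and adding LR at run time recovers &gv.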
+    if (TM.getRelocationModel() == Reloc::PIC_ && Reloc != PPC::reloc_pcrel_bx){
+      assert(MovePCtoLROffset && "MovePCtoLR not seen yet?");
+      R.setConstantVal(-(intptr_t)MovePCtoLROffset - 4);
+    }
+    MCE.addRelocation(R);
+    
+  } else if (MO.isMachineBasicBlock()) {
+    unsigned Reloc = 0;
+    unsigned Opcode = MI.getOpcode();
+    if (Opcode == PPC::B || Opcode == PPC::BL_Macho ||
+        Opcode == PPC::BLA_Macho || Opcode == PPC::BL_ELF || 
+        Opcode == PPC::BLA_ELF)
+      Reloc = PPC::reloc_pcrel_bx;
+    else // BCC instruction
+      Reloc = PPC::reloc_pcrel_bcx;
+    MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                               Reloc,
+                                               MO.getMachineBasicBlock()));
+  } else {
+    cerr << "ERROR: Unknown type of MachineOperand: " << MO << "\n";
+    abort();
+  }
+
+  return rv;
+}
+
+#include "PPCGenCodeEmitter.inc"
+
diff --git a/lib/Target/PowerPC/PPCFrameInfo.h b/lib/Target/PowerPC/PPCFrameInfo.h
new file mode 100644
index 0000000..81365e9
--- /dev/null
+++ b/lib/Target/PowerPC/PPCFrameInfo.h
@@ -0,0 +1,93 @@
+//===-- PPCFrameInfo.h - Define TargetFrameInfo for PowerPC -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_FRAMEINFO_H
+#define POWERPC_FRAMEINFO_H
+
+#include "PPC.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class PPCFrameInfo: public TargetFrameInfo {
+  const TargetMachine &TM;
+
+public:
+  PPCFrameInfo(const TargetMachine &tm, bool LP64)
+    : TargetFrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0), TM(tm) {
+  }
+
+  /// getReturnSaveOffset - Return the previous frame offset to save the
+  /// return address.
+  static unsigned getReturnSaveOffset(bool LP64, bool isMacho) {
+    if (isMacho)
+      return LP64 ? 16 : 8;
+    // For ELF 32 ABI:
+    return 4;
+  }
+
+  /// getFramePointerSaveOffset - Return the previous frame offset to save the
+  /// frame pointer.
+  static unsigned getFramePointerSaveOffset(bool LP64, bool isMacho) {
+    // For MachO ABI:
+    // Use the TOC save slot in the PowerPC linkage area for saving the frame
+    // pointer (if needed.)  LLVM does not generate code that uses the TOC (R2
+    // is treated as a caller saved register.)
+    if (isMacho)
+      return LP64 ? 40 : 20;
+    
+    // For ELF 32 ABI:
+    // Save it right before the link register
+    return -4U;
+  }
+  
+  /// getLinkageSize - Return the size of the PowerPC ABI linkage area.
+  ///
+  static unsigned getLinkageSize(bool LP64, bool isMacho) {
+    if (isMacho)
+      return 6 * (LP64 ? 8 : 4);
+    
+    // For ELF 32 ABI:
+    return 8;
+  }
+
+  /// getMinCallArgumentsSize - Return the size of the minimum PowerPC ABI
+  /// argument area.
+  static unsigned getMinCallArgumentsSize(bool LP64, bool isMacho) {
+    // For Macho ABI:
+    // The prolog code of the callee may store up to 8 GPR argument registers to
+    // the stack, allowing va_start to index over them in memory if the callee
+    // is varargs.
+    // Because we cannot tell if this is needed on the caller side, we have to
+    // conservatively assume that it is needed.  As such, make sure we have at
+    // least enough stack space for the caller to store the 8 GPRs.
+    if (isMacho)
+      return 8 * (LP64 ? 8 : 4);
+    
+    // For ELF 32 ABI:
+    // There is no default stack allocated for the 8 first GPR arguments.
+    return 0;
+  }
+
+  /// getMinCallFrameSize - Return the minimum size a call frame can be using
+  /// the PowerPC ABI.
+  static unsigned getMinCallFrameSize(bool LP64, bool isMacho) {
+    // The call frame needs to be at least big enough for linkage and 8 args.
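+    // For example (illustrative), the 32-bit Macho frame needs at least
+    // 6*4 (linkage) + 8*4 (GPR argument home area) = 56 bytes.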
+    return getLinkageSize(LP64, isMacho) +
+           getMinCallArgumentsSize(LP64, isMacho);
+  }
+  
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
new file mode 100644
index 0000000..26e1f47
--- /dev/null
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -0,0 +1,303 @@
+//===-- PPCHazardRecognizers.cpp - PowerPC Hazard Recognizer Impls --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements hazard recognizers for scheduling on PowerPC processors.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "PPCHazardRecognizers.h"
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// PowerPC 970 Hazard Recognizer
+//
+// This models the dispatch group formation of the PPC970 processor.  Dispatch
+// groups are bundles of up to five instructions that can contain various mixes
+// of instructions.  The PPC970 can dispatch a peak of 4 non-branch and one 
+// branch instruction per-cycle.
+//
+// There are a number of restrictions to dispatch group formation: some
+// instructions can only be issued in the first slot of a dispatch group, and some
+// instructions fill an entire dispatch group.  Additionally, only branches can
+// issue in the 5th (last) slot.
+//
+// Finally, there are a number of "structural" hazards on the PPC970.  These
+// conditions cause large performance penalties due to misprediction, recovery,
+// and replay logic that has to happen.  These cases include setting a CTR and
+// branching through it in the same dispatch group, and storing to an address,
+// then loading from the same address within a dispatch group.  To avoid these
+// conditions, we insert no-op instructions when appropriate.
+//
+// FIXME: This is missing some significant cases:
+//   1. Modeling of microcoded instructions.
+//   2. Handling of serialized operations.
+//   3. Handling of the esoteric cases in "Resource-based Instruction Grouping".
+//
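+// As a rough example of a full dispatch group on this model:
+//   slot 0: FXU add   slot 1: LSU lwz   slot 2: FPU fadd
+//   slot 3: FXU ori   slot 4: BRU bc    (only a branch may occupy slot 4)
+//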
+
+PPCHazardRecognizer970::PPCHazardRecognizer970(const TargetInstrInfo &tii)
+  : TII(tii) {
+  EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::EndDispatchGroup() {
+  DOUT << "=== Start of dispatch group\n";
+  NumIssued = 0;
+  
+  // Structural hazard info.
+  HasCTRSet = false;
+  NumStores = 0;
+}
+
+
+PPCII::PPC970_Unit 
+PPCHazardRecognizer970::GetInstrType(unsigned Opcode,
+                                     bool &isFirst, bool &isSingle,
+                                     bool &isCracked,
+                                     bool &isLoad, bool &isStore) {
+  if (Opcode < ISD::BUILTIN_OP_END) {
+    isFirst = isSingle = isCracked = isLoad = isStore = false;
+    return PPCII::PPC970_Pseudo;
+  }
+  Opcode -= ISD::BUILTIN_OP_END;
+  
+  const TargetInstrDescriptor &TID = TII.get(Opcode);
+  
+  isLoad  = TID.Flags & M_LOAD_FLAG;
+  isStore = TID.Flags & M_STORE_FLAG;
+  
+  unsigned TSFlags = TID.TSFlags;
+  
+  isFirst   = TSFlags & PPCII::PPC970_First;
+  isSingle  = TSFlags & PPCII::PPC970_Single;
+  isCracked = TSFlags & PPCII::PPC970_Cracked;
+  return (PPCII::PPC970_Unit)(TSFlags & PPCII::PPC970_Mask);
+}
+
+/// isLoadOfStoredAddress - If we have a load from the previously stored pointer
+/// as indicated by StorePtr1/StorePtr2/StoreSize, return true.
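+/// For example, a 4-byte load from [8+r] overlaps an earlier 8-byte store to
+/// [4+r] in the same group, since 4+8 > 8 (illustrative offsets).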
+bool PPCHazardRecognizer970::
+isLoadOfStoredAddress(unsigned LoadSize, SDOperand Ptr1, SDOperand Ptr2) const {
+  for (unsigned i = 0, e = NumStores; i != e; ++i) {
+    // Handle exact and commuted addresses.
+    if (Ptr1 == StorePtr1[i] && Ptr2 == StorePtr2[i])
+      return true;
+    if (Ptr2 == StorePtr1[i] && Ptr1 == StorePtr2[i])
+      return true;
+    
+    // Okay, we don't have an exact match.  If this is an indexed offset, see if
+    // we have overlap (which happens during fp->int conversion, for example).
+    if (StorePtr2[i] == Ptr2) {
+      if (ConstantSDNode *StoreOffset = dyn_cast<ConstantSDNode>(StorePtr1[i]))
+        if (ConstantSDNode *LoadOffset = dyn_cast<ConstantSDNode>(Ptr1)) {
+          // Okay the base pointers match, so we have [c1+r] vs [c2+r].  Check
+          // to see if the load and store actually overlap.
+          int StoreOffs = StoreOffset->getValue();
+          int LoadOffs  = LoadOffset->getValue();
+          if (StoreOffs < LoadOffs) {
+            if (int(StoreOffs+StoreSize[i]) > LoadOffs) return true;
+          } else {
+            if (int(LoadOffs+LoadSize) > StoreOffs) return true;
+          }
+        }
+    }
+  }
+  return false;
+}
+
+/// getHazardType - We return Hazard for any non-branch instruction that would
+/// terminate the dispatch group.  We return NoopHazard for any instruction that
+/// would not terminate the dispatch group but would cause a pipeline flush.
+HazardRecognizer::HazardType PPCHazardRecognizer970::
+getHazardType(SDNode *Node) {
+  bool isFirst, isSingle, isCracked, isLoad, isStore;
+  PPCII::PPC970_Unit InstrType = 
+    GetInstrType(Node->getOpcode(), isFirst, isSingle, isCracked,
+                 isLoad, isStore);
+  if (InstrType == PPCII::PPC970_Pseudo) return NoHazard;  
+  unsigned Opcode = Node->getOpcode()-ISD::BUILTIN_OP_END;
+
+  // We can only issue a PPC970_First/PPC970_Single instruction (such as
+  // crand/mtspr/etc) if this is the first cycle of the dispatch group.
+  if (NumIssued != 0 && (isFirst || isSingle))
+    return Hazard;
+  
+  // If this instruction is cracked into two ops by the decoder, we know that
+  // it is not a branch and that it cannot issue if 3 other instructions are
+  // already in the dispatch group.
+  if (isCracked && NumIssued > 2)
+    return Hazard;
+      
+  switch (InstrType) {
+  default: assert(0 && "Unknown instruction type!");
+  case PPCII::PPC970_FXU:
+  case PPCII::PPC970_LSU:
+  case PPCII::PPC970_FPU:
+  case PPCII::PPC970_VALU:
+  case PPCII::PPC970_VPERM:
+    // We can only issue a branch as the last instruction in a group.
+    if (NumIssued == 4) return Hazard;
+    break;
+  case PPCII::PPC970_CRU:
+    // We can only issue a CR instruction in the first two slots.
+    if (NumIssued >= 2) return Hazard;
+    break;
+  case PPCII::PPC970_BRU:
+    break;
+  }
+  
+  // Do not allow MTCTR and BCTRL to be in the same dispatch group.
+  if (HasCTRSet && (Opcode == PPC::BCTRL_Macho || Opcode == PPC::BCTRL_ELF))
+    return NoopHazard;
+  
+  // If this is a load following a store, make sure it's not to the same or
+  // overlapping address.
+  if (isLoad && NumStores) {
+    unsigned LoadSize;
+    switch (Opcode) {
+    default: assert(0 && "Unknown load!");
+    case PPC::LBZ:   case PPC::LBZU:
+    case PPC::LBZX:
+    case PPC::LBZ8:  case PPC::LBZU8:
+    case PPC::LBZX8:
+    case PPC::LVEBX:
+      LoadSize = 1;
+      break;
+    case PPC::LHA:   case PPC::LHAU:
+    case PPC::LHAX:
+    case PPC::LHZ:   case PPC::LHZU:
+    case PPC::LHZX:
+    case PPC::LVEHX:
+    case PPC::LHBRX:
+    case PPC::LHA8:   case PPC::LHAU8:
+    case PPC::LHAX8:
+    case PPC::LHZ8:   case PPC::LHZU8:
+    case PPC::LHZX8:
+      LoadSize = 2;
+      break;
+    case PPC::LFS:    case PPC::LFSU:
+    case PPC::LFSX:
+    case PPC::LWZ:    case PPC::LWZU:
+    case PPC::LWZX:
+    case PPC::LWA:
+    case PPC::LWAX:
+    case PPC::LVEWX:
+    case PPC::LWBRX:
+    case PPC::LWZ8:
+    case PPC::LWZX8:
+      LoadSize = 4;
+      break;
+    case PPC::LFD:    case PPC::LFDU:
+    case PPC::LFDX:
+    case PPC::LD:     case PPC::LDU:
+    case PPC::LDX:
+      LoadSize = 8;
+      break;
+    case PPC::LVX:
+      LoadSize = 16;
+      break;
+    }
+    
+    if (isLoadOfStoredAddress(LoadSize, 
+                              Node->getOperand(0), Node->getOperand(1)))
+      return NoopHazard;
+  }
+  
+  return NoHazard;
+}
+
+void PPCHazardRecognizer970::EmitInstruction(SDNode *Node) {
+  bool isFirst, isSingle, isCracked, isLoad, isStore;
+  PPCII::PPC970_Unit InstrType = 
+    GetInstrType(Node->getOpcode(), isFirst, isSingle, isCracked,
+                 isLoad, isStore);
+  if (InstrType == PPCII::PPC970_Pseudo) return;  
+  unsigned Opcode = Node->getOpcode()-ISD::BUILTIN_OP_END;
+
+  // Update structural hazard information.
+  if (Opcode == PPC::MTCTR) HasCTRSet = true;
+  
+  // Track the address stored to.
+  if (isStore) {
+    unsigned ThisStoreSize;
+    switch (Opcode) {
+    default: assert(0 && "Unknown store instruction!");
+    case PPC::STB:    case PPC::STB8:
+    case PPC::STBU:   case PPC::STBU8:
+    case PPC::STBX:   case PPC::STBX8:
+    case PPC::STVEBX:
+      ThisStoreSize = 1;
+      break;
+    case PPC::STH:    case PPC::STH8:
+    case PPC::STHU:   case PPC::STHU8:
+    case PPC::STHX:   case PPC::STHX8:
+    case PPC::STVEHX:
+    case PPC::STHBRX:
+      ThisStoreSize = 2;
+      break;
+    case PPC::STFS:
+    case PPC::STFSU:
+    case PPC::STFSX:
+    case PPC::STWX:   case PPC::STWX8:
+    case PPC::STWUX:
+    case PPC::STW:    case PPC::STW8:
+    case PPC::STWU:   case PPC::STWU8:
+    case PPC::STVEWX:
+    case PPC::STFIWX:
+    case PPC::STWBRX:
+      ThisStoreSize = 4;
+      break;
+    case PPC::STD_32:
+    case PPC::STDX_32:
+    case PPC::STD:
+    case PPC::STDU:
+    case PPC::STFD:
+    case PPC::STFDX:
+    case PPC::STDX:
+    case PPC::STDUX:
+      ThisStoreSize = 8;
+      break;
+    case PPC::STVX:
+      ThisStoreSize = 16;
+      break;
+    }
+    
+    StoreSize[NumStores] = ThisStoreSize;
+    StorePtr1[NumStores] = Node->getOperand(1);
+    StorePtr2[NumStores] = Node->getOperand(2);
+    ++NumStores;
+  }
+  
+  if (InstrType == PPCII::PPC970_BRU || isSingle)
+    NumIssued = 4;  // Terminate a d-group.
+  ++NumIssued;
+  
+  // If this instruction is cracked into two ops by the decoder, remember that
+  // we issued two pieces.
+  if (isCracked)
+    ++NumIssued;
+  
+  if (NumIssued == 5)
+    EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::AdvanceCycle() {
+  assert(NumIssued < 5 && "Illegal dispatch group!");
+  ++NumIssued;
+  if (NumIssued == 5)
+    EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::EmitNoop() {
+  AdvanceCycle();
+}
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h
new file mode 100644
index 0000000..cbff943
--- /dev/null
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.h
@@ -0,0 +1,73 @@
+//===-- PPCHazardRecognizers.h - PowerPC Hazard Recognizers -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on PowerPC processors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCHAZRECS_H
+#define PPCHAZRECS_H
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "PPCInstrInfo.h"
+
+namespace llvm {
+  
+/// PPCHazardRecognizer970 - This class defines a finite state automaton that
+/// models the dispatch logic on the PowerPC 970 (aka G5) processor.  This
+/// promotes good dispatch group formation and implements noop insertion to
+/// avoid structural hazards that cause significant performance penalties (e.g.
+/// setting the CTR register then branching through it within a dispatch group),
+/// or storing then loading from the same address within a dispatch group.
+class PPCHazardRecognizer970 : public HazardRecognizer {
+  const TargetInstrInfo &TII;
+  
+  unsigned NumIssued;  // Number of insts issued, including advanced cycles.
+  
+  // Various things that can cause a structural hazard.
+  
+  // HasCTRSet - If the CTR register is set in this group, disallow BCTRL.
+  bool HasCTRSet;
+  
+  // StorePtr1/StorePtr2 - Keep track of the address of any store.  If we see a
+  // load from the same address (or one that aliases it), disallow the load from
+  // issuing in the same group.  We can have up to four stores in one dispatch
+  // group, hence we track up to 4.
+  //
+  // This is null if we haven't seen a store yet.  We keep track of both
+  // operands of the store here, since we support [r+r] and [r+i] addressing.
+  SDOperand StorePtr1[4], StorePtr2[4];
+  unsigned  StoreSize[4];
+  unsigned NumStores;
+  
+public:
+  PPCHazardRecognizer970(const TargetInstrInfo &TII);
+  virtual HazardType getHazardType(SDNode *Node);
+  virtual void EmitInstruction(SDNode *Node);
+  virtual void AdvanceCycle();
+  virtual void EmitNoop();
+  
+private:
+  /// EndDispatchGroup - Called when we are finishing a new dispatch group.
+  ///
+  void EndDispatchGroup();
+  
+  /// GetInstrType - Classify the specified powerpc opcode according to its
+  /// pipeline.
+  PPCII::PPC970_Unit GetInstrType(unsigned Opcode,
+                                  bool &isFirst, bool &isSingle,bool &isCracked,
+                                  bool &isLoad, bool &isStore);
+  
+  bool isLoadOfStoredAddress(unsigned LoadSize,
+                             SDOperand Ptr1, SDOperand Ptr2) const;
+};
+
+} // end namespace llvm
+
+#endif
+
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
new file mode 100644
index 0000000..730bac6
--- /dev/null
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -0,0 +1,1122 @@
+//===-- PPCISelDAGToDAG.cpp - PPC pattern matching inst selector ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for PowerPC,
+// converting from a legalized dag to a PPC dag.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppc-codegen"
+#include "PPC.h"
+#include "PPCPredicates.h"
+#include "PPCTargetMachine.h"
+#include "PPCISelLowering.h"
+#include "PPCHazardRecognizers.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Constants.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Compiler.h"
+#include <queue>
+#include <set>
+using namespace llvm;
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  /// PPCDAGToDAGISel - PPC specific code to select PPC machine
+  /// instructions for SelectionDAG operations.
+  ///
+  class VISIBILITY_HIDDEN PPCDAGToDAGISel : public SelectionDAGISel {
+    PPCTargetMachine &TM;
+    PPCTargetLowering PPCLowering;
+    unsigned GlobalBaseReg;
+  public:
+    PPCDAGToDAGISel(PPCTargetMachine &tm)
+      : SelectionDAGISel(PPCLowering), TM(tm),
+        PPCLowering(*TM.getTargetLowering()) {}
+    
+    virtual bool runOnFunction(Function &Fn) {
+      // Make sure we re-emit a set of the global base reg if necessary
+      GlobalBaseReg = 0;
+      SelectionDAGISel::runOnFunction(Fn);
+      
+      InsertVRSaveCode(Fn);
+      return true;
+    }
+   
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDOperand getI32Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+    }
+
+    /// getI64Imm - Return a target constant with the specified value, of type
+    /// i64.
+    inline SDOperand getI64Imm(uint64_t Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i64);
+    }
+    
+    /// getSmallIPtrImm - Return a target constant of pointer type.
+    inline SDOperand getSmallIPtrImm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, PPCLowering.getPointerTy());
+    }
+    
+    /// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s 
+    /// with any number of 0s on either side.  The 1s are allowed to wrap from
+    /// LSB to MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs.
+    /// 0x0F0F0000 is not, since all 1s are not contiguous.
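+    /// For example (illustrative), Val = 0x0000FFF0 is a run and yields MB = 16,
+    /// ME = 27 in the big-endian bit numbering used by rlwinm.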
+    static bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME);
+
+
+    /// isRotateAndMask - Returns true if Mask and Shift can be folded into a
+    /// rotate and mask opcode and mask operation.
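+    /// For example (illustrative), (srl x, 8) masked with 0x00FFFFFF folds to
+    /// SH = 24, MB = 8, ME = 31, i.e. "rlwinm x, 24, 8, 31".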
+    static bool isRotateAndMask(SDNode *N, unsigned Mask, bool IsShiftMask,
+                                unsigned &SH, unsigned &MB, unsigned &ME);
+    
+    /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC
+    /// base register.  Return the virtual register that holds this value.
+    SDNode *getGlobalBaseReg();
+    
+    // Select - Convert the specified operand from a target-independent to a
+    // target-specific node if it hasn't already been changed.
+    SDNode *Select(SDOperand Op);
+    
+    SDNode *SelectBitfieldInsert(SDNode *N);
+
+    /// SelectCC - Select a comparison of the specified values with the
+    /// specified condition code, returning the CR# of the expression.
+    SDOperand SelectCC(SDOperand LHS, SDOperand RHS, ISD::CondCode CC);
+
+    /// SelectAddrImm - Returns true if the address N can be represented by
+    /// a base register plus a signed 16-bit displacement [r+imm].
+    bool SelectAddrImm(SDOperand Op, SDOperand N, SDOperand &Disp,
+                       SDOperand &Base) {
+      return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG);
+    }
+    
+    /// SelectAddrImmOffs - Return true if the operand is valid for a preinc
+    /// immediate field.  Because preinc imms have already been validated, just
+    /// accept it.
+    bool SelectAddrImmOffs(SDOperand Op, SDOperand N, SDOperand &Out) const {
+      Out = N;
+      return true;
+    }
+      
+    /// SelectAddrIdx - Given the specified address, check to see if it can be
+    /// represented as an indexed [r+r] operation.  Returns false if it can
+    /// be represented by [r+imm], which is preferred.
+    bool SelectAddrIdx(SDOperand Op, SDOperand N, SDOperand &Base,
+                       SDOperand &Index) {
+      return PPCLowering.SelectAddressRegReg(N, Base, Index, *CurDAG);
+    }
+    
+    /// SelectAddrIdxOnly - Given the specified address, force it to be
+    /// represented as an indexed [r+r] operation.
+    bool SelectAddrIdxOnly(SDOperand Op, SDOperand N, SDOperand &Base,
+                           SDOperand &Index) {
+      return PPCLowering.SelectAddressRegRegOnly(N, Base, Index, *CurDAG);
+    }
+
+    /// SelectAddrImmShift - Returns true if the address N can be represented by
+    /// a base register plus a signed 14-bit displacement [r+imm*4].  Suitable
+    /// for use by STD and friends.
+    bool SelectAddrImmShift(SDOperand Op, SDOperand N, SDOperand &Disp,
+                            SDOperand &Base) {
+      return PPCLowering.SelectAddressRegImmShift(N, Disp, Base, *CurDAG);
+    }
+      
+    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+    /// inline asm expressions.
+    virtual bool SelectInlineAsmMemoryOperand(const SDOperand &Op,
+                                              char ConstraintCode,
+                                              std::vector<SDOperand> &OutOps,
+                                              SelectionDAG &DAG) {
+      SDOperand Op0, Op1;
+      switch (ConstraintCode) {
+      default: return true;
+      case 'm':   // memory
+        if (!SelectAddrIdx(Op, Op, Op0, Op1))
+          SelectAddrImm(Op, Op, Op0, Op1);
+        break;
+      case 'o':   // offsetable
+        if (!SelectAddrImm(Op, Op, Op0, Op1)) {
+          Op0 = Op;
+          AddToISelQueue(Op0);     // r+0.
+          Op1 = getSmallIPtrImm(0);
+        }
+        break;
+      case 'v':   // not offsetable
+        SelectAddrIdxOnly(Op, Op, Op0, Op1);
+        break;
+      }
+      
+      OutOps.push_back(Op0);
+      OutOps.push_back(Op1);
+      return false;
+    }
+    
+    SDOperand BuildSDIVSequence(SDNode *N);
+    SDOperand BuildUDIVSequence(SDNode *N);
+    
+    /// InstructionSelectBasicBlock - This callback is invoked by
+    /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+    virtual void InstructionSelectBasicBlock(SelectionDAG &DAG);
+    
+    void InsertVRSaveCode(Function &Fn);
+
+    virtual const char *getPassName() const {
+      return "PowerPC DAG->DAG Pattern Instruction Selection";
+    } 
+    
+    /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
+    /// this target when scheduling the DAG.
+    virtual HazardRecognizer *CreateTargetHazardRecognizer() {
+      // Should use subtarget info to pick the right hazard recognizer.  For
+      // now, always return a PPC970 recognizer.
+      const TargetInstrInfo *II = PPCLowering.getTargetMachine().getInstrInfo();
+      assert(II && "No InstrInfo?");
+      return new PPCHazardRecognizer970(*II); 
+    }
+
+// Include the pieces autogenerated from the target description.
+#include "PPCGenDAGISel.inc"
+    
+private:
+    SDNode *SelectSETCC(SDOperand Op);
+  };
+}
+
+/// InstructionSelectBasicBlock - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void PPCDAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
+  DEBUG(BB->dump());
+
+  // Select target instructions for the DAG.
+  DAG.setRoot(SelectRoot(DAG.getRoot()));
+  DAG.RemoveDeadNodes();
+  
+  // Emit machine code to BB.
+  ScheduleAndEmitDAG(DAG);
+}
+
+/// InsertVRSaveCode - Once the entire function has been instruction selected,
+/// all virtual registers are created and all machine instructions are built,
+/// check to see if we need to save/restore VRSAVE.  If so, do it.
+void PPCDAGToDAGISel::InsertVRSaveCode(Function &F) {
+  // Check to see if this function uses vector registers, which means we have to
+  // save and restore the VRSAVE register and update it with the regs we use.  
+  //
+  // In this case, there will be virtual registers of vector type created
+  // by the scheduler.  Detect them now.
+  MachineFunction &Fn = MachineFunction::get(&F);
+  SSARegMap *RegMap = Fn.getSSARegMap();
+  bool HasVectorVReg = false;
+  for (unsigned i = MRegisterInfo::FirstVirtualRegister, 
+       e = RegMap->getLastVirtReg()+1; i != e; ++i)
+    if (RegMap->getRegClass(i) == &PPC::VRRCRegClass) {
+      HasVectorVReg = true;
+      break;
+    }
+  if (!HasVectorVReg) return;  // nothing to do.
+      
+  // If we have a vector register, we want to emit code into the entry and exit
+  // blocks to save and restore the VRSAVE register.  We do this here (instead
+  // of marking all vector instructions as clobbering VRSAVE) for two reasons:
+  //
+  // 1. This (trivially) reduces the load on the register allocator, by not
+  //    having to represent the live range of the VRSAVE register.
+  // 2. This (more significantly) allows us to create a temporary virtual
+  //    register to hold the saved VRSAVE value, allowing this temporary to be
+  //    register allocated, instead of forcing it to be spilled to the stack.
+
+  // Create two vregs - one to hold the VRSAVE register that is live-in to the
+  // function and one for the value after having bits or'd into it.
+  unsigned InVRSAVE = RegMap->createVirtualRegister(&PPC::GPRCRegClass);
+  unsigned UpdatedVRSAVE = RegMap->createVirtualRegister(&PPC::GPRCRegClass);
+  
+  const TargetInstrInfo &TII = *TM.getInstrInfo();
+  MachineBasicBlock &EntryBB = *Fn.begin();
+  // Emit the following code into the entry block:
+  // InVRSAVE = MFVRSAVE
+  // UpdatedVRSAVE = UPDATE_VRSAVE InVRSAVE
+  // MTVRSAVE UpdatedVRSAVE
+  MachineBasicBlock::iterator IP = EntryBB.begin();  // Insert Point
+  BuildMI(EntryBB, IP, TII.get(PPC::MFVRSAVE), InVRSAVE);
+  BuildMI(EntryBB, IP, TII.get(PPC::UPDATE_VRSAVE), UpdatedVRSAVE).addReg(InVRSAVE);
+  BuildMI(EntryBB, IP, TII.get(PPC::MTVRSAVE)).addReg(UpdatedVRSAVE);
+  
+  // Find all return blocks, outputting a restore in each epilog.
+  for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
+    if (!BB->empty() && TII.isReturn(BB->back().getOpcode())) {
+      IP = BB->end(); --IP;
+      
+      // Skip over all terminator instructions, which are part of the return
+      // sequence.
+      MachineBasicBlock::iterator I2 = IP;
+      while (I2 != BB->begin() && TII.isTerminatorInstr((--I2)->getOpcode()))
+        IP = I2;
+      
+      // Emit: MTVRSAVE InVRSave
+      BuildMI(*BB, IP, TII.get(PPC::MTVRSAVE)).addReg(InVRSAVE);
+    }        
+  }
+}
+
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// base address to use for accessing globals into a register.
+///
+SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
+  if (!GlobalBaseReg) {
+    const TargetInstrInfo &TII = *TM.getInstrInfo();
+    // Insert the set of GlobalBaseReg into the first MBB of the function
+    MachineBasicBlock &FirstMBB = BB->getParent()->front();
+    MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+    SSARegMap *RegMap = BB->getParent()->getSSARegMap();
+
+    if (PPCLowering.getPointerTy() == MVT::i32) {
+      GlobalBaseReg = RegMap->createVirtualRegister(PPC::GPRCRegisterClass);
+      BuildMI(FirstMBB, MBBI, TII.get(PPC::MovePCtoLR), PPC::LR);
+      BuildMI(FirstMBB, MBBI, TII.get(PPC::MFLR), GlobalBaseReg);
+    } else {
+      GlobalBaseReg = RegMap->createVirtualRegister(PPC::G8RCRegisterClass);
+      BuildMI(FirstMBB, MBBI, TII.get(PPC::MovePCtoLR8), PPC::LR8);
+      BuildMI(FirstMBB, MBBI, TII.get(PPC::MFLR8), GlobalBaseReg);
+    }
+  }
+  return CurDAG->getRegister(GlobalBaseReg, PPCLowering.getPointerTy()).Val;
+}
+
+/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
+/// or 64-bit immediate, and if the value can be accurately represented as a
+/// sign extension from a 16-bit value.  If so, this returns true and the
+/// immediate.
+static bool isIntS16Immediate(SDNode *N, short &Imm) {
+  if (N->getOpcode() != ISD::Constant)
+    return false;
+
+  Imm = (short)cast<ConstantSDNode>(N)->getValue();
+  if (N->getValueType(0) == MVT::i32)
+    return Imm == (int32_t)cast<ConstantSDNode>(N)->getValue();
+  else
+    return Imm == (int64_t)cast<ConstantSDNode>(N)->getValue();
+}
+
+static bool isIntS16Immediate(SDOperand Op, short &Imm) {
+  return isIntS16Immediate(Op.Val, Imm);
+}
+
+
+/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
+/// operand. If so Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
+  if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
+    Imm = cast<ConstantSDNode>(N)->getValue();
+    return true;
+  }
+  return false;
+}
+
+/// isInt64Immediate - This method tests to see if the node is a 64-bit constant
+/// operand.  If so Imm will receive the 64-bit value.
+static bool isInt64Immediate(SDNode *N, uint64_t &Imm) {
+  if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i64) {
+    Imm = cast<ConstantSDNode>(N)->getValue();
+    return true;
+  }
+  return false;
+}
+
+// isInt32Immediate - This method tests to see if the operand is a 32-bit
+// constant.  If so, Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDOperand N, unsigned &Imm) {
+  return isInt32Immediate(N.Val, Imm);
+}
+
+
+// isOpcWithIntImmediate - This method tests to see if the node has the
+// specified opcode and an immediate integer right operand.
+// If so, Imm will receive the 32-bit value.
+static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
+  return N->getOpcode() == Opc && isInt32Immediate(N->getOperand(1).Val, Imm);
+}
+
+bool PPCDAGToDAGISel::isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
+  if (isShiftedMask_32(Val)) {
+    // look for the first non-zero bit
+    MB = CountLeadingZeros_32(Val);
+    // look for the first zero bit after the run of ones
+    ME = CountLeadingZeros_32((Val - 1) ^ Val);
+    return true;
+  } else {
+    Val = ~Val; // invert mask
+    if (isShiftedMask_32(Val)) {
+      // effectively look for the first zero bit
+      ME = CountLeadingZeros_32(Val) - 1;
+      // effectively look for the first one bit after the run of zeros
+      MB = CountLeadingZeros_32((Val - 1) ^ Val) + 1;
+      return true;
+    }
+  }
+  // no run present
+  return false;
+}
+
+bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask, 
+                                      bool IsShiftMask, unsigned &SH, 
+                                      unsigned &MB, unsigned &ME) {
+  // Don't even go down this path for i64, since different logic will be
+  // necessary for rldicl/rldicr/rldimi.
+  if (N->getValueType(0) != MVT::i32)
+    return false;
+
+  unsigned Shift  = 32;
+  unsigned Indeterminant = ~0;  // bit mask marking indeterminant results
+  unsigned Opcode = N->getOpcode();
+  if (N->getNumOperands() != 2 ||
+      !isInt32Immediate(N->getOperand(1).Val, Shift) || (Shift > 31))
+    return false;
+  
+  if (Opcode == ISD::SHL) {
+    // apply shift left to mask if it comes first
+    if (IsShiftMask) Mask = Mask << Shift;
+    // determine which bits are made indeterminant by shift
+    Indeterminant = ~(0xFFFFFFFFu << Shift);
+  } else if (Opcode == ISD::SRL) { 
+    // apply shift right to mask if it comes first
+    if (IsShiftMask) Mask = Mask >> Shift;
+    // determine which bits are made indeterminant by shift
+    Indeterminant = ~(0xFFFFFFFFu >> Shift);
+    // adjust for the left rotate
+    Shift = 32 - Shift;
+  } else if (Opcode == ISD::ROTL) {
+    Indeterminant = 0;
+  } else {
+    return false;
+  }
+  
+  // if the mask doesn't intersect any Indeterminant bits
+  if (Mask && !(Mask & Indeterminant)) {
+    SH = Shift & 31;
+    // make sure the mask is still a mask (wrap arounds may not be)
+    return isRunOfOnes(Mask, MB, ME);
+  }
+  return false;
+}
+
+/// SelectBitfieldInsert - turn an or of two masked values into
+/// the rotate left word immediate then mask insert (rlwimi) instruction.
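+/// For example (illustrative), (or (and x, 0xFFFF0000), (and y, 0x0000FFFF))
+/// becomes "rlwimi x, y, 0, 16, 31".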
+SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
+  SDOperand Op0 = N->getOperand(0);
+  SDOperand Op1 = N->getOperand(1);
+  
+  uint64_t LKZ, LKO, RKZ, RKO;
+  CurDAG->ComputeMaskedBits(Op0, 0xFFFFFFFFULL, LKZ, LKO);
+  CurDAG->ComputeMaskedBits(Op1, 0xFFFFFFFFULL, RKZ, RKO);
+  
+  unsigned TargetMask = LKZ;
+  unsigned InsertMask = RKZ;
+  
+  if ((TargetMask | InsertMask) == 0xFFFFFFFF) {
+    unsigned Op0Opc = Op0.getOpcode();
+    unsigned Op1Opc = Op1.getOpcode();
+    unsigned Value, SH = 0;
+    TargetMask = ~TargetMask;
+    InsertMask = ~InsertMask;
+
+    // If the LHS has a foldable shift and the RHS does not, then swap it to the
+    // RHS so that we can fold the shift into the insert.
+    if (Op0Opc == ISD::AND && Op1Opc == ISD::AND) {
+      if (Op0.getOperand(0).getOpcode() == ISD::SHL ||
+          Op0.getOperand(0).getOpcode() == ISD::SRL) {
+        if (Op1.getOperand(0).getOpcode() != ISD::SHL &&
+            Op1.getOperand(0).getOpcode() != ISD::SRL) {
+          std::swap(Op0, Op1);
+          std::swap(Op0Opc, Op1Opc);
+          std::swap(TargetMask, InsertMask);
+        }
+      }
+    } else if (Op0Opc == ISD::SHL || Op0Opc == ISD::SRL) {
+      if (Op1Opc == ISD::AND && Op1.getOperand(0).getOpcode() != ISD::SHL &&
+          Op1.getOperand(0).getOpcode() != ISD::SRL) {
+        std::swap(Op0, Op1);
+        std::swap(Op0Opc, Op1Opc);
+        std::swap(TargetMask, InsertMask);
+      }
+    }
+    
+    unsigned MB, ME;
+    if (InsertMask && isRunOfOnes(InsertMask, MB, ME)) {
+      SDOperand Tmp1, Tmp2, Tmp3;
+      bool DisjointMask = (TargetMask ^ InsertMask) == 0xFFFFFFFF;
+
+      if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) &&
+          isInt32Immediate(Op1.getOperand(1), Value)) {
+        Op1 = Op1.getOperand(0);
+        SH  = (Op1Opc == ISD::SHL) ? Value : 32 - Value;
+      }
+      if (Op1Opc == ISD::AND) {
+        unsigned SHOpc = Op1.getOperand(0).getOpcode();
+        if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) &&
+            isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) {
+          Op1 = Op1.getOperand(0).getOperand(0);
+          SH  = (SHOpc == ISD::SHL) ? Value : 32 - Value;
+        } else {
+          Op1 = Op1.getOperand(0);
+        }
+      }
+      
+      Tmp3 = (Op0Opc == ISD::AND && DisjointMask) ? Op0.getOperand(0) : Op0;
+      AddToISelQueue(Tmp3);
+      AddToISelQueue(Op1);
+      SH &= 31;
+      SDOperand Ops[] = { Tmp3, Op1, getI32Imm(SH), getI32Imm(MB),
+                          getI32Imm(ME) };
+      return CurDAG->getTargetNode(PPC::RLWIMI, MVT::i32, Ops, 5);
+    }
+  }
+  return 0;
+}
+
+/// SelectCC - Select a comparison of the specified values with the specified
+/// condition code, returning the CR# of the expression.
+SDOperand PPCDAGToDAGISel::SelectCC(SDOperand LHS, SDOperand RHS,
+                                    ISD::CondCode CC) {
+  // Always select the LHS.
+  AddToISelQueue(LHS);
+  unsigned Opc;
+  
+  if (LHS.getValueType() == MVT::i32) {
+    unsigned Imm;
+    if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+      if (isInt32Immediate(RHS, Imm)) {
+        // SETEQ/SETNE comparison with 16-bit immediate, fold it.
+        if (isUInt16(Imm))
+          return SDOperand(CurDAG->getTargetNode(PPC::CMPLWI, MVT::i32, LHS,
+                                                 getI32Imm(Imm & 0xFFFF)), 0);
+        // If this is a 16-bit signed immediate, fold it.
+        if (isInt16((int)Imm))
+          return SDOperand(CurDAG->getTargetNode(PPC::CMPWI, MVT::i32, LHS,
+                                                 getI32Imm(Imm & 0xFFFF)), 0);
+        
+        // For non-equality comparisons, the default code would materialize the
+        // constant, then compare against it, like this:
+        //   lis r2, 4660
+        //   ori r2, r2, 22136 
+        //   cmpw cr0, r3, r2
+        // Since we are just comparing for equality, we can emit this instead:
+        //   xoris r0,r3,0x1234
+        //   cmplwi cr0,r0,0x5678
+        //   beq cr0,L6
+        SDOperand Xor(CurDAG->getTargetNode(PPC::XORIS, MVT::i32, LHS,
+                                            getI32Imm(Imm >> 16)), 0);
+        return SDOperand(CurDAG->getTargetNode(PPC::CMPLWI, MVT::i32, Xor,
+                                               getI32Imm(Imm & 0xFFFF)), 0);
+      }
+      Opc = PPC::CMPLW;
+    } else if (ISD::isUnsignedIntSetCC(CC)) {
+      if (isInt32Immediate(RHS, Imm) && isUInt16(Imm))
+        return SDOperand(CurDAG->getTargetNode(PPC::CMPLWI, MVT::i32, LHS,
+                                               getI32Imm(Imm & 0xFFFF)), 0);
+      Opc = PPC::CMPLW;
+    } else {
+      short SImm;
+      if (isIntS16Immediate(RHS, SImm))
+        return SDOperand(CurDAG->getTargetNode(PPC::CMPWI, MVT::i32, LHS,
+                                               getI32Imm((int)SImm & 0xFFFF)),
+                         0);
+      Opc = PPC::CMPW;
+    }
+  } else if (LHS.getValueType() == MVT::i64) {
+    uint64_t Imm;
+    if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+      if (isInt64Immediate(RHS.Val, Imm)) {
+        // SETEQ/SETNE comparison with 16-bit immediate, fold it.
+        if (isUInt16(Imm))
+          return SDOperand(CurDAG->getTargetNode(PPC::CMPLDI, MVT::i64, LHS,
+                                                 getI32Imm(Imm & 0xFFFF)), 0);
+        // If this is a 16-bit signed immediate, fold it.
+        if (isInt16(Imm))
+          return SDOperand(CurDAG->getTargetNode(PPC::CMPDI, MVT::i64, LHS,
+                                                 getI32Imm(Imm & 0xFFFF)), 0);
+        
+        // For non-equality comparisons, the default code would materialize the
+        // constant, then compare against it, like this:
+        //   lis r2, 4660
+        //   ori r2, r2, 22136 
+        //   cmpd cr0, r3, r2
+        // Since we are just comparing for equality, we can emit this instead:
+        //   xoris r0,r3,0x1234
+        //   cmpldi cr0,r0,0x5678
+        //   beq cr0,L6
+        if (isUInt32(Imm)) {
+          SDOperand Xor(CurDAG->getTargetNode(PPC::XORIS8, MVT::i64, LHS,
+                                              getI64Imm(Imm >> 16)), 0);
+          return SDOperand(CurDAG->getTargetNode(PPC::CMPLDI, MVT::i64, Xor,
+                                                 getI64Imm(Imm & 0xFFFF)), 0);
+        }
+      }
+      Opc = PPC::CMPLD;
+    } else if (ISD::isUnsignedIntSetCC(CC)) {
+      if (isInt64Immediate(RHS.Val, Imm) && isUInt16(Imm))
+        return SDOperand(CurDAG->getTargetNode(PPC::CMPLDI, MVT::i64, LHS,
+                                               getI64Imm(Imm & 0xFFFF)), 0);
+      Opc = PPC::CMPLD;
+    } else {
+      short SImm;
+      if (isIntS16Immediate(RHS, SImm))
+        return SDOperand(CurDAG->getTargetNode(PPC::CMPDI, MVT::i64, LHS,
+                                               getI64Imm(SImm & 0xFFFF)),
+                         0);
+      Opc = PPC::CMPD;
+    }
+  } else if (LHS.getValueType() == MVT::f32) {
+    Opc = PPC::FCMPUS;
+  } else {
+    assert(LHS.getValueType() == MVT::f64 && "Unknown vt!");
+    Opc = PPC::FCMPUD;
+  }
+  AddToISelQueue(RHS);
+  return SDOperand(CurDAG->getTargetNode(Opc, MVT::i32, LHS, RHS), 0);
+}
+
+static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) {
+  switch (CC) {
+  default: assert(0 && "Unknown condition!"); abort();
+  case ISD::SETOEQ:    // FIXME: This is incorrect see PR642.
+  case ISD::SETUEQ:
+  case ISD::SETEQ:  return PPC::PRED_EQ;
+  case ISD::SETONE:    // FIXME: This is incorrect see PR642.
+  case ISD::SETUNE:
+  case ISD::SETNE:  return PPC::PRED_NE;
+  case ISD::SETOLT:    // FIXME: This is incorrect see PR642.
+  case ISD::SETULT:
+  case ISD::SETLT:  return PPC::PRED_LT;
+  case ISD::SETOLE:    // FIXME: This is incorrect see PR642.
+  case ISD::SETULE:
+  case ISD::SETLE:  return PPC::PRED_LE;
+  case ISD::SETOGT:    // FIXME: This is incorrect see PR642.
+  case ISD::SETUGT:
+  case ISD::SETGT:  return PPC::PRED_GT;
+  case ISD::SETOGE:    // FIXME: This is incorrect see PR642.
+  case ISD::SETUGE:
+  case ISD::SETGE:  return PPC::PRED_GE;
+    
+  case ISD::SETO:   return PPC::PRED_NU;
+  case ISD::SETUO:  return PPC::PRED_UN;
+  }
+}
+
+/// getCRIdxForSetCC - Return the index of the condition register field
+/// associated with the SetCC condition, and whether or not the field is
+/// treated as inverted.  That is, lt = 0; ge = 0 inverted.
+static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool& Inv) {
+  switch (CC) {
+  default: assert(0 && "Unknown condition!"); abort();
+  case ISD::SETOLT:  // FIXME: This is incorrect see PR642.
+  case ISD::SETULT:
+  case ISD::SETLT:  Inv = false;  return 0;
+  case ISD::SETOGE:  // FIXME: This is incorrect see PR642.
+  case ISD::SETUGE:
+  case ISD::SETGE:  Inv = true;   return 0;
+  case ISD::SETOGT:  // FIXME: This is incorrect see PR642.
+  case ISD::SETUGT:
+  case ISD::SETGT:  Inv = false;  return 1;
+  case ISD::SETOLE:  // FIXME: This is incorrect see PR642.
+  case ISD::SETULE:
+  case ISD::SETLE:  Inv = true;   return 1;
+  case ISD::SETOEQ:  // FIXME: This is incorrect see PR642.
+  case ISD::SETUEQ:
+  case ISD::SETEQ:  Inv = false;  return 2;
+  case ISD::SETONE:  // FIXME: This is incorrect see PR642.
+  case ISD::SETUNE:
+  case ISD::SETNE:  Inv = true;   return 2;
+  case ISD::SETO:   Inv = true;   return 3;
+  case ISD::SETUO:  Inv = false;  return 3;
+  }
+  return 0;
+}
+
+SDNode *PPCDAGToDAGISel::SelectSETCC(SDOperand Op) {
+  SDNode *N = Op.Val;
+  unsigned Imm;
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  if (isInt32Immediate(N->getOperand(1), Imm)) {
+    // We can codegen setcc op, imm very efficiently compared to a brcond.
+    // Check for those cases here.
+    // setcc op, 0
+    if (Imm == 0) {
+      SDOperand Op = N->getOperand(0);
+      AddToISelQueue(Op);
+      switch (CC) {
+      default: break;
+      case ISD::SETEQ: {
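+        // cntlzw yields 32 only when the operand is zero, so (cntlzw x) >> 5
+        // (done here with rlwinm 27, 5, 31) is 1 iff x == 0.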
+        Op = SDOperand(CurDAG->getTargetNode(PPC::CNTLZW, MVT::i32, Op), 0);
+        SDOperand Ops[] = { Op, getI32Imm(27), getI32Imm(5), getI32Imm(31) };
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+      }
+      case ISD::SETNE: {
+        SDOperand AD =
+          SDOperand(CurDAG->getTargetNode(PPC::ADDIC, MVT::i32, MVT::Flag,
+                                          Op, getI32Imm(~0U)), 0);
+        return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op, 
+                                    AD.getValue(1));
+      }
+      case ISD::SETLT: {
+        SDOperand Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+      }
+      case ISD::SETGT: {
+        SDOperand T =
+          SDOperand(CurDAG->getTargetNode(PPC::NEG, MVT::i32, Op), 0);
+        T = SDOperand(CurDAG->getTargetNode(PPC::ANDC, MVT::i32, T, Op), 0);
+        SDOperand Ops[] = { T, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+      }
+      }
+    } else if (Imm == ~0U) {        // setcc op, -1
+      SDOperand Op = N->getOperand(0);
+      AddToISelQueue(Op);
+      switch (CC) {
+      default: break;
+      case ISD::SETEQ:
+        Op = SDOperand(CurDAG->getTargetNode(PPC::ADDIC, MVT::i32, MVT::Flag,
+                                             Op, getI32Imm(1)), 0);
+        return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, 
+                              SDOperand(CurDAG->getTargetNode(PPC::LI, MVT::i32,
+                                                              getI32Imm(0)), 0),
+                                    Op.getValue(1));
+      case ISD::SETNE: {
+        Op = SDOperand(CurDAG->getTargetNode(PPC::NOR, MVT::i32, Op, Op), 0);
+        SDNode *AD = CurDAG->getTargetNode(PPC::ADDIC, MVT::i32, MVT::Flag,
+                                           Op, getI32Imm(~0U));
+        return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDOperand(AD, 0),
+                                    Op, SDOperand(AD, 1));
+      }
+      case ISD::SETLT: {
+        SDOperand AD = SDOperand(CurDAG->getTargetNode(PPC::ADDI, MVT::i32, Op,
+                                                       getI32Imm(1)), 0);
+        SDOperand AN = SDOperand(CurDAG->getTargetNode(PPC::AND, MVT::i32, AD,
+                                                       Op), 0);
+        SDOperand Ops[] = { AN, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+      }
+      case ISD::SETGT: {
+        SDOperand Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+        Op = SDOperand(CurDAG->getTargetNode(PPC::RLWINM, MVT::i32, Ops, 4), 0);
+        return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op, 
+                                    getI32Imm(1));
+      }
+      }
+    }
+  }
+  
+  bool Inv;
+  unsigned Idx = getCRIdxForSetCC(CC, Inv);
+  SDOperand CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC);
+  SDOperand IntCR;
+  
+  // Force the ccreg into CR7.
+  SDOperand CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32);
+  
+  SDOperand InFlag(0, 0);  // Null incoming flag value.
+  CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), CR7Reg, CCReg, 
+                               InFlag).getValue(1);
+  
+  if (TLI.getTargetMachine().getSubtarget<PPCSubtarget>().isGigaProcessor())
+    IntCR = SDOperand(CurDAG->getTargetNode(PPC::MFOCRF, MVT::i32, CR7Reg,
+                                            CCReg), 0);
+  else
+    IntCR = SDOperand(CurDAG->getTargetNode(PPC::MFCR, MVT::i32, CCReg), 0);
+  
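+  // CR7 lands in the low four bits of the MFCR/MFOCRF result; rotate the bit
+  // selected by Idx into bit 31 and mask everything else off.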
+  SDOperand Ops[] = { IntCR, getI32Imm((32-(3-Idx)) & 31),
+                      getI32Imm(31), getI32Imm(31) };
+  if (!Inv) {
+    return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+  } else {
+    SDOperand Tmp =
+      SDOperand(CurDAG->getTargetNode(PPC::RLWINM, MVT::i32, Ops, 4), 0);
+    return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1));
+  }
+}
+
+
+// Select - Convert the specified operand from a target-independent to a
+// target-specific node if it hasn't already been changed.
+SDNode *PPCDAGToDAGISel::Select(SDOperand Op) {
+  SDNode *N = Op.Val;
+  if (N->getOpcode() >= ISD::BUILTIN_OP_END &&
+      N->getOpcode() < PPCISD::FIRST_NUMBER)
+    return NULL;   // Already selected.
+
+  switch (N->getOpcode()) {
+  default: break;
+  
+  case ISD::Constant: {
+    if (N->getValueType(0) == MVT::i64) {
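+      // Materialize an arbitrary 64-bit constant in up to five instructions.
+      // Illustrative example: 0x123456789ABCDEF0 becomes
+      //   lis 0x1234; ori 0x5678; rldicr ,32,31; oris 0x9ABC; ori 0xDEF0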
+      // Get 64 bit value.
+      int64_t Imm = cast<ConstantSDNode>(N)->getValue();
+      // Assume no remaining bits.
+      unsigned Remainder = 0;
+      // Assume no shift required.
+      unsigned Shift = 0;
+      
+      // If it can't be represented as a 32 bit value.
+      if (!isInt32(Imm)) {
+        Shift = CountTrailingZeros_64(Imm);
+        int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+        
+        // If the shifted value fits 32 bits.
+        if (isInt32(ImmSh)) {
+          // Go with the shifted value.
+          Imm = ImmSh;
+        } else {
+          // Still stuck with a 64 bit value.
+          Remainder = Imm;
+          Shift = 32;
+          Imm >>= 32;
+        }
+      }
+      
+      // Intermediate operand.
+      SDNode *Result;
+
+      // Handle first 32 bits.
+      unsigned Lo = Imm & 0xFFFF;
+      unsigned Hi = (Imm >> 16) & 0xFFFF;
+      
+      // Simple value.
+      if (isInt16(Imm)) {
+       // Just the Lo bits.
+        Result = CurDAG->getTargetNode(PPC::LI8, MVT::i64, getI32Imm(Lo));
+      } else if (Lo) {
+        // Handle the Hi bits.
+        unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8;
+        Result = CurDAG->getTargetNode(OpC, MVT::i64, getI32Imm(Hi));
+        // And Lo bits.
+        Result = CurDAG->getTargetNode(PPC::ORI8, MVT::i64,
+                                       SDOperand(Result, 0), getI32Imm(Lo));
+      } else {
+       // Just the Hi bits.
+        Result = CurDAG->getTargetNode(PPC::LIS8, MVT::i64, getI32Imm(Hi));
+      }
+      
+      // If no shift, we're done.
+      if (!Shift) return Result;
+
+      // Shift for next step if the upper 32-bits were not zero.
+      if (Imm) {
+        Result = CurDAG->getTargetNode(PPC::RLDICR, MVT::i64,
+                                       SDOperand(Result, 0),
+                                       getI32Imm(Shift), getI32Imm(63 - Shift));
+      }
+
+      // Add in the last bits as required.
+      if ((Hi = (Remainder >> 16) & 0xFFFF)) {
+        Result = CurDAG->getTargetNode(PPC::ORIS8, MVT::i64,
+                                       SDOperand(Result, 0), getI32Imm(Hi));
+      } 
+      if ((Lo = Remainder & 0xFFFF)) {
+        Result = CurDAG->getTargetNode(PPC::ORI8, MVT::i64,
+                                       SDOperand(Result, 0), getI32Imm(Lo));
+      }
+      
+      return Result;
+    }
+    break;
+  }
+  
+  case ISD::SETCC:
+    return SelectSETCC(Op);
+  case PPCISD::GlobalBaseReg:
+    return getGlobalBaseReg();
+    
+  case ISD::FrameIndex: {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    SDOperand TFI = CurDAG->getTargetFrameIndex(FI, Op.getValueType());
+    unsigned Opc = Op.getValueType() == MVT::i32 ? PPC::ADDI : PPC::ADDI8;
+    if (N->hasOneUse())
+      return CurDAG->SelectNodeTo(N, Opc, Op.getValueType(), TFI,
+                                  getSmallIPtrImm(0));
+    return CurDAG->getTargetNode(Opc, Op.getValueType(), TFI,
+                                 getSmallIPtrImm(0));
+  }
+
+  case PPCISD::MFCR: {
+    SDOperand InFlag = N->getOperand(1);
+    AddToISelQueue(InFlag);
+    // Use MFOCRF if supported.
+    if (TLI.getTargetMachine().getSubtarget<PPCSubtarget>().isGigaProcessor())
+      return CurDAG->getTargetNode(PPC::MFOCRF, MVT::i32,
+                                   N->getOperand(0), InFlag);
+    else
+      return CurDAG->getTargetNode(PPC::MFCR, MVT::i32, InFlag);
+  }
+    
+  case ISD::SDIV: {
+    // FIXME: since this depends on the setting of the carry flag from the srawi
+    //        we should really be making notes about that for the scheduler.
+    // FIXME: It sure would be nice if we could cheaply recognize the 
+    //        srl/add/sra pattern the dag combiner will generate for this as
+    //        sra/addze rather than having to handle sdiv ourselves.  oh well.
+    unsigned Imm;
+    if (isInt32Immediate(N->getOperand(1), Imm)) {
+      SDOperand N0 = N->getOperand(0);
+      AddToISelQueue(N0);
+      if ((signed)Imm > 0 && isPowerOf2_32(Imm)) {
+        SDNode *Op =
+          CurDAG->getTargetNode(PPC::SRAWI, MVT::i32, MVT::Flag,
+                                N0, getI32Imm(Log2_32(Imm)));
+        return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, 
+                                    SDOperand(Op, 0), SDOperand(Op, 1));
+      } else if ((signed)Imm < 0 && isPowerOf2_32(-Imm)) {
+        SDNode *Op =
+          CurDAG->getTargetNode(PPC::SRAWI, MVT::i32, MVT::Flag,
+                                N0, getI32Imm(Log2_32(-Imm)));
+        SDOperand PT =
+          SDOperand(CurDAG->getTargetNode(PPC::ADDZE, MVT::i32,
+                                          SDOperand(Op, 0), SDOperand(Op, 1)),
+                    0);
+        return CurDAG->SelectNodeTo(N, PPC::NEG, MVT::i32, PT);
+      }
+    }
+    
+    // Other cases are autogenerated.
+    break;
+  }
+    
+  case ISD::LOAD: {
+    // Handle preincrement loads.
+    LoadSDNode *LD = cast<LoadSDNode>(Op);
+    MVT::ValueType LoadedVT = LD->getLoadedVT();
+    
+    // Normal loads are handled by code generated from the .td file.
+    if (LD->getAddressingMode() != ISD::PRE_INC)
+      break;
+    
+    SDOperand Offset = LD->getOffset();
+    if (isa<ConstantSDNode>(Offset) ||
+        Offset.getOpcode() == ISD::TargetGlobalAddress) {
+      
+      unsigned Opcode;
+      bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
+      if (LD->getValueType(0) != MVT::i64) {
+        // Handle PPC32 integer and normal FP loads.
+        assert((!isSExt || LoadedVT == MVT::i16) &&
+               "Invalid sext update load");
+        switch (LoadedVT) {
+          default: assert(0 && "Invalid PPC load type!");
+          case MVT::f64: Opcode = PPC::LFDU; break;
+          case MVT::f32: Opcode = PPC::LFSU; break;
+          case MVT::i32: Opcode = PPC::LWZU; break;
+          case MVT::i16: Opcode = isSExt ? PPC::LHAU : PPC::LHZU; break;
+          case MVT::i1:
+          case MVT::i8:  Opcode = PPC::LBZU; break;
+        }
+      } else {
+        assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
+        assert((!isSExt || LoadedVT == MVT::i16) &&
+               "Invalid sext update load");
+        switch (LoadedVT) {
+          default: assert(0 && "Invalid PPC load type!");
+          case MVT::i64: Opcode = PPC::LDU; break;
+          case MVT::i32: Opcode = PPC::LWZU8; break;
+          case MVT::i16: Opcode = isSExt ? PPC::LHAU8 : PPC::LHZU8; break;
+          case MVT::i1:
+          case MVT::i8:  Opcode = PPC::LBZU8; break;
+        }
+      }
+      
+      SDOperand Chain = LD->getChain();
+      SDOperand Base = LD->getBasePtr();
+      AddToISelQueue(Chain);
+      AddToISelQueue(Base);
+      AddToISelQueue(Offset);
+      SDOperand Ops[] = { Offset, Base, Chain };
+      // FIXME: PPC64
+      return CurDAG->getTargetNode(Opcode, MVT::i32, MVT::i32,
+                                   MVT::Other, Ops, 3);
+    } else {
+      assert(0 && "R+R preindex loads not supported yet!");
+    }
+    // Don't fall through into the ISD::AND case below.
+    break;
+  }
+    
+  case ISD::AND: {
+    unsigned Imm, Imm2, SH, MB, ME;
+
+    // If this is an and of a value rotated between 0 and 31 bits and then and'd
+    // with a mask, emit rlwinm
+    if (isInt32Immediate(N->getOperand(1), Imm) &&
+        isRotateAndMask(N->getOperand(0).Val, Imm, false, SH, MB, ME)) {
+      SDOperand Val = N->getOperand(0).getOperand(0);
+      AddToISelQueue(Val);
+      SDOperand Ops[] = { Val, getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+    }
+    // If this is just a masked value where the input is not handled above, and
+    // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm
+    if (isInt32Immediate(N->getOperand(1), Imm) &&
+        isRunOfOnes(Imm, MB, ME) && 
+        N->getOperand(0).getOpcode() != ISD::ROTL) {
+      SDOperand Val = N->getOperand(0);
+      AddToISelQueue(Val);
+      SDOperand Ops[] = { Val, getI32Imm(0), getI32Imm(MB), getI32Imm(ME) };
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+    }
+    // AND X, 0 -> 0, not "rlwinm 32".
+    if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
+      AddToISelQueue(N->getOperand(1));
+      ReplaceUses(SDOperand(N, 0), N->getOperand(1));
+      return NULL;
+    }
+    // ISD::OR doesn't get all the bitfield insertion fun.
+    // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) is a bitfield insert
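+    // e.g. c1 = 0x0000AA00, c2 = 0xFFFFAAFF: ~(c1^c2) = 0x0000FF00 is a run
+    // of ones, so this inserts the constant byte 0xAA into bits 8-15 of x.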
+    if (isInt32Immediate(N->getOperand(1), Imm) && 
+        N->getOperand(0).getOpcode() == ISD::OR &&
+        isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) {
+      unsigned MB, ME;
+      Imm = ~(Imm^Imm2);
+      if (isRunOfOnes(Imm, MB, ME)) {
+        AddToISelQueue(N->getOperand(0).getOperand(0));
+        AddToISelQueue(N->getOperand(0).getOperand(1));
+        SDOperand Ops[] = { N->getOperand(0).getOperand(0),
+                            N->getOperand(0).getOperand(1),
+                            getI32Imm(0), getI32Imm(MB),getI32Imm(ME) };
+        return CurDAG->getTargetNode(PPC::RLWIMI, MVT::i32, Ops, 5);
+      }
+    }
+    
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::OR:
+    if (N->getValueType(0) == MVT::i32)
+      if (SDNode *I = SelectBitfieldInsert(N))
+        return I;
+      
+    // Other cases are autogenerated.
+    break;
+  case ISD::SHL: {
+    unsigned Imm, SH, MB, ME;
+    if (isOpcWithIntImmediate(N->getOperand(0).Val, ISD::AND, Imm) &&
+        isRotateAndMask(N, Imm, true, SH, MB, ME)) {
+      AddToISelQueue(N->getOperand(0).getOperand(0));
+      SDOperand Ops[] = { N->getOperand(0).getOperand(0),
+                          getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+    }
+    
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::SRL: {
+    unsigned Imm, SH, MB, ME;
+    if (isOpcWithIntImmediate(N->getOperand(0).Val, ISD::AND, Imm) &&
+        isRotateAndMask(N, Imm, true, SH, MB, ME)) { 
+      AddToISelQueue(N->getOperand(0).getOperand(0));
+      SDOperand Ops[] = { N->getOperand(0).getOperand(0),
+                          getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+    }
+    
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::SELECT_CC: {
+    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+    
+    // Handle the setcc cases here.  select_cc lhs, 0, 1, 0, cc
+    if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+      if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+        if (ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N->getOperand(3)))
+          if (N1C->isNullValue() && N3C->isNullValue() &&
+              N2C->getValue() == 1ULL && CC == ISD::SETNE &&
+              // FIXME: Implement this optzn for PPC64.
+              N->getValueType(0) == MVT::i32) {
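+            // "addic tmp, x, -1" sets CA iff x != 0, and
+            // "subfe dst, tmp, x" computes x - tmp - 1 + CA = CA, yielding
+            // the desired 0/1 result without a condition-register read.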
+            AddToISelQueue(N->getOperand(0));
+            SDNode *Tmp =
+              CurDAG->getTargetNode(PPC::ADDIC, MVT::i32, MVT::Flag,
+                                    N->getOperand(0), getI32Imm(~0U));
+            return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32,
+                                        SDOperand(Tmp, 0), N->getOperand(0),
+                                        SDOperand(Tmp, 1));
+          }
+
+    SDOperand CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC);
+    unsigned BROpc = getPredicateForSetCC(CC);
+
+    unsigned SelectCCOp;
+    if (N->getValueType(0) == MVT::i32)
+      SelectCCOp = PPC::SELECT_CC_I4;
+    else if (N->getValueType(0) == MVT::i64)
+      SelectCCOp = PPC::SELECT_CC_I8;
+    else if (N->getValueType(0) == MVT::f32)
+      SelectCCOp = PPC::SELECT_CC_F4;
+    else if (N->getValueType(0) == MVT::f64)
+      SelectCCOp = PPC::SELECT_CC_F8;
+    else
+      SelectCCOp = PPC::SELECT_CC_VRRC;
+
+    AddToISelQueue(N->getOperand(2));
+    AddToISelQueue(N->getOperand(3));
+    SDOperand Ops[] = { CCReg, N->getOperand(2), N->getOperand(3),
+                        getI32Imm(BROpc) };
+    return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops, 4);
+  }
+  case PPCISD::COND_BRANCH: {
+    AddToISelQueue(N->getOperand(0));  // Op #0 is the Chain.
+    // Op #1 is the PPC::PRED_* number.
+    // Op #2 is the CR#
+    // Op #3 is the Dest MBB
+    AddToISelQueue(N->getOperand(4));  // Op #4 is the Flag.
+    // Prevent PPC::PRED_* from being selected into LI.
+    SDOperand Pred =
+      getI32Imm(cast<ConstantSDNode>(N->getOperand(1))->getValue());
+    SDOperand Ops[] = { Pred, N->getOperand(2), N->getOperand(3),
+      N->getOperand(0), N->getOperand(4) };
+    return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 5);
+  }
+  case ISD::BR_CC: {
+    AddToISelQueue(N->getOperand(0));
+    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+    SDOperand CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC);
+    SDOperand Ops[] = { getI32Imm(getPredicateForSetCC(CC)), CondCode, 
+                        N->getOperand(4), N->getOperand(0) };
+    return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 4);
+  }
+  case ISD::BRIND: {
+    // FIXME: Should custom lower this.
+    SDOperand Chain = N->getOperand(0);
+    SDOperand Target = N->getOperand(1);
+    AddToISelQueue(Chain);
+    AddToISelQueue(Target);
+    unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8;
+    Chain = SDOperand(CurDAG->getTargetNode(Opc, MVT::Other, Target,
+                                            Chain), 0);
+    return CurDAG->SelectNodeTo(N, PPC::BCTR, MVT::Other, Chain);
+  }
+  }
+  
+  return SelectCode(Op);
+}
+
+
+
+/// createPPCISelDag - This pass converts a legalized DAG into a 
+/// PowerPC-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) {
+  return new PPCDAGToDAGISel(TM);
+}
+
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
new file mode 100644
index 0000000..6c2f383
--- /dev/null
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -0,0 +1,3451 @@
+//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPCISelLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCISelLowering.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCPredicates.h"
+#include "PPCTargetMachine.h"
+#include "PPCPerfectShuffle.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+static cl::opt<bool> EnablePPCPreinc("enable-ppc-preinc",
+  cl::desc("enable preincrement load/store generation on PPC (experimental)"),
+  cl::Hidden);
+
+PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
+  : TargetLowering(TM), PPCSubTarget(*TM.getSubtargetImpl()) {
+    
+  setPow2DivIsCheap();
+  
+  // Use _setjmp/_longjmp instead of setjmp/longjmp.
+  setUseUnderscoreSetJmp(true);
+  setUseUnderscoreLongJmp(true);
+    
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, PPC::GPRCRegisterClass);
+  addRegisterClass(MVT::f32, PPC::F4RCRegisterClass);
+  addRegisterClass(MVT::f64, PPC::F8RCRegisterClass);
+  
+  // PowerPC has an i16 but no i8 (or i1) SEXTLOAD
+  setLoadXAction(ISD::SEXTLOAD, MVT::i1, Expand);
+  setLoadXAction(ISD::SEXTLOAD, MVT::i8, Expand);
+  
+  // PowerPC does not have truncstore for i1.
+  setStoreXAction(MVT::i1, Promote);
+
+  // PowerPC has pre-inc loads and stores.
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
+
+  setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+  setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+
+  // PowerPC has no intrinsics for these particular operations
+  setOperationAction(ISD::MEMMOVE, MVT::Other, Expand);
+  setOperationAction(ISD::MEMSET, MVT::Other, Expand);
+  setOperationAction(ISD::MEMCPY, MVT::Other, Expand);
+  
+  // PowerPC has no SREM/UREM instructions
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+  
+  // We don't support sin/cos/sqrt/fmod
+  setOperationAction(ISD::FSIN , MVT::f64, Expand);
+  setOperationAction(ISD::FCOS , MVT::f64, Expand);
+  setOperationAction(ISD::FREM , MVT::f64, Expand);
+  setOperationAction(ISD::FSIN , MVT::f32, Expand);
+  setOperationAction(ISD::FCOS , MVT::f32, Expand);
+  setOperationAction(ISD::FREM , MVT::f32, Expand);
+  
+  // Expand FSQRT unless the subtarget has a hardware square root instruction.
+  if (!TM.getSubtarget<PPCSubtarget>().hasFSQRT()) {
+    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+    setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+  }
+  
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  
+  // PowerPC does not have BSWAP, CTPOP or CTTZ
+  setOperationAction(ISD::BSWAP, MVT::i32  , Expand);
+  setOperationAction(ISD::CTPOP, MVT::i32  , Expand);
+  setOperationAction(ISD::CTTZ , MVT::i32  , Expand);
+  setOperationAction(ISD::BSWAP, MVT::i64  , Expand);
+  setOperationAction(ISD::CTPOP, MVT::i64  , Expand);
+  setOperationAction(ISD::CTTZ , MVT::i64  , Expand);
+  
+  // PowerPC does not have ROTR
+  setOperationAction(ISD::ROTR, MVT::i32   , Expand);
+  
+  // PowerPC does not have Select
+  setOperationAction(ISD::SELECT, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::i64, Expand);
+  setOperationAction(ISD::SELECT, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f64, Expand);
+  
+  // PowerPC wants to turn select_cc of FP into fsel when possible.
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+  // PowerPC wants to optimize integer setcc a bit
+  setOperationAction(ISD::SETCC, MVT::i32, Custom);
+  
+  // PowerPC does not have BRCOND which requires SetCC
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+
+  setOperationAction(ISD::BR_JT,  MVT::Other, Expand);
+  
+  // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
+  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+
+  // PowerPC does not have [U|S]INT_TO_FP
+  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+
+  setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
+  setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
+  setOperationAction(ISD::BIT_CONVERT, MVT::i64, Expand);
+  setOperationAction(ISD::BIT_CONVERT, MVT::f64, Expand);
+
+  // We cannot sextinreg(i1).  Expand to shifts.
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  // Support label based line numbers.
+  setOperationAction(ISD::LOCATION, MVT::Other, Expand);
+  setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+  if (!TM.getSubtarget<PPCSubtarget>().isDarwin()) {
+    setOperationAction(ISD::LABEL, MVT::Other, Expand);
+  } else {
+    setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+    setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
+    setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+    setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
+  }
+  
+  // We want to legalize GlobalAddress and ConstantPool nodes into the 
+  // appropriate instructions to materialize the address.
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+  setOperationAction(ISD::ConstantPool,  MVT::i32, Custom);
+  setOperationAction(ISD::JumpTable,     MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+  setOperationAction(ISD::ConstantPool,  MVT::i64, Custom);
+  setOperationAction(ISD::JumpTable,     MVT::i64, Custom);
+  
+  // RET must be custom lowered, to meet ABI requirements
+  setOperationAction(ISD::RET               , MVT::Other, Custom);
+  
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+  
+  // VAARG is custom lowered with ELF 32 ABI
+  if (TM.getSubtarget<PPCSubtarget>().isELF32_ABI())
+    setOperationAction(ISD::VAARG, MVT::Other, Custom);
+  else
+    setOperationAction(ISD::VAARG, MVT::Other, Expand);
+  
+  // Use the default implementation.
+  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand); 
+  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Custom);
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  
+  if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
+    // They also have instructions for converting between i64 and fp.
+    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+ 
+    // FIXME: disable this lowered code.  This generates 64-bit register values,
+    // and we don't model the fact that the top part is clobbered by calls.  We
+    // need to flag these together so that the value isn't live across a call.
+    //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+    
+    // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
+  } else {
+    // PowerPC does not have FP_TO_UINT on 32-bit implementations.
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+  }
+
+  if (TM.getSubtarget<PPCSubtarget>().use64BitRegs()) {
+    // 64 bit PowerPC implementations can support i64 types directly
+    addRegisterClass(MVT::i64, PPC::G8RCRegisterClass);
+    // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
+    setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
+  } else {
+    // 32 bit PowerPC wants to expand i64 shifts itself.
+    setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+    setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+    setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+  }
+
+  if (TM.getSubtarget<PPCSubtarget>().hasAltivec()) {
+    // First set operation action for all vector types to expand. Then we
+    // will selectively turn on ones that can be effectively codegen'd.
+    for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+         VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+      // add/sub are legal for all supported vector VT's.
+      setOperationAction(ISD::ADD , (MVT::ValueType)VT, Legal);
+      setOperationAction(ISD::SUB , (MVT::ValueType)VT, Legal);
+      
+      // We promote all shuffles to v16i8.
+      setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::VECTOR_SHUFFLE, (MVT::ValueType)VT, MVT::v16i8);
+
+      // We promote all non-typed operations to v4i32.
+      setOperationAction(ISD::AND   , (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::AND   , (MVT::ValueType)VT, MVT::v4i32);
+      setOperationAction(ISD::OR    , (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::OR    , (MVT::ValueType)VT, MVT::v4i32);
+      setOperationAction(ISD::XOR   , (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::XOR   , (MVT::ValueType)VT, MVT::v4i32);
+      setOperationAction(ISD::LOAD  , (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::LOAD  , (MVT::ValueType)VT, MVT::v4i32);
+      setOperationAction(ISD::SELECT, (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::SELECT, (MVT::ValueType)VT, MVT::v4i32);
+      setOperationAction(ISD::STORE, (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::STORE, (MVT::ValueType)VT, MVT::v4i32);
+      
+      // No other operations are legal.
+      setOperationAction(ISD::MUL , (MVT::ValueType)VT, Expand);
+      setOperationAction(ISD::SDIV, (MVT::ValueType)VT, Expand);
+      setOperationAction(ISD::SREM, (MVT::ValueType)VT, Expand);
+      setOperationAction(ISD::UDIV, (MVT::ValueType)VT, Expand);
+      setOperationAction(ISD::UREM, (MVT::ValueType)VT, Expand);
+      setOperationAction(ISD::FDIV, (MVT::ValueType)VT, Expand);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::ValueType)VT, Expand);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::ValueType)VT, Expand);
+      setOperationAction(ISD::BUILD_VECTOR, (MVT::ValueType)VT, Expand);
+
+      setOperationAction(ISD::SCALAR_TO_VECTOR, (MVT::ValueType)VT, Expand);
+    }
+
+    // We can custom lower all VECTOR_SHUFFLEs to VPERM; cheaper cases are
+    // handled with merges, splats, etc.
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
+
+    setOperationAction(ISD::AND   , MVT::v4i32, Legal);
+    setOperationAction(ISD::OR    , MVT::v4i32, Legal);
+    setOperationAction(ISD::XOR   , MVT::v4i32, Legal);
+    setOperationAction(ISD::LOAD  , MVT::v4i32, Legal);
+    setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
+    setOperationAction(ISD::STORE , MVT::v4i32, Legal);
+    
+    addRegisterClass(MVT::v4f32, PPC::VRRCRegisterClass);
+    addRegisterClass(MVT::v4i32, PPC::VRRCRegisterClass);
+    addRegisterClass(MVT::v8i16, PPC::VRRCRegisterClass);
+    addRegisterClass(MVT::v16i8, PPC::VRRCRegisterClass);
+    
+    setOperationAction(ISD::MUL, MVT::v4f32, Legal);
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
+    
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+  }
+  
+  setSetCCResultType(MVT::i32);
+  setShiftAmountType(MVT::i32);
+  setSetCCResultContents(ZeroOrOneSetCCResult);
+  
+  if (TM.getSubtarget<PPCSubtarget>().isPPC64()) {
+    setStackPointerRegisterToSaveRestore(PPC::X1);
+    setExceptionPointerRegister(PPC::X3);
+    setExceptionSelectorRegister(PPC::X4);
+  } else {
+    setStackPointerRegisterToSaveRestore(PPC::R1);
+    setExceptionPointerRegister(PPC::R3);
+    setExceptionSelectorRegister(PPC::R4);
+  }
+  
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::BR_CC);
+  setTargetDAGCombine(ISD::BSWAP);
+  
+  computeRegisterProperties();
+}
+
+const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return 0;
+  case PPCISD::FSEL:          return "PPCISD::FSEL";
+  case PPCISD::FCFID:         return "PPCISD::FCFID";
+  case PPCISD::FCTIDZ:        return "PPCISD::FCTIDZ";
+  case PPCISD::FCTIWZ:        return "PPCISD::FCTIWZ";
+  case PPCISD::STFIWX:        return "PPCISD::STFIWX";
+  case PPCISD::VMADDFP:       return "PPCISD::VMADDFP";
+  case PPCISD::VNMSUBFP:      return "PPCISD::VNMSUBFP";
+  case PPCISD::VPERM:         return "PPCISD::VPERM";
+  case PPCISD::Hi:            return "PPCISD::Hi";
+  case PPCISD::Lo:            return "PPCISD::Lo";
+  case PPCISD::DYNALLOC:      return "PPCISD::DYNALLOC";
+  case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
+  case PPCISD::SRL:           return "PPCISD::SRL";
+  case PPCISD::SRA:           return "PPCISD::SRA";
+  case PPCISD::SHL:           return "PPCISD::SHL";
+  case PPCISD::EXTSW_32:      return "PPCISD::EXTSW_32";
+  case PPCISD::STD_32:        return "PPCISD::STD_32";
+  case PPCISD::CALL_ELF:      return "PPCISD::CALL_ELF";
+  case PPCISD::CALL_Macho:    return "PPCISD::CALL_Macho";
+  case PPCISD::MTCTR:         return "PPCISD::MTCTR";
+  case PPCISD::BCTRL_Macho:   return "PPCISD::BCTRL_Macho";
+  case PPCISD::BCTRL_ELF:     return "PPCISD::BCTRL_ELF";
+  case PPCISD::RET_FLAG:      return "PPCISD::RET_FLAG";
+  case PPCISD::MFCR:          return "PPCISD::MFCR";
+  case PPCISD::VCMP:          return "PPCISD::VCMP";
+  case PPCISD::VCMPo:         return "PPCISD::VCMPo";
+  case PPCISD::LBRX:          return "PPCISD::LBRX";
+  case PPCISD::STBRX:         return "PPCISD::STBRX";
+  case PPCISD::COND_BRANCH:   return "PPCISD::COND_BRANCH";
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Node matching predicates, for use by the tblgen matching code.
+//===----------------------------------------------------------------------===//
+
+/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
+static bool isFloatingPointZero(SDOperand Op) {
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
+    return CFP->isExactlyValue(-0.0) || CFP->isExactlyValue(0.0);
+  else if (ISD::isEXTLoad(Op.Val) || ISD::isNON_EXTLoad(Op.Val)) {
+    // Maybe this has already been legalized into the constant pool?
+    if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))
+      if (ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
+        return CFP->isExactlyValue(-0.0) || CFP->isExactlyValue(0.0);
+  }
+  return false;
+}
+
+/// isConstantOrUndef - Op is either an undef node or a ConstantSDNode.  Return
+/// true if Op is undef or if it matches the specified value.
+static bool isConstantOrUndef(SDOperand Op, unsigned Val) {
+  return Op.getOpcode() == ISD::UNDEF || 
+         cast<ConstantSDNode>(Op)->getValue() == Val;
+}
+
+/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
+/// VPKUHUM instruction.
+bool PPC::isVPKUHUMShuffleMask(SDNode *N, bool isUnary) {
+  if (!isUnary) {
+    for (unsigned i = 0; i != 16; ++i)
+      if (!isConstantOrUndef(N->getOperand(i),  i*2+1))
+        return false;
+  } else {
+    for (unsigned i = 0; i != 8; ++i)
+      if (!isConstantOrUndef(N->getOperand(i),  i*2+1) ||
+          !isConstantOrUndef(N->getOperand(i+8),  i*2+1))
+        return false;
+  }
+  return true;
+}
+
+/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
+/// VPKUWUM instruction.
+bool PPC::isVPKUWUMShuffleMask(SDNode *N, bool isUnary) {
+  if (!isUnary) {
+    for (unsigned i = 0; i != 16; i += 2)
+      if (!isConstantOrUndef(N->getOperand(i  ),  i*2+2) ||
+          !isConstantOrUndef(N->getOperand(i+1),  i*2+3))
+        return false;
+  } else {
+    for (unsigned i = 0; i != 8; i += 2)
+      if (!isConstantOrUndef(N->getOperand(i  ),  i*2+2) ||
+          !isConstantOrUndef(N->getOperand(i+1),  i*2+3) ||
+          !isConstantOrUndef(N->getOperand(i+8),  i*2+2) ||
+          !isConstantOrUndef(N->getOperand(i+9),  i*2+3))
+        return false;
+  }
+  return true;
+}
+
+/// isVMerge - Common function, used to match vmrg* shuffles.
+///
+static bool isVMerge(SDNode *N, unsigned UnitSize, 
+                     unsigned LHSStart, unsigned RHSStart) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
+         N->getNumOperands() == 16 && "PPC only supports shuffles by bytes!");
+  assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
+         "Unsupported merge size!");
+  
+  for (unsigned i = 0; i != 8/UnitSize; ++i)     // Step over units
+    for (unsigned j = 0; j != UnitSize; ++j) {   // Step over bytes within unit
+      if (!isConstantOrUndef(N->getOperand(i*UnitSize*2+j),
+                             LHSStart+j+i*UnitSize) ||
+          !isConstantOrUndef(N->getOperand(i*UnitSize*2+UnitSize+j),
+                             RHSStart+j+i*UnitSize))
+        return false;
+    }
+  return true;
+}
+
+/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
+/// a VRGL* instruction with the specified unit size (1,2 or 4 bytes).
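+/// e.g. the non-unary vmrglb mask interleaves the low halves of the inputs:
+/// <8,24, 9,25, 10,26, ..., 15,31> in concatenated-operand element numbering.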
+bool PPC::isVMRGLShuffleMask(SDNode *N, unsigned UnitSize, bool isUnary) {
+  if (!isUnary)
+    return isVMerge(N, UnitSize, 8, 24);
+  return isVMerge(N, UnitSize, 8, 8);
+}
+
+/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
+/// a VRGH* instruction with the specified unit size (1,2 or 4 bytes).
+bool PPC::isVMRGHShuffleMask(SDNode *N, unsigned UnitSize, bool isUnary) {
+  if (!isUnary)
+    return isVMerge(N, UnitSize, 0, 16);
+  return isVMerge(N, UnitSize, 0, 0);
+}
+
+
+/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
+/// amount, otherwise return -1.
+int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
+         N->getNumOperands() == 16 && "PPC only supports shuffles by bytes!");
+  // Find the first non-undef value in the shuffle mask.
+  unsigned i;
+  for (i = 0; i != 16 && N->getOperand(i).getOpcode() == ISD::UNDEF; ++i)
+    /*search*/;
+  
+  if (i == 16) return -1;  // all undef.
+  
+  // Otherwise, check to see if the rest of the elements are consecutively
+  // numbered from this value.
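+  // (vsldoi selects 16 consecutive bytes starting at byte ShiftAmt of the
+  // 32-byte concatenation of the inputs, so e.g. the mask <1,2,...,16>
+  // corresponds to a shift amount of 1.)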
+  unsigned ShiftAmt = cast<ConstantSDNode>(N->getOperand(i))->getValue();
+  if (ShiftAmt < i) return -1;
+  ShiftAmt -= i;
+
+  if (!isUnary) {
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != 16; ++i)
+      if (!isConstantOrUndef(N->getOperand(i), ShiftAmt+i))
+        return -1;
+  } else {
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != 16; ++i)
+      if (!isConstantOrUndef(N->getOperand(i), (ShiftAmt+i) & 15))
+        return -1;
+  }
+  
+  return ShiftAmt;
+}
+
+/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a splat of a single element that is suitable for input to
+/// VSPLTB/VSPLTH/VSPLTW.
+bool PPC::isSplatShuffleMask(SDNode *N, unsigned EltSize) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
+         N->getNumOperands() == 16 &&
+         (EltSize == 1 || EltSize == 2 || EltSize == 4));
+  
+  // This is a splat operation if each element of the permute is the same, and
+  // if the value doesn't reference the second vector.
+  unsigned ElementBase = 0;
+  SDOperand Elt = N->getOperand(0);
+  if (ConstantSDNode *EltV = dyn_cast<ConstantSDNode>(Elt))
+    ElementBase = EltV->getValue();
+  else
+    return false;   // FIXME: Handle UNDEF elements too!
+
+  if (cast<ConstantSDNode>(Elt)->getValue() >= 16)
+    return false;
+  
+  // Check that they are consecutive.
+  for (unsigned i = 1; i != EltSize; ++i) {
+    if (!isa<ConstantSDNode>(N->getOperand(i)) ||
+        cast<ConstantSDNode>(N->getOperand(i))->getValue() != i+ElementBase)
+      return false;
+  }
+  
+  assert(isa<ConstantSDNode>(Elt) && "Invalid VECTOR_SHUFFLE mask!");
+  for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
+    if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(N->getOperand(i)) &&
+           "Invalid VECTOR_SHUFFLE mask!");
+    for (unsigned j = 0; j != EltSize; ++j)
+      if (N->getOperand(i+j) != N->getOperand(j))
+        return false;
+  }
+
+  return true;
+}
+
+/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
+/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
+unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize) {
+  assert(isSplatShuffleMask(N, EltSize));
+  return cast<ConstantSDNode>(N->getOperand(0))->getValue() / EltSize;
+}
+
+/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
+/// by using a vspltis[bhw] instruction of the specified element size, return
+/// the constant being splatted.  The ByteSize field indicates the number of
+/// bytes of each element [124] -> [bhw].
+SDOperand PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
+  SDOperand OpVal(0, 0);
+
+  // If ByteSize of the splat is bigger than the element size of the
+  // build_vector, then we have a case where we are checking for a splat where
+  // multiple elements of the buildvector are folded together into a single
+// logical element of the splat (e.g. "vspltish 1" to splat {0,1}*8).
+  unsigned EltSize = 16/N->getNumOperands();
+  if (EltSize < ByteSize) {
+    unsigned Multiple = ByteSize/EltSize;   // Number of BV entries per spltval.
+    SDOperand UniquedVals[4];
+    assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");
+    
+    // See if all of the elements in the buildvector agree across each chunk.
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+      if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+      // If the element isn't a constant, bail out entirely.
+      if (!isa<ConstantSDNode>(N->getOperand(i))) return SDOperand();
+
+      if (UniquedVals[i&(Multiple-1)].Val == 0)
+        UniquedVals[i&(Multiple-1)] = N->getOperand(i);
+      else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
+        return SDOperand();  // no match.
+    }
+    
+    // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
+    // either constant or undef values that are identical for each chunk.  See
+    // if these chunks can form into a larger vspltis*.
+    
+    // Check to see if all of the leading entries are either 0 or -1.  If
+    // neither, then this won't fit into the immediate field.
+    bool LeadingZero = true;
+    bool LeadingOnes = true;
+    for (unsigned i = 0; i != Multiple-1; ++i) {
+      if (UniquedVals[i].Val == 0) continue;  // Must have been undefs.
+      
+      LeadingZero &= cast<ConstantSDNode>(UniquedVals[i])->isNullValue();
+      LeadingOnes &= cast<ConstantSDNode>(UniquedVals[i])->isAllOnesValue();
+    }
+    // Finally, check the least significant entry.
+    if (LeadingZero) {
+      if (UniquedVals[Multiple-1].Val == 0)
+        return DAG.getTargetConstant(0, MVT::i32);  // 0,0,0,undef
+      int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getValue();
+      if (Val < 16)
+        return DAG.getTargetConstant(Val, MVT::i32);  // 0,0,0,4 -> vspltisw(4)
+    }
+    if (LeadingOnes) {
+      if (UniquedVals[Multiple-1].Val == 0)
+        return DAG.getTargetConstant(~0U, MVT::i32);  // -1,-1,-1,undef
+      int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSignExtended();
+      if (Val >= -16)                            // -1,-1,-1,-2 -> vspltisw(-2)
+        return DAG.getTargetConstant(Val, MVT::i32);
+    }
+    
+    return SDOperand();
+  }
+  
+  // Check to see if this buildvec has a single non-undef value in its elements.
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+    if (OpVal.Val == 0)
+      OpVal = N->getOperand(i);
+    else if (OpVal != N->getOperand(i))
+      return SDOperand();
+  }
+  
+  if (OpVal.Val == 0) return SDOperand();  // All UNDEF: use implicit def.
+  
+  unsigned ValSizeInBytes = 0;
+  uint64_t Value = 0;
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
+    Value = CN->getValue();
+    ValSizeInBytes = MVT::getSizeInBits(CN->getValueType(0))/8;
+  } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
+    assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
+    Value = FloatToBits(CN->getValue());
+    ValSizeInBytes = 4;
+  }
+
+  // If the splat value is larger than the element value, then we can never do
+  // this splat.  The only value whose replicated bits would fit into our
+  // immediate field is zero, and we prefer to use vxor for that.
+  if (ValSizeInBytes < ByteSize) return SDOperand();
+  
+  // If the element value is larger than the splat value, cut it in half and
+  // check to see if the two halves are equal.  Continue doing this until we
+  // get to ByteSize.  This allows us to handle 0x01010101 as 0x01.
+  while (ValSizeInBytes > ByteSize) {
+    ValSizeInBytes >>= 1;
+    
+    // If the top half equals the bottom half, we're still ok.
+    if (((Value >> (ValSizeInBytes*8)) & ((1 << (8*ValSizeInBytes))-1)) !=
+         (Value                        & ((1 << (8*ValSizeInBytes))-1)))
+      return SDOperand();
+  }
+
+  // Properly sign extend the value.
+  int ShAmt = (4-ByteSize)*8;
+  int MaskVal = ((int)Value << ShAmt) >> ShAmt;
+  
+  // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
+  if (MaskVal == 0) return SDOperand();
+
+  // Finally, if this value fits in a 5 bit sext field, return it
+  if (((MaskVal << (32-5)) >> (32-5)) == MaskVal)
+    return DAG.getTargetConstant(MaskVal, MVT::i32);
+  return SDOperand();
+}
+
+//===----------------------------------------------------------------------===//
+//  Addressing Mode Selection
+//===----------------------------------------------------------------------===//
+
+/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
+/// or 64-bit immediate, and if the value can be accurately represented as a
+/// sign extension from a 16-bit value.  If so, this returns true and the
+/// immediate.
+static bool isIntS16Immediate(SDNode *N, short &Imm) {
+  if (N->getOpcode() != ISD::Constant)
+    return false;
+  
+  Imm = (short)cast<ConstantSDNode>(N)->getValue();
+  if (N->getValueType(0) == MVT::i32)
+    return Imm == (int32_t)cast<ConstantSDNode>(N)->getValue();
+  else
+    return Imm == (int64_t)cast<ConstantSDNode>(N)->getValue();
+}
+static bool isIntS16Immediate(SDOperand Op, short &Imm) {
+  return isIntS16Immediate(Op.Val, Imm);
+}
+
+
+/// SelectAddressRegReg - Given the specified address, check to see if it
+/// can be represented as an indexed [r+r] operation.  Returns false if it
+/// can be more efficiently represented with [r+imm].
+bool PPCTargetLowering::SelectAddressRegReg(SDOperand N, SDOperand &Base,
+                                            SDOperand &Index,
+                                            SelectionDAG &DAG) {
+  short imm = 0;
+  if (N.getOpcode() == ISD::ADD) {
+    if (isIntS16Immediate(N.getOperand(1), imm))
+      return false;    // r+i
+    if (N.getOperand(1).getOpcode() == PPCISD::Lo)
+      return false;    // r+i
+    
+    Base = N.getOperand(0);
+    Index = N.getOperand(1);
+    return true;
+  } else if (N.getOpcode() == ISD::OR) {
+    if (isIntS16Immediate(N.getOperand(1), imm))
+      return false;    // fold it as r+i if we can.
+    
+    // If this is an or of disjoint bitfields, we can codegen this as an add
+    // (for better address arithmetic) if the LHS and RHS of the OR are provably
+    // disjoint.
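+    // e.g. (or (shl X, 16), 0xFF): the low 16 bits of the shl are known
+    // zero, so no bit is set in both operands and the or behaves as an add.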
+    uint64_t LHSKnownZero, LHSKnownOne;
+    uint64_t RHSKnownZero, RHSKnownOne;
+    DAG.ComputeMaskedBits(N.getOperand(0), ~0U, LHSKnownZero, LHSKnownOne);
+    
+    if (LHSKnownZero) {
+      DAG.ComputeMaskedBits(N.getOperand(1), ~0U, RHSKnownZero, RHSKnownOne);
+      // If all of the bits are known zero on the LHS or RHS, the add won't
+      // carry.
+      if ((LHSKnownZero | RHSKnownZero) == ~0U) {
+        Base = N.getOperand(0);
+        Index = N.getOperand(1);
+        return true;
+      }
+    }
+  }
+  
+  return false;
+}
+
+/// SelectAddressRegImm - Returns true if the address N can be represented
+/// by a base register plus a signed 16-bit displacement [r+imm], and is not
+/// better represented as reg+reg.
+bool PPCTargetLowering::SelectAddressRegImm(SDOperand N, SDOperand &Disp,
+                                            SDOperand &Base, SelectionDAG &DAG){
+  // If this can be more profitably realized as r+r, fail.
+  if (SelectAddressRegReg(N, Disp, Base, DAG))
+    return false;
+  
+  if (N.getOpcode() == ISD::ADD) {
+    short imm = 0;
+    if (isIntS16Immediate(N.getOperand(1), imm)) {
+      Disp = DAG.getTargetConstant((int)imm & 0xFFFF, MVT::i32);
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
+        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+      } else {
+        Base = N.getOperand(0);
+      }
+      return true; // [r+i]
+    } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
+      // Match LOAD (ADD (X, Lo(G))).
+      assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getValue()
+             && "Cannot handle constant offsets yet!");
+      Disp = N.getOperand(1).getOperand(0);  // The global address.
+      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
+             Disp.getOpcode() == ISD::TargetConstantPool ||
+             Disp.getOpcode() == ISD::TargetJumpTable);
+      Base = N.getOperand(0);
+      return true;  // [&g+r]
+    }
+  } else if (N.getOpcode() == ISD::OR) {
+    short imm = 0;
+    if (isIntS16Immediate(N.getOperand(1), imm)) {
+      // If this is an or of disjoint bitfields, we can codegen this as an add
+      // (for better address arithmetic) if the LHS and RHS of the OR are
+      // provably disjoint.
+      uint64_t LHSKnownZero, LHSKnownOne;
+      DAG.ComputeMaskedBits(N.getOperand(0), ~0U, LHSKnownZero, LHSKnownOne);
+      if ((LHSKnownZero|~(unsigned)imm) == ~0U) {
+        // If all of the bits are known zero on the LHS or RHS, the add won't
+        // carry.
+        Base = N.getOperand(0);
+        Disp = DAG.getTargetConstant((int)imm & 0xFFFF, MVT::i32);
+        return true;
+      }
+    }
+  } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+    // Loading from a constant address.
+    
+    // If this address fits entirely in a 16-bit sext immediate field, codegen
+    // this as "d, 0"
+    short Imm;
+    if (isIntS16Immediate(CN, Imm)) {
+      Disp = DAG.getTargetConstant(Imm, CN->getValueType(0));
+      Base = DAG.getRegister(PPC::R0, CN->getValueType(0));
+      return true;
+    }
+
+    // Handle 32-bit sext immediates with LIS + addr mode.
+    if (CN->getValueType(0) == MVT::i32 ||
+        (int64_t)CN->getValue() == (int)CN->getValue()) {
+      int Addr = (int)CN->getValue();
+      
+      // Otherwise, break this down into an LIS + disp.
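+      // e.g. Addr = 0x12348000: Disp = (short)0x8000 = -32768 and
+      // Base = lis(0x1235), since 0x12350000 + (-32768) = 0x12348000.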
+      Disp = DAG.getTargetConstant((short)Addr, MVT::i32);
+      
+      Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, MVT::i32);
+      unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
+      Base = SDOperand(DAG.getTargetNode(Opc, CN->getValueType(0), Base), 0);
+      return true;
+    }
+  }
+  
+  Disp = DAG.getTargetConstant(0, getPointerTy());
+  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N))
+    Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+  else
+    Base = N;
+  return true;      // [r+0]
+}
+
+/// SelectAddressRegRegOnly - Given the specified address, force it to be
+/// represented as an indexed [r+r] operation.
+bool PPCTargetLowering::SelectAddressRegRegOnly(SDOperand N, SDOperand &Base,
+                                                SDOperand &Index,
+                                                SelectionDAG &DAG) {
+  // Check to see if we can easily represent this as an [r+r] address.  This
+  // will fail if it thinks that the address is more profitably represented as
+  // reg+imm, e.g. where imm = 0.
+  if (SelectAddressRegReg(N, Base, Index, DAG))
+    return true;
+  
+  // If the operand is an addition, always emit this as [r+r], since this is
+  // better (for code size, and execution, as the memop does the add for free)
+  // than emitting an explicit add.
+  if (N.getOpcode() == ISD::ADD) {
+    Base = N.getOperand(0);
+    Index = N.getOperand(1);
+    return true;
+  }
+  
+  // Otherwise, do it the hard way, using R0 as the base register.
+  Base = DAG.getRegister(PPC::R0, N.getValueType());
+  Index = N;
+  return true;
+}
+
+/// SelectAddressRegImmShift - Returns true if the address N can be
+/// represented by a base register plus a signed 14-bit displacement
+/// [r+imm*4].  Suitable for use by STD and friends.
+bool PPCTargetLowering::SelectAddressRegImmShift(SDOperand N, SDOperand &Disp,
+                                                 SDOperand &Base,
+                                                 SelectionDAG &DAG) {
+  // If this can be more profitably realized as r+r, fail.
+  if (SelectAddressRegReg(N, Disp, Base, DAG))
+    return false;
+  
+  if (N.getOpcode() == ISD::ADD) {
+    short imm = 0;
+    if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
+      Disp =  DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
+        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+      } else {
+        Base = N.getOperand(0);
+      }
+      return true; // [r+i]
+    } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
+      // Match LOAD (ADD (X, Lo(G))).
+      assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getValue()
+             && "Cannot handle constant offsets yet!");
+      Disp = N.getOperand(1).getOperand(0);  // The global address.
+      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
+             Disp.getOpcode() == ISD::TargetConstantPool ||
+             Disp.getOpcode() == ISD::TargetJumpTable);
+      Base = N.getOperand(0);
+      return true;  // [&g+r]
+    }
+  } else if (N.getOpcode() == ISD::OR) {
+    short imm = 0;
+    if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
+      // If this is an or of disjoint bitfields, we can codegen this as an add
+      // (for better address arithmetic) if the LHS and RHS of the OR are
+      // provably disjoint.
+      uint64_t LHSKnownZero, LHSKnownOne;
+      DAG.ComputeMaskedBits(N.getOperand(0), ~0U, LHSKnownZero, LHSKnownOne);
+      if ((LHSKnownZero|~(unsigned)imm) == ~0U) {
+        // If all of the bits are known zero on the LHS or RHS, the add won't
+        // carry.
+        Base = N.getOperand(0);
+        Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
+        return true;
+      }
+    }
+  } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+    // Loading from a constant address.  Verify low two bits are clear.
+    if ((CN->getValue() & 3) == 0) {
+      // If this address fits entirely in a 14-bit sext immediate field, codegen
+      // this as "d, 0"
+      short Imm;
+      if (isIntS16Immediate(CN, Imm)) {
+        Disp = DAG.getTargetConstant((unsigned short)Imm >> 2, getPointerTy());
+        Base = DAG.getRegister(PPC::R0, CN->getValueType(0));
+        return true;
+      }
+    
+      // Fold the low-part of 32-bit absolute addresses into addr mode.
+      if (CN->getValueType(0) == MVT::i32 ||
+          (int64_t)CN->getValue() == (int)CN->getValue()) {
+        int Addr = (int)CN->getValue();
+      
+        // Otherwise, break this down into an LIS + disp.
+        Disp = DAG.getTargetConstant((short)Addr >> 2, MVT::i32);
+        
+        Base = DAG.getTargetConstant((Addr-(signed short)Addr) >> 16, MVT::i32);
+        unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
+        Base = SDOperand(DAG.getTargetNode(Opc, CN->getValueType(0), Base), 0);
+        return true;
+      }
+    }
+  }
+  
+  Disp = DAG.getTargetConstant(0, getPointerTy());
+  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N))
+    Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+  else
+    Base = N;
+  return true;      // [r+0]
+}
+
+
+/// getPreIndexedAddressParts - Returns true, and sets the base pointer,
+/// offset pointer, and addressing mode by reference, if the node's address
+/// can be legally represented as a pre-indexed load/store address.
+bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDOperand &Base,
+                                                  SDOperand &Offset,
+                                                  ISD::MemIndexedMode &AM,
+                                                  SelectionDAG &DAG) {
+  // Disabled by default for now.
+  if (!EnablePPCPreinc) return false;
+  
+  SDOperand Ptr;
+  MVT::ValueType VT;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    Ptr = LD->getBasePtr();
+    VT = LD->getLoadedVT();
+    
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    Ptr = ST->getBasePtr();
+    VT  = ST->getStoredVT();
+  } else
+    return false;
+
+  // PowerPC doesn't have preinc load/store instructions for vectors.
+  if (MVT::isVector(VT))
+    return false;
+  
+  // TODO: Check reg+reg first.
+  
+  // LDU/STU use reg+imm*4, others use reg+imm.
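+  // (i64 accesses use the DS instruction form, whose displacement field has
+  // its low two bits reserved, so the offset must be a multiple of 4; that
+  // is what SelectAddressRegImmShift enforces.)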
+  if (VT != MVT::i64) {
+    // reg + imm
+    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG))
+      return false;
+  } else {
+    // reg + imm * 4.
+    if (!SelectAddressRegImmShift(Ptr, Offset, Base, DAG))
+      return false;
+  }
+
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    // PPC64 doesn't have lwau, but it does have lwaux.  Reject preinc load of
+    // sext i32 to i64 when addr mode is r+i.
+    if (LD->getValueType(0) == MVT::i64 && LD->getLoadedVT() == MVT::i32 &&
+        LD->getExtensionType() == ISD::SEXTLOAD &&
+        isa<ConstantSDNode>(Offset))
+      return false;
+  }    
+  
+  AM = ISD::PRE_INC;
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//  LowerOperation implementation
+//===----------------------------------------------------------------------===//
+
+static SDOperand LowerConstantPool(SDOperand Op, SelectionDAG &DAG) {
+  MVT::ValueType PtrVT = Op.getValueType();
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  Constant *C = CP->getConstVal();
+  SDOperand CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
+  SDOperand Zero = DAG.getConstant(0, PtrVT);
+
+  const TargetMachine &TM = DAG.getTarget();
+  
+  SDOperand Hi = DAG.getNode(PPCISD::Hi, PtrVT, CPI, Zero);
+  SDOperand Lo = DAG.getNode(PPCISD::Lo, PtrVT, CPI, Zero);
+
+  // If this is a non-darwin platform, we don't support non-static relo models
+  // yet.
+  if (TM.getRelocationModel() == Reloc::Static ||
+      !TM.getSubtarget<PPCSubtarget>().isDarwin()) {
+    // Generate non-pic code that has direct accesses to the constant pool.
+    // The address of the global is just (hi(&g)+lo(&g)).
+    return DAG.getNode(ISD::ADD, PtrVT, Hi, Lo);
+  }
+  
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    // With PIC, the first instruction is actually "GR+hi(&G)".
+    Hi = DAG.getNode(ISD::ADD, PtrVT,
+                     DAG.getNode(PPCISD::GlobalBaseReg, PtrVT), Hi);
+  }
+  
+  Lo = DAG.getNode(ISD::ADD, PtrVT, Hi, Lo);
+  return Lo;
+}
+
+static SDOperand LowerJumpTable(SDOperand Op, SelectionDAG &DAG) {
+  MVT::ValueType PtrVT = Op.getValueType();
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  SDOperand JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+  SDOperand Zero = DAG.getConstant(0, PtrVT);
+  
+  const TargetMachine &TM = DAG.getTarget();
+
+  SDOperand Hi = DAG.getNode(PPCISD::Hi, PtrVT, JTI, Zero);
+  SDOperand Lo = DAG.getNode(PPCISD::Lo, PtrVT, JTI, Zero);
+
+  // If this is a non-darwin platform, we don't support non-static relo models
+  // yet.
+  if (TM.getRelocationModel() == Reloc::Static ||
+      !TM.getSubtarget<PPCSubtarget>().isDarwin()) {
+    // Generate non-pic code that has direct accesses to the jump table.
+    // The address of the jump table is just (hi(&jt)+lo(&jt)).
+    return DAG.getNode(ISD::ADD, PtrVT, Hi, Lo);
+  }
+  
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    // With PIC, the first instruction is actually "GR+hi(&G)".
+    Hi = DAG.getNode(ISD::ADD, PtrVT,
+                     DAG.getNode(PPCISD::GlobalBaseReg, PtrVT), Hi);
+  }
+  
+  Lo = DAG.getNode(ISD::ADD, PtrVT, Hi, Lo);
+  return Lo;
+}
+
+static SDOperand LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG) {
+  assert(0 && "TLS not implemented for PPC.");
+}
+
+static SDOperand LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG) {
+  MVT::ValueType PtrVT = Op.getValueType();
+  GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
+  GlobalValue *GV = GSDN->getGlobal();
+  SDOperand GA = DAG.getTargetGlobalAddress(GV, PtrVT, GSDN->getOffset());
+  SDOperand Zero = DAG.getConstant(0, PtrVT);
+  
+  const TargetMachine &TM = DAG.getTarget();
+
+  SDOperand Hi = DAG.getNode(PPCISD::Hi, PtrVT, GA, Zero);
+  SDOperand Lo = DAG.getNode(PPCISD::Lo, PtrVT, GA, Zero);
+
+  // If this is a non-darwin platform, we don't support non-static relo models
+  // yet.
+  if (TM.getRelocationModel() == Reloc::Static ||
+      !TM.getSubtarget<PPCSubtarget>().isDarwin()) {
+    // Generate non-pic code that has direct accesses to globals.
+    // The address of the global is just (hi(&g)+lo(&g)).
+    return DAG.getNode(ISD::ADD, PtrVT, Hi, Lo);
+  }
+  
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    // With PIC, the first instruction is actually "GR+hi(&G)".
+    Hi = DAG.getNode(ISD::ADD, PtrVT,
+                     DAG.getNode(PPCISD::GlobalBaseReg, PtrVT), Hi);
+  }
+  
+  Lo = DAG.getNode(ISD::ADD, PtrVT, Hi, Lo);
+  
+  if (!TM.getSubtarget<PPCSubtarget>().hasLazyResolverStub(GV))
+    return Lo;
+  
+  // If the global is weak or external, we have to go through the lazy
+  // resolution stub.
+  return DAG.getLoad(PtrVT, DAG.getEntryNode(), Lo, NULL, 0);
+}
+
+static SDOperand LowerSETCC(SDOperand Op, SelectionDAG &DAG) {
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  
+  // If we're comparing for equality to zero, expose the fact that this is
+  // implemented as a ctlz/srl pair on ppc, so that the dag combiner can
+  // fold the new nodes.
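+  // (cntlzw of a nonzero i32 is at most 31; only zero yields 32, so shifting
+  // right by log2(32) == 5 produces exactly the desired 0/1 result.)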
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+    if (C->isNullValue() && CC == ISD::SETEQ) {
+      MVT::ValueType VT = Op.getOperand(0).getValueType();
+      SDOperand Zext = Op.getOperand(0);
+      if (VT < MVT::i32) {
+        VT = MVT::i32;
+        Zext = DAG.getNode(ISD::ZERO_EXTEND, VT, Op.getOperand(0));
+      } 
+      unsigned Log2b = Log2_32(MVT::getSizeInBits(VT));
+      SDOperand Clz = DAG.getNode(ISD::CTLZ, VT, Zext);
+      SDOperand Scc = DAG.getNode(ISD::SRL, VT, Clz,
+                                  DAG.getConstant(Log2b, MVT::i32));
+      return DAG.getNode(ISD::TRUNCATE, MVT::i32, Scc);
+    }
+    // Leave comparisons against 0 and -1 alone for now, since they're usually 
+    // optimized.  FIXME: revisit this when we can custom lower all setcc
+    // optimizations.
+    if (C->isAllOnesValue() || C->isNullValue())
+      return SDOperand();
+  }
+  
+  // If we have an integer seteq/setne, turn it into a compare against zero
+  // by xor'ing the rhs with the lhs, which is faster than setting a
+  // condition register, reading it back out, and masking the correct bit.  The
+  // normal approach here uses sub to do this instead of xor.  Using xor exposes
+  // the result to other bit-twiddling opportunities.
+  MVT::ValueType LHSVT = Op.getOperand(0).getValueType();
+  if (MVT::isInteger(LHSVT) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    MVT::ValueType VT = Op.getValueType();
+    SDOperand Sub = DAG.getNode(ISD::XOR, LHSVT, Op.getOperand(0), 
+                                Op.getOperand(1));
+    return DAG.getSetCC(VT, Sub, DAG.getConstant(0, LHSVT), CC);
+  }
+  return SDOperand();
+}
+
+static SDOperand LowerVAARG(SDOperand Op, SelectionDAG &DAG,
+                              int VarArgsFrameIndex,
+                              int VarArgsStackOffset,
+                              unsigned VarArgsNumGPR,
+                              unsigned VarArgsNumFPR,
+                              const PPCSubtarget &Subtarget) {
+  assert(0 && "VAARG in ELF32 ABI not implemented yet!");
+  return SDOperand();
+}
+
+static SDOperand LowerVASTART(SDOperand Op, SelectionDAG &DAG,
+                              int VarArgsFrameIndex,
+                              int VarArgsStackOffset,
+                              unsigned VarArgsNumGPR,
+                              unsigned VarArgsNumFPR,
+                              const PPCSubtarget &Subtarget) {
+
+  if (Subtarget.isMachoABI()) {
+    // vastart just stores the address of the VarArgsFrameIndex slot into the
+    // memory location argument.
+    MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+    SDOperand FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
+    SrcValueSDNode *SV = cast<SrcValueSDNode>(Op.getOperand(2));
+    return DAG.getStore(Op.getOperand(0), FR, Op.getOperand(1), SV->getValue(),
+                        SV->getOffset());
+  }
+
+  // For ELF 32 ABI we follow the layout of the va_list struct.
+  // We assume the given va_list is already allocated.
+  //
+  // typedef struct {
+  //  char gpr;     /* index into the array of 8 GPRs
+  //                 * stored in the register save area
+  //                 * gpr=0 corresponds to r3,
+  //                 * gpr=1 to r4, etc.
+  //                 */
+  //  char fpr;     /* index into the array of 8 FPRs
+  //                 * stored in the register save area
+  //                 * fpr=0 corresponds to f1,
+  //                 * fpr=1 to f2, etc.
+  //                 */
+  //  char *overflow_arg_area;
+  //                /* location on stack that holds
+  //                 * the next overflow argument
+  //                 */
+  //  char *reg_save_area;
+  //               /* where r3:r10 and f1:f8 (if saved)
+  //                * are stored
+  //                */
+  // } va_list[1];
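+  //
+  // Illustratively, with 4-byte pointers the stores below write gpr at
+  // offset 0, fpr at offset 1, overflow_arg_area at offset 4, and
+  // reg_save_area at offset 8 within the va_list.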
+
+
+  SDOperand ArgGPR = DAG.getConstant(VarArgsNumGPR, MVT::i8);
+  SDOperand ArgFPR = DAG.getConstant(VarArgsNumFPR, MVT::i8);
+  
+
+  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  
+  SDOperand StackOffset = DAG.getFrameIndex(VarArgsStackOffset, PtrVT);
+  SDOperand FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
+  
+  SDOperand ConstFrameOffset = DAG.getConstant(MVT::getSizeInBits(PtrVT)/8,
+                                               PtrVT);
+  SDOperand ConstStackOffset = DAG.getConstant(MVT::getSizeInBits(PtrVT)/8 - 1,
+                                               PtrVT);
+  SDOperand ConstFPROffset   = DAG.getConstant(1, PtrVT);
+  
+  SrcValueSDNode *SV = cast<SrcValueSDNode>(Op.getOperand(2));
+  
+  // Store first byte : number of int regs
+  SDOperand firstStore = DAG.getStore(Op.getOperand(0), ArgGPR,
+                                      Op.getOperand(1), SV->getValue(),
+                                      SV->getOffset());
+  SDOperand nextPtr = DAG.getNode(ISD::ADD, PtrVT, Op.getOperand(1),
+                                  ConstFPROffset);
+  
+  // Store second byte : number of float regs
+  SDOperand secondStore = DAG.getStore(firstStore, ArgFPR, nextPtr,
+                                       SV->getValue(), SV->getOffset());
+  nextPtr = DAG.getNode(ISD::ADD, PtrVT, nextPtr, ConstStackOffset);
+  
+  // Store second word : arguments given on stack
+  SDOperand thirdStore = DAG.getStore(secondStore, StackOffset, nextPtr,
+                                      SV->getValue(), SV->getOffset());
+  nextPtr = DAG.getNode(ISD::ADD, PtrVT, nextPtr, ConstFrameOffset);
+
+  // Store third word : arguments given in registers
+  return DAG.getStore(thirdStore, FR, nextPtr, SV->getValue(),
+                      SV->getOffset());
+
+}
+
+#include "PPCGenCallingConv.inc"
+
+/// GetFPR - Get the set of FP registers that should be allocated for arguments,
+/// depending on which subtarget is selected.
+static const unsigned *GetFPR(const PPCSubtarget &Subtarget) {
+  if (Subtarget.isMachoABI()) {
+    static const unsigned FPR[] = {
+      PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+      PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13
+    };
+    return FPR;
+  }
+  
+  
+  static const unsigned FPR[] = {
+    PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+    PPC::F8
+  };
+  return FPR;
+}
+
+static SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG,
+                                       int &VarArgsFrameIndex,
+                                       int &VarArgsStackOffset,
+                                       unsigned &VarArgsNumGPR,
+                                       unsigned &VarArgsNumFPR,
+                                       const PPCSubtarget &Subtarget) {
+  // TODO: add description of PPC stack frame format, or at least some docs.
+  //
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  SSARegMap *RegMap = MF.getSSARegMap();
+  SmallVector<SDOperand, 8> ArgValues;
+  SDOperand Root = Op.getOperand(0);
+  
+  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = PtrVT == MVT::i64;
+  bool isMachoABI = Subtarget.isMachoABI();
+  bool isELF32_ABI = Subtarget.isELF32_ABI();
+  unsigned PtrByteSize = isPPC64 ? 8 : 4;
+
+  unsigned ArgOffset = PPCFrameInfo::getLinkageSize(isPPC64, isMachoABI);
+  
+  static const unsigned GPR_32[] = {           // 32-bit registers.
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  static const unsigned GPR_64[] = {           // 64-bit registers.
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  
+  static const unsigned *FPR = GetFPR(Subtarget);
+  
+  static const unsigned VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+
+  const unsigned Num_GPR_Regs = sizeof(GPR_32)/sizeof(GPR_32[0]);
+  const unsigned Num_FPR_Regs = isMachoABI ? 13 : 8;
+  const unsigned Num_VR_Regs  = sizeof( VR)/sizeof( VR[0]);
+
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+  
+  const unsigned *GPR = isPPC64 ? GPR_64 : GPR_32;
+  
+  // Add DAG nodes to load the arguments or copy them out of registers.  On
+  // entry to a function on PPC, the arguments start after the linkage area,
+  // although the first ones are often in registers.
+  // 
+  // In the ELF 32 ABI, GPRs and the stack are double-word aligned: an argument
+  // represented with two words (long long or double) must be copied to an
+  // even GPR_idx value or to an even ArgOffset value.
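+  // For example, for f(int a, long long b) the int is passed in r3 and the
+  // long long in r5:r6, leaving r4 unused.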
+
+  for (unsigned ArgNo = 0, e = Op.Val->getNumValues()-1; ArgNo != e; ++ArgNo) {
+    SDOperand ArgVal;
+    bool needsLoad = false;
+    MVT::ValueType ObjectVT = Op.getValue(ArgNo).getValueType();
+    unsigned ObjSize = MVT::getSizeInBits(ObjectVT)/8;
+    unsigned ArgSize = ObjSize;
+    unsigned Flags = cast<ConstantSDNode>(Op.getOperand(ArgNo+3))->getValue();
+    unsigned AlignFlag = 1 << ISD::ParamFlags::OrigAlignmentOffs;
+    // See if next argument requires stack alignment in ELF
+    bool Expand = (ObjectVT == MVT::f64) || ((ArgNo + 1 < e) &&
+      (cast<ConstantSDNode>(Op.getOperand(ArgNo+4))->getValue() & AlignFlag) &&
+      (!(Flags & AlignFlag)));
+
+    unsigned CurArgOffset = ArgOffset;
+    switch (ObjectVT) {
+    default: assert(0 && "Unhandled argument type!");
+    case MVT::i32:
+      // Double word align in ELF
+      if (Expand && isELF32_ABI) GPR_idx += (GPR_idx % 2);
+      if (GPR_idx != Num_GPR_Regs) {
+        unsigned VReg = RegMap->createVirtualRegister(&PPC::GPRCRegClass);
+        MF.addLiveIn(GPR[GPR_idx], VReg);
+        ArgVal = DAG.getCopyFromReg(Root, VReg, MVT::i32);
+        ++GPR_idx;
+      } else {
+        needsLoad = true;
+        ArgSize = PtrByteSize;
+      }
+      // Stack align in ELF
+      if (needsLoad && Expand && isELF32_ABI) 
+        ArgOffset += ((ArgOffset/4) % 2) * PtrByteSize;
+      // All int arguments reserve stack space in Macho ABI.
+      if (isMachoABI || needsLoad) ArgOffset += PtrByteSize;
+      break;
+      
+    case MVT::i64:  // PPC64
+      if (GPR_idx != Num_GPR_Regs) {
+        unsigned VReg = RegMap->createVirtualRegister(&PPC::G8RCRegClass);
+        MF.addLiveIn(GPR[GPR_idx], VReg);
+        ArgVal = DAG.getCopyFromReg(Root, VReg, MVT::i64);
+        ++GPR_idx;
+      } else {
+        needsLoad = true;
+      }
+      // All int arguments reserve stack space in Macho ABI.
+      if (isMachoABI || needsLoad) ArgOffset += 8;
+      break;
+      
+    case MVT::f32:
+    case MVT::f64:
+      // Every 4 bytes of argument space consumes one of the GPRs available for
+      // argument passing.
+      if (GPR_idx != Num_GPR_Regs && isMachoABI) {
+        ++GPR_idx;
+        if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
+          ++GPR_idx;
+      }
+      if (FPR_idx != Num_FPR_Regs) {
+        unsigned VReg;
+        if (ObjectVT == MVT::f32)
+          VReg = RegMap->createVirtualRegister(&PPC::F4RCRegClass);
+        else
+          VReg = RegMap->createVirtualRegister(&PPC::F8RCRegClass);
+        MF.addLiveIn(FPR[FPR_idx], VReg);
+        ArgVal = DAG.getCopyFromReg(Root, VReg, ObjectVT);
+        ++FPR_idx;
+      } else {
+        needsLoad = true;
+      }
+      
+      // Stack align in ELF
+      if (needsLoad && Expand && isELF32_ABI)
+        ArgOffset += ((ArgOffset/4) % 2) * PtrByteSize;
+      // All FP arguments reserve stack space in Macho ABI.
+      if (isMachoABI || needsLoad) ArgOffset += isPPC64 ? 8 : ObjSize;
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      // Note that vector arguments in registers don't reserve stack space.
+      if (VR_idx != Num_VR_Regs) {
+        unsigned VReg = RegMap->createVirtualRegister(&PPC::VRRCRegClass);
+        MF.addLiveIn(VR[VR_idx], VReg);
+        ArgVal = DAG.getCopyFromReg(Root, VReg, ObjectVT);
+        ++VR_idx;
+      } else {
+        // This should be simple, but requires getting 16-byte aligned stack
+        // values.
+        assert(0 && "Loading VR argument not implemented yet!");
+        needsLoad = true;
+      }
+      break;
+    }
+    
+    // We need to load the argument to a virtual register if we determined
+    // above that we ran out of physical registers of the appropriate type.
+    if (needsLoad) {
+      // If the argument is actually used, emit a load from the right stack
+      // slot.
+      if (!Op.Val->hasNUsesOfValue(0, ArgNo)) {
+        int FI = MFI->CreateFixedObject(ObjSize,
+                                        CurArgOffset + (ArgSize - ObjSize));
+        SDOperand FIN = DAG.getFrameIndex(FI, PtrVT);
+        ArgVal = DAG.getLoad(ObjectVT, Root, FIN, NULL, 0);
+      } else {
+        // Don't emit a dead load.
+        ArgVal = DAG.getNode(ISD::UNDEF, ObjectVT);
+      }
+    }
+    
+    ArgValues.push_back(ArgVal);
+  }
+  
+  // If the function takes a variable number of arguments, make a frame index
+  // for the start of the first vararg value... for expansion of llvm.va_start.
+  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+  if (isVarArg) {
+    
+    int depth;
+    if (isELF32_ABI) {
+      VarArgsNumGPR = GPR_idx;
+      VarArgsNumFPR = FPR_idx;
+   
+      // Make room for Num_GPR_Regs, Num_FPR_Regs and for a possible frame
+      // pointer.
+      depth = -(Num_GPR_Regs * MVT::getSizeInBits(PtrVT)/8 +
+                Num_FPR_Regs * MVT::getSizeInBits(MVT::f64)/8 +
+                MVT::getSizeInBits(PtrVT)/8);
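+      // e.g. with 4-byte pointers: depth = -(8*4 + 8*8 + 4) = -100.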
+      
+      VarArgsStackOffset = MFI->CreateFixedObject(MVT::getSizeInBits(PtrVT)/8,
+                                                  ArgOffset);
+
+    }
+    else
+      depth = ArgOffset;
+    
+    VarArgsFrameIndex = MFI->CreateFixedObject(MVT::getSizeInBits(PtrVT)/8,
+                                               depth);
+    SDOperand FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
+    
+    SmallVector<SDOperand, 8> MemOps;
+    
+    // In ELF 32 ABI, the fixed integer arguments of a variadic function are
+    // stored to the VarArgsFrameIndex on the stack.
+    if (isELF32_ABI) {
+      for (GPR_idx = 0; GPR_idx != VarArgsNumGPR; ++GPR_idx) {
+        SDOperand Val = DAG.getRegister(GPR[GPR_idx], PtrVT);
+        SDOperand Store = DAG.getStore(Root, Val, FIN, NULL, 0);
+        MemOps.push_back(Store);
+        // Increment the address by four for the next argument to store
+        SDOperand PtrOff = DAG.getConstant(MVT::getSizeInBits(PtrVT)/8, PtrVT);
+        FIN = DAG.getNode(ISD::ADD, PtrOff.getValueType(), FIN, PtrOff);
+      }
+    }
+
+    // If this function is vararg, store any remaining integer argument regs
+    // to their spots on the stack so that they may be loaded by dereferencing
+    // the result of va_next.
+    for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
+      unsigned VReg;
+      if (isPPC64)
+        VReg = RegMap->createVirtualRegister(&PPC::G8RCRegClass);
+      else
+        VReg = RegMap->createVirtualRegister(&PPC::GPRCRegClass);
+
+      MF.addLiveIn(GPR[GPR_idx], VReg);
+      SDOperand Val = DAG.getCopyFromReg(Root, VReg, PtrVT);
+      SDOperand Store = DAG.getStore(Val.getValue(1), Val, FIN, NULL, 0);
+      MemOps.push_back(Store);
+      // Increment the address by four for the next argument to store
+      SDOperand PtrOff = DAG.getConstant(MVT::getSizeInBits(PtrVT)/8, PtrVT);
+      FIN = DAG.getNode(ISD::ADD, PtrOff.getValueType(), FIN, PtrOff);
+    }
+
+    // In ELF 32 ABI, the double arguments are stored to the VarArgsFrameIndex
+    // on the stack.
+    if (isELF32_ABI) {
+      for (FPR_idx = 0; FPR_idx != VarArgsNumFPR; ++FPR_idx) {
+        SDOperand Val = DAG.getRegister(FPR[FPR_idx], MVT::f64);
+        SDOperand Store = DAG.getStore(Root, Val, FIN, NULL, 0);
+        MemOps.push_back(Store);
+        // Increment the address by eight for the next argument to store
+        SDOperand PtrOff = DAG.getConstant(MVT::getSizeInBits(MVT::f64)/8,
+                                           PtrVT);
+        FIN = DAG.getNode(ISD::ADD, PtrOff.getValueType(), FIN, PtrOff);
+      }
+
+      for (; FPR_idx != Num_FPR_Regs; ++FPR_idx) {
+        unsigned VReg;
+        VReg = RegMap->createVirtualRegister(&PPC::F8RCRegClass);
+
+        MF.addLiveIn(FPR[FPR_idx], VReg);
+        SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::f64);
+        SDOperand Store = DAG.getStore(Val.getValue(1), Val, FIN, NULL, 0);
+        MemOps.push_back(Store);
+        // Increment the address by eight for the next argument to store
+        SDOperand PtrOff = DAG.getConstant(MVT::getSizeInBits(MVT::f64)/8,
+                                           PtrVT);
+        FIN = DAG.getNode(ISD::ADD, PtrOff.getValueType(), FIN, PtrOff);
+      }
+    }
+
+    if (!MemOps.empty())
+      Root = DAG.getNode(ISD::TokenFactor, MVT::Other,&MemOps[0],MemOps.size());
+  }
+  
+  ArgValues.push_back(Root);
+ 
+  // Return the new list of results.
+  std::vector<MVT::ValueType> RetVT(Op.Val->value_begin(),
+                                    Op.Val->value_end());
+  return DAG.getNode(ISD::MERGE_VALUES, RetVT, &ArgValues[0], ArgValues.size());
+}
+
+/// isBLACompatibleAddress - Return the immediate to use if the specified
+/// 32-bit value is representable in the immediate field of a BxA instruction.
+static SDNode *isBLACompatibleAddress(SDOperand Op, SelectionDAG &DAG) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+  if (!C) return 0;
+  
+  int Addr = C->getValue();
+  if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
+      (Addr << 6 >> 6) != Addr)
+    return 0;  // Top 6 bits have to be sext of immediate.
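+  // e.g. 0x01FFFFFC passes both checks and encodes as the immediate
+  // 0x007FFFFF, while 0x02000000 fails the sign-extension check above.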
+  
+  return DAG.getConstant((int)C->getValue() >> 2, MVT::i32).Val;
+}
+
+
+static SDOperand LowerCALL(SDOperand Op, SelectionDAG &DAG,
+                           const PPCSubtarget &Subtarget) {
+  SDOperand Chain  = Op.getOperand(0);
+  bool isVarArg    = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+  SDOperand Callee = Op.getOperand(4);
+  unsigned NumOps  = (Op.getNumOperands() - 5) / 2;
+  
+  bool isMachoABI = Subtarget.isMachoABI();
+  bool isELF32_ABI  = Subtarget.isELF32_ABI();
+
+  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = PtrVT == MVT::i64;
+  unsigned PtrByteSize = isPPC64 ? 8 : 4;
+  
+  // args_to_use will accumulate outgoing args for the PPCISD::CALL case in
+  // SelectExpr to use to put the arguments in the appropriate registers.
+  std::vector<SDOperand> args_to_use;
+  
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, and parameter passing area.  We start with 24/48 bytes, which is
+  // prereserved space for [SP][CR][LR][3 x unused].
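+  // (For 32-bit Macho that is 4 (SP) + 4 (CR) + 4 (LR) + 3*4 unused = 24
+  // bytes; the 64-bit slots are 8 bytes each, giving 48.)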
+  unsigned NumBytes = PPCFrameInfo::getLinkageSize(isPPC64, isMachoABI);
+  
+  // Add up all the space actually used.
+  for (unsigned i = 0; i != NumOps; ++i) {
+    unsigned ArgSize =MVT::getSizeInBits(Op.getOperand(5+2*i).getValueType())/8;
+    ArgSize = std::max(ArgSize, PtrByteSize);
+    NumBytes += ArgSize;
+  }
+
+  // The prolog code of the callee may store up to 8 GPR argument registers to
+  // the stack, allowing va_start to index over them in memory if it's varargs.
+  // Because we cannot tell if this is needed on the caller side, we have to
+  // conservatively assume that it is needed.  As such, make sure we have at
+  // least enough stack space for the caller to store the 8 GPRs.
+  NumBytes = std::max(NumBytes,
+                      PPCFrameInfo::getMinCallFrameSize(isPPC64, isMachoABI));
+  
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain,
+                               DAG.getConstant(NumBytes, PtrVT));
+  
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDOperand StackPtr;
+  if (isPPC64)
+    StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+  else
+    StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+  
+  // Figure out which arguments are going to go in registers, and which in
+  // memory.  Also, if this is a vararg function, floating point arguments
+  // must be stored to our stack, and loaded into integer regs as well, if
+  // any integer regs are available for argument passing.
+  unsigned ArgOffset = PPCFrameInfo::getLinkageSize(isPPC64, isMachoABI);
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+  
+  static const unsigned GPR_32[] = {           // 32-bit registers.
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  static const unsigned GPR_64[] = {           // 64-bit registers.
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const unsigned *FPR = GetFPR(Subtarget);
+  
+  static const unsigned VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+  const unsigned NumGPRs = sizeof(GPR_32)/sizeof(GPR_32[0]);
+  const unsigned NumFPRs = isMachoABI ? 13 : 8;
+  const unsigned NumVRs  = sizeof( VR)/sizeof( VR[0]);
+  
+  const unsigned *GPR = isPPC64 ? GPR_64 : GPR_32;
+
+  std::vector<std::pair<unsigned, SDOperand> > RegsToPass;
+  SmallVector<SDOperand, 8> MemOpChains;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    bool inMem = false;
+    SDOperand Arg = Op.getOperand(5+2*i);
+    unsigned Flags = cast<ConstantSDNode>(Op.getOperand(5+2*i+1))->getValue();
+    unsigned AlignFlag = 1 << ISD::ParamFlags::OrigAlignmentOffs;
+    // See if next argument requires stack alignment in ELF
+    unsigned next = 5+2*(i+1)+1;
+    bool Expand = (Arg.getValueType() == MVT::f64) || ((i + 1 < NumOps) &&
+      (cast<ConstantSDNode>(Op.getOperand(next))->getValue() & AlignFlag) &&
+      (!(Flags & AlignFlag)));
+
+    // PtrOff will be used to store the current argument to the stack if a
+    // register cannot be found for it.
+    SDOperand PtrOff;
+    
+    // Stack align in ELF 32
+    if (isELF32_ABI && Expand)
+      PtrOff = DAG.getConstant(ArgOffset + ((ArgOffset/4) % 2) * PtrByteSize,
+                               StackPtr.getValueType());
+    else
+      PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+
+    PtrOff = DAG.getNode(ISD::ADD, PtrVT, StackPtr, PtrOff);
+
+    // On PPC64, promote integers to 64-bit values.
+    if (isPPC64 && Arg.getValueType() == MVT::i32) {
+      unsigned ExtOp = (Flags & 1) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+
+      Arg = DAG.getNode(ExtOp, MVT::i64, Arg);
+    }
+    
+    switch (Arg.getValueType()) {
+    default: assert(0 && "Unexpected ValueType for argument!");
+    case MVT::i32:
+    case MVT::i64:
+      // Double word align in ELF
+      if (isELF32_ABI && Expand) GPR_idx += (GPR_idx % 2);
+      if (GPR_idx != NumGPRs) {
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+      } else {
+        MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
+        inMem = true;
+      }
+      if (inMem || isMachoABI) {
+        // Stack align in ELF
+        if (isELF32_ABI && Expand)
+          ArgOffset += ((ArgOffset/4) % 2) * PtrByteSize;
+
+        ArgOffset += PtrByteSize;
+      }
+      break;
+    case MVT::f32:
+    case MVT::f64:
+      if (isVarArg) {
+        // Float varargs need to be promoted to double.
+        if (Arg.getValueType() == MVT::f32)
+          Arg = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Arg);
+      }
+    
+      if (FPR_idx != NumFPRs) {
+        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
+
+        if (isVarArg) {
+          SDOperand Store = DAG.getStore(Chain, Arg, PtrOff, NULL, 0);
+          MemOpChains.push_back(Store);
+
+          // Float varargs are always shadowed in available integer registers
+          if (GPR_idx != NumGPRs) {
+            SDOperand Load = DAG.getLoad(PtrVT, Store, PtrOff, NULL, 0);
+            MemOpChains.push_back(Load.getValue(1));
+            if (isMachoABI) RegsToPass.push_back(std::make_pair(GPR[GPR_idx++],
+                                                                Load));
+          }
+          if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){
+            SDOperand ConstFour = DAG.getConstant(4, PtrOff.getValueType());
+            PtrOff = DAG.getNode(ISD::ADD, PtrVT, PtrOff, ConstFour);
+            SDOperand Load = DAG.getLoad(PtrVT, Store, PtrOff, NULL, 0);
+            MemOpChains.push_back(Load.getValue(1));
+            if (isMachoABI) RegsToPass.push_back(std::make_pair(GPR[GPR_idx++],
+                                                                Load));
+          }
+        } else {
+          // If we have any FPRs remaining, we may also have GPRs remaining.
+          // Args passed in FPRs consume either 1 (f32) or 2 (f64) available
+          // GPRs.
+          if (isMachoABI) {
+            if (GPR_idx != NumGPRs)
+              ++GPR_idx;
+            if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
+                !isPPC64)  // PPC64 has 64-bit GPR's obviously :)
+              ++GPR_idx;
+          }
+        }
+      } else {
+        MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
+        inMem = true;
+      }
+      if (inMem || isMachoABI) {
+        // Stack align in ELF
+        if (isELF32_ABI && Expand)
+          ArgOffset += ((ArgOffset/4) % 2) * PtrByteSize;
+        if (isPPC64)
+          ArgOffset += 8;
+        else
+          ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8;
+      }
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      assert(!isVarArg && "Don't support passing vectors to varargs yet!");
+      assert(VR_idx != NumVRs &&
+             "Don't support passing more than 12 vector args yet!");
+      RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
+      break;
+    }
+  }
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+  
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDOperand InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  }
+ 
+  // With the ELF 32 ABI, set CR6 to true if this is a vararg call.
+  if (isVarArg && isELF32_ABI) {
+    SDOperand SetCR(DAG.getTargetNode(PPC::SETCR, MVT::i32), 0);
+    Chain = DAG.getCopyToReg(Chain, PPC::CR6, SetCR, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  std::vector<MVT::ValueType> NodeTys;
+  NodeTys.push_back(MVT::Other);   // Returns a chain
+  NodeTys.push_back(MVT::Flag);    // Returns a flag for retval copy to use.
+
+  SmallVector<SDOperand, 8> Ops;
+  unsigned CallOpc = isMachoABI? PPCISD::CALL_Macho : PPCISD::CALL_ELF;
+  
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), Callee.getValueType());
+  else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType());
+  else if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG))
+    // If this is an absolute destination address, use the munged value.
+    Callee = SDOperand(Dest, 0);
+  else {
+    // Otherwise, this is an indirect call.  We have to use a MTCTR/BCTRL pair
+    // to do the call; we can't use PPCISD::CALL.
+    SDOperand MTCTROps[] = {Chain, Callee, InFlag};
+    Chain = DAG.getNode(PPCISD::MTCTR, NodeTys, MTCTROps, 2+(InFlag.Val!=0));
+    InFlag = Chain.getValue(1);
+    
+    // Copy the callee address into R12 on Darwin.
+    if (isMachoABI) {
+      Chain = DAG.getCopyToReg(Chain, PPC::R12, Callee, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+
+    NodeTys.clear();
+    NodeTys.push_back(MVT::Other);
+    NodeTys.push_back(MVT::Flag);
+    Ops.push_back(Chain);
+    CallOpc = isMachoABI ? PPCISD::BCTRL_Macho : PPCISD::BCTRL_ELF;
+    Callee.Val = 0;
+  }
+
+  // If this is a direct call, pass the chain and the callee.
+  if (Callee.Val) {
+    Ops.push_back(Chain);
+    Ops.push_back(Callee);
+  }
+  
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first, 
+                                  RegsToPass[i].second.getValueType()));
+  
+  if (InFlag.Val)
+    Ops.push_back(InFlag);
+  Chain = DAG.getNode(CallOpc, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  SDOperand ResultVals[3];
+  unsigned NumResults = 0;
+  NodeTys.clear();
+  
+  // If the call has results, copy the values out of the ret val registers.
+  switch (Op.Val->getValueType(0)) {
+  default: assert(0 && "Unexpected ret value!");
+  case MVT::Other: break;
+  case MVT::i32:
+    if (Op.Val->getValueType(1) == MVT::i32) {
+      Chain = DAG.getCopyFromReg(Chain, PPC::R3, MVT::i32, InFlag).getValue(1);
+      ResultVals[0] = Chain.getValue(0);
+      Chain = DAG.getCopyFromReg(Chain, PPC::R4, MVT::i32,
+                                 Chain.getValue(2)).getValue(1);
+      ResultVals[1] = Chain.getValue(0);
+      NumResults = 2;
+      NodeTys.push_back(MVT::i32);
+    } else {
+      Chain = DAG.getCopyFromReg(Chain, PPC::R3, MVT::i32, InFlag).getValue(1);
+      ResultVals[0] = Chain.getValue(0);
+      NumResults = 1;
+    }
+    NodeTys.push_back(MVT::i32);
+    break;
+  case MVT::i64:
+    Chain = DAG.getCopyFromReg(Chain, PPC::X3, MVT::i64, InFlag).getValue(1);
+    ResultVals[0] = Chain.getValue(0);
+    NumResults = 1;
+    NodeTys.push_back(MVT::i64);
+    break;
+  case MVT::f32:
+  case MVT::f64:
+    Chain = DAG.getCopyFromReg(Chain, PPC::F1, Op.Val->getValueType(0),
+                               InFlag).getValue(1);
+    ResultVals[0] = Chain.getValue(0);
+    NumResults = 1;
+    NodeTys.push_back(Op.Val->getValueType(0));
+    break;
+  case MVT::v4f32:
+  case MVT::v4i32:
+  case MVT::v8i16:
+  case MVT::v16i8:
+    Chain = DAG.getCopyFromReg(Chain, PPC::V2, Op.Val->getValueType(0),
+                                   InFlag).getValue(1);
+    ResultVals[0] = Chain.getValue(0);
+    NumResults = 1;
+    NodeTys.push_back(Op.Val->getValueType(0));
+    break;
+  }
+  
+  Chain = DAG.getNode(ISD::CALLSEQ_END, MVT::Other, Chain,
+                      DAG.getConstant(NumBytes, PtrVT));
+  NodeTys.push_back(MVT::Other);
+  
+  // If the function returns void, just return the chain.
+  if (NumResults == 0)
+    return Chain;
+  
+  // Otherwise, merge everything together with a MERGE_VALUES node.
+  ResultVals[NumResults++] = Chain;
+  SDOperand Res = DAG.getNode(ISD::MERGE_VALUES, NodeTys,
+                              ResultVals, NumResults);
+  return Res.getValue(Op.ResNo);
+}
+
+static SDOperand LowerRET(SDOperand Op, SelectionDAG &DAG, TargetMachine &TM) {
+  SmallVector<CCValAssign, 16> RVLocs;
+  unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
+  bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+  CCState CCInfo(CC, isVarArg, TM, RVLocs);
+  CCInfo.AnalyzeReturn(Op.Val, RetCC_PPC);
+  
+  // If this is the first return lowered for this function, add the regs to the
+  // liveout set for the function.
+  if (DAG.getMachineFunction().liveout_empty()) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i)
+      DAG.getMachineFunction().addLiveOut(RVLocs[i].getLocReg());
+  }
+
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand Flag;
+  
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    Chain = DAG.getCopyToReg(Chain, VA.getLocReg(), Op.getOperand(i*2+1), Flag);
+    Flag = Chain.getValue(1);
+  }
+
+  if (Flag.Val)
+    return DAG.getNode(PPCISD::RET_FLAG, MVT::Other, Chain, Flag);
+  else
+    return DAG.getNode(PPCISD::RET_FLAG, MVT::Other, Chain);
+}
+
+static SDOperand LowerSTACKRESTORE(SDOperand Op, SelectionDAG &DAG,
+                                   const PPCSubtarget &Subtarget) {
+  // When we pop the dynamic allocation we need to restore the SP link.
+  
+  // Get the correct type for pointers.
+  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  // Construct the stack pointer operand.
+  bool IsPPC64 = Subtarget.isPPC64();
+  unsigned SP = IsPPC64 ? PPC::X1 : PPC::R1;
+  SDOperand StackPtr = DAG.getRegister(SP, PtrVT);
+
+  // Get the operands for the STACKRESTORE.
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand SaveSP = Op.getOperand(1);
+  
+  // Load the old link SP.
+  SDOperand LoadLinkSP = DAG.getLoad(PtrVT, Chain, StackPtr, NULL, 0);
+  
+  // Restore the stack pointer.
+  Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), SP, SaveSP);
+  
+  // Store the old link SP.
+  return DAG.getStore(Chain, LoadLinkSP, StackPtr, NULL, 0);
+}
+
+static SDOperand LowerDYNAMIC_STACKALLOC(SDOperand Op, SelectionDAG &DAG,
+                                         const PPCSubtarget &Subtarget) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool IsPPC64 = Subtarget.isPPC64();
+  bool isMachoABI = Subtarget.isMachoABI();
+
+  // Get current frame pointer save index.  The users of this index will be
+  // primarily DYNALLOC instructions.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  int FPSI = FI->getFramePointerSaveIndex();
+   
+  // If the frame pointer save index hasn't been defined yet.
+  if (!FPSI) {
+    // Find out the fixed offset of the frame pointer save area.
+    int FPOffset = PPCFrameInfo::getFramePointerSaveOffset(IsPPC64, isMachoABI);
+    
+    // Allocate the frame index for frame pointer save area.
+    FPSI = MF.getFrameInfo()->CreateFixedObject(IsPPC64? 8 : 4, FPOffset); 
+    // Save the result.
+    FI->setFramePointerSaveIndex(FPSI);                      
+  }
+
+  // Get the inputs.
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand Size  = Op.getOperand(1);
+  
+  // Get the correct type for pointers.
+  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  // Negate the size.
+  SDOperand NegSize = DAG.getNode(ISD::SUB, PtrVT,
+                                  DAG.getConstant(0, PtrVT), Size);
+  // Construct a node for the frame pointer save index.
+  SDOperand FPSIdx = DAG.getFrameIndex(FPSI, PtrVT);
+  // Build a DYNALLOC node.
+  SDOperand Ops[3] = { Chain, NegSize, FPSIdx };
+  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
+  return DAG.getNode(PPCISD::DYNALLOC, VTs, Ops, 3);
+}
+
+
+/// LowerSELECT_CC - Lower floating point select_cc's into the fsel instruction
+/// when possible.
+static SDOperand LowerSELECT_CC(SDOperand Op, SelectionDAG &DAG) {
+  // Not FP? Not a fsel.
+  if (!MVT::isFloatingPoint(Op.getOperand(0).getValueType()) ||
+      !MVT::isFloatingPoint(Op.getOperand(2).getValueType()))
+    return SDOperand();
+  
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  
+  // Cannot handle SETEQ/SETNE.
+  if (CC == ISD::SETEQ || CC == ISD::SETNE) return SDOperand();
+  
+  MVT::ValueType ResVT = Op.getValueType();
+  MVT::ValueType CmpVT = Op.getOperand(0).getValueType();
+  SDOperand LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+  SDOperand TV  = Op.getOperand(2), FV  = Op.getOperand(3);
+  
+  // If the RHS of the comparison is a 0.0, we don't need to do the
+  // subtraction at all.
+  if (isFloatingPointZero(RHS))
+    switch (CC) {
+    default: break;       // SETUO etc aren't handled by fsel.
+    case ISD::SETULT:
+    case ISD::SETOLT:
+    case ISD::SETLT:
+      std::swap(TV, FV);  // fsel is natively setge, swap operands for setlt
+    case ISD::SETUGE:
+    case ISD::SETOGE:
+    case ISD::SETGE:
+      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
+        LHS = DAG.getNode(ISD::FP_EXTEND, MVT::f64, LHS);
+      return DAG.getNode(PPCISD::FSEL, ResVT, LHS, TV, FV);
+    case ISD::SETUGT:
+    case ISD::SETOGT:
+    case ISD::SETGT:
+      std::swap(TV, FV);  // fsel is natively setge, swap operands for setgt
+    case ISD::SETULE:
+    case ISD::SETOLE:
+    case ISD::SETLE:
+      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
+        LHS = DAG.getNode(ISD::FP_EXTEND, MVT::f64, LHS);
+      return DAG.getNode(PPCISD::FSEL, ResVT,
+                         DAG.getNode(ISD::FNEG, MVT::f64, LHS), TV, FV);
+    }
+
+  SDOperand Cmp;
+  switch (CC) {
+  default: break;       // SETUO etc aren't handled by fsel.
+  case ISD::SETULT:
+  case ISD::SETOLT:
+  case ISD::SETLT:
+    Cmp = DAG.getNode(ISD::FSUB, CmpVT, LHS, RHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Cmp);
+    return DAG.getNode(PPCISD::FSEL, ResVT, Cmp, FV, TV);
+  case ISD::SETUGE:
+  case ISD::SETOGE:
+  case ISD::SETGE:
+    Cmp = DAG.getNode(ISD::FSUB, CmpVT, LHS, RHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Cmp);
+    return DAG.getNode(PPCISD::FSEL, ResVT, Cmp, TV, FV);
+  case ISD::SETUGT:
+  case ISD::SETOGT:
+  case ISD::SETGT:
+    Cmp = DAG.getNode(ISD::FSUB, CmpVT, RHS, LHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Cmp);
+    return DAG.getNode(PPCISD::FSEL, ResVT, Cmp, FV, TV);
+  case ISD::SETULE:
+  case ISD::SETOLE:
+  case ISD::SETLE:
+    Cmp = DAG.getNode(ISD::FSUB, CmpVT, RHS, LHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Cmp);
+    return DAG.getNode(PPCISD::FSEL, ResVT, Cmp, TV, FV);
+  }
+  return SDOperand();
+}
+
+static SDOperand LowerFP_TO_SINT(SDOperand Op, SelectionDAG &DAG) {
+  assert(MVT::isFloatingPoint(Op.getOperand(0).getValueType()));
+  SDOperand Src = Op.getOperand(0);
+  if (Src.getValueType() == MVT::f32)
+    Src = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Src);
+  
+  SDOperand Tmp;
+  switch (Op.getValueType()) {
+  default: assert(0 && "Unhandled FP_TO_SINT type in custom expander!");
+  case MVT::i32:
+    Tmp = DAG.getNode(PPCISD::FCTIWZ, MVT::f64, Src);
+    break;
+  case MVT::i64:
+    Tmp = DAG.getNode(PPCISD::FCTIDZ, MVT::f64, Src);
+    break;
+  }
+  
+  // Convert the FP value to an int value through memory.
+  SDOperand Bits = DAG.getNode(ISD::BIT_CONVERT, MVT::i64, Tmp);
+  if (Op.getValueType() == MVT::i32)
+    Bits = DAG.getNode(ISD::TRUNCATE, MVT::i32, Bits);
+  return Bits;
+}
+
+static SDOperand LowerSINT_TO_FP(SDOperand Op, SelectionDAG &DAG) {
+  if (Op.getOperand(0).getValueType() == MVT::i64) {
+    SDOperand Bits = DAG.getNode(ISD::BIT_CONVERT, MVT::f64, Op.getOperand(0));
+    SDOperand FP = DAG.getNode(PPCISD::FCFID, MVT::f64, Bits);
+    if (Op.getValueType() == MVT::f32)
+      FP = DAG.getNode(ISD::FP_ROUND, MVT::f32, FP);
+    return FP;
+  }
+  
+  assert(Op.getOperand(0).getValueType() == MVT::i32 &&
+         "Unhandled SINT_TO_FP type in custom expander!");
+  // Since we only generate this in 64-bit mode, we can take advantage of
+  // 64-bit registers.  In particular, sign extend the input value into the
+  // 64-bit register with extsw, store the WHOLE 64-bit value into the stack
+  // then lfd it and fcfid it.
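+  // Roughly, the emitted sequence is (illustrative register choices):
+  //   extsw rT, rSrc      ; sign-extend the i32 to 64 bits
+  //   std   rT, slot(r1)  ; store the whole doubleword to the stack slot
+  //   lfd   fT, slot(r1)  ; reload the bits as an f64
+  //   fcfid fD, fT        ; convert to double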
+  MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+  int FrameIdx = FrameInfo->CreateStackObject(8, 8);
+  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  SDOperand FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+  
+  SDOperand Ext64 = DAG.getNode(PPCISD::EXTSW_32, MVT::i32,
+                                Op.getOperand(0));
+  
+  // STD the extended value into the stack slot.
+  SDOperand Store = DAG.getNode(PPCISD::STD_32, MVT::Other,
+                                DAG.getEntryNode(), Ext64, FIdx,
+                                DAG.getSrcValue(NULL));
+  // Load the value as a double.
+  SDOperand Ld = DAG.getLoad(MVT::f64, Store, FIdx, NULL, 0);
+  
+  // FCFID it and return it.
+  SDOperand FP = DAG.getNode(PPCISD::FCFID, MVT::f64, Ld);
+  if (Op.getValueType() == MVT::f32)
+    FP = DAG.getNode(ISD::FP_ROUND, MVT::f32, FP);
+  return FP;
+}
+
+static SDOperand LowerSHL_PARTS(SDOperand Op, SelectionDAG &DAG) {
+  assert(Op.getNumOperands() == 3 && Op.getValueType() == MVT::i32 &&
+         Op.getOperand(1).getValueType() == MVT::i32 && "Unexpected SHL!");
+  
+  // Expand into a bunch of logical ops.  Note that these ops
+  // depend on the PPC behavior for oversized shift amounts.
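+  // On PPC, slw/srw with a (6-bit) shift amount in [32,63] produce zero, so
+  // the terms computed with (32-Amt) and (Amt-32) below drop out whenever
+  // they would otherwise be out of range, and the correct 64-bit result
+  // falls out for any Amt in [0,63].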
+  SDOperand Lo = Op.getOperand(0);
+  SDOperand Hi = Op.getOperand(1);
+  SDOperand Amt = Op.getOperand(2);
+  
+  SDOperand Tmp1 = DAG.getNode(ISD::SUB, MVT::i32,
+                               DAG.getConstant(32, MVT::i32), Amt);
+  SDOperand Tmp2 = DAG.getNode(PPCISD::SHL, MVT::i32, Hi, Amt);
+  SDOperand Tmp3 = DAG.getNode(PPCISD::SRL, MVT::i32, Lo, Tmp1);
+  SDOperand Tmp4 = DAG.getNode(ISD::OR , MVT::i32, Tmp2, Tmp3);
+  SDOperand Tmp5 = DAG.getNode(ISD::ADD, MVT::i32, Amt,
+                               DAG.getConstant(-32U, MVT::i32));
+  SDOperand Tmp6 = DAG.getNode(PPCISD::SHL, MVT::i32, Lo, Tmp5);
+  SDOperand OutHi = DAG.getNode(ISD::OR, MVT::i32, Tmp4, Tmp6);
+  SDOperand OutLo = DAG.getNode(PPCISD::SHL, MVT::i32, Lo, Amt);
+  SDOperand OutOps[] = { OutLo, OutHi };
+  return DAG.getNode(ISD::MERGE_VALUES, DAG.getVTList(MVT::i32, MVT::i32),
+                     OutOps, 2);
+}
+
+static SDOperand LowerSRL_PARTS(SDOperand Op, SelectionDAG &DAG) {
+  assert(Op.getNumOperands() == 3 && Op.getValueType() == MVT::i32 &&
+         Op.getOperand(1).getValueType() == MVT::i32 && "Unexpected SRL!");
+  
+  // Otherwise, expand into a bunch of logical ops.  Note that these ops
+  // depend on the PPC behavior for oversized shift amounts.
+  SDOperand Lo = Op.getOperand(0);
+  SDOperand Hi = Op.getOperand(1);
+  SDOperand Amt = Op.getOperand(2);
+  
+  SDOperand Tmp1 = DAG.getNode(ISD::SUB, MVT::i32,
+                               DAG.getConstant(32, MVT::i32), Amt);
+  SDOperand Tmp2 = DAG.getNode(PPCISD::SRL, MVT::i32, Lo, Amt);
+  SDOperand Tmp3 = DAG.getNode(PPCISD::SHL, MVT::i32, Hi, Tmp1);
+  SDOperand Tmp4 = DAG.getNode(ISD::OR , MVT::i32, Tmp2, Tmp3);
+  SDOperand Tmp5 = DAG.getNode(ISD::ADD, MVT::i32, Amt,
+                               DAG.getConstant(-32U, MVT::i32));
+  SDOperand Tmp6 = DAG.getNode(PPCISD::SRL, MVT::i32, Hi, Tmp5);
+  SDOperand OutLo = DAG.getNode(ISD::OR, MVT::i32, Tmp4, Tmp6);
+  SDOperand OutHi = DAG.getNode(PPCISD::SRL, MVT::i32, Hi, Amt);
+  SDOperand OutOps[] = { OutLo, OutHi };
+  return DAG.getNode(ISD::MERGE_VALUES, DAG.getVTList(MVT::i32, MVT::i32),
+                     OutOps, 2);
+}
+
+static SDOperand LowerSRA_PARTS(SDOperand Op, SelectionDAG &DAG) {
+  assert(Op.getNumOperands() == 3 && Op.getValueType() == MVT::i32 &&
+         Op.getOperand(1).getValueType() == MVT::i32 && "Unexpected SRA!");
+  
+  // Otherwise, expand into a bunch of logical ops, followed by a select_cc.
+  SDOperand Lo = Op.getOperand(0);
+  SDOperand Hi = Op.getOperand(1);
+  SDOperand Amt = Op.getOperand(2);
+  
+  SDOperand Tmp1 = DAG.getNode(ISD::SUB, MVT::i32,
+                               DAG.getConstant(32, MVT::i32), Amt);
+  SDOperand Tmp2 = DAG.getNode(PPCISD::SRL, MVT::i32, Lo, Amt);
+  SDOperand Tmp3 = DAG.getNode(PPCISD::SHL, MVT::i32, Hi, Tmp1);
+  SDOperand Tmp4 = DAG.getNode(ISD::OR , MVT::i32, Tmp2, Tmp3);
+  SDOperand Tmp5 = DAG.getNode(ISD::ADD, MVT::i32, Amt,
+                               DAG.getConstant(-32U, MVT::i32));
+  SDOperand Tmp6 = DAG.getNode(PPCISD::SRA, MVT::i32, Hi, Tmp5);
+  SDOperand OutHi = DAG.getNode(PPCISD::SRA, MVT::i32, Hi, Amt);
+  SDOperand OutLo = DAG.getSelectCC(Tmp5, DAG.getConstant(0, MVT::i32),
+                                    Tmp4, Tmp6, ISD::SETLE);
+  SDOperand OutOps[] = { OutLo, OutHi };
+  return DAG.getNode(ISD::MERGE_VALUES, DAG.getVTList(MVT::i32, MVT::i32),
+                     OutOps, 2);
+}
+
+//===----------------------------------------------------------------------===//
+// Vector related lowering.
+//
+
+// If this is a vector of constants or undefs, get the bits.  A bit in
+// UndefBits is set if the corresponding element of the vector is an 
+// ISD::UNDEF value.  For undefs, the corresponding VectorBits values are
+// zero.   Return true if this is not an array of constants, false if it is.
+//
+static bool GetConstantBuildVectorBits(SDNode *BV, uint64_t VectorBits[2],
+                                       uint64_t UndefBits[2]) {
+  // Start with zero'd results.
+  VectorBits[0] = VectorBits[1] = UndefBits[0] = UndefBits[1] = 0;
+  
+  unsigned EltBitSize = MVT::getSizeInBits(BV->getOperand(0).getValueType());
+  for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
+    SDOperand OpVal = BV->getOperand(i);
+    
+    unsigned PartNo = i >= e/2;     // In the upper 128 bits?
+    unsigned SlotNo = e/2 - (i & (e/2-1))-1;  // Which subpiece of the uint64_t.
+
+    uint64_t EltBits = 0;
+    if (OpVal.getOpcode() == ISD::UNDEF) {
+      uint64_t EltUndefBits = ~0U >> (32-EltBitSize);
+      UndefBits[PartNo] |= EltUndefBits << (SlotNo*EltBitSize);
+      continue;
+    } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
+      EltBits = CN->getValue() & (~0U >> (32-EltBitSize));
+    } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
+      assert(CN->getValueType(0) == MVT::f32 &&
+             "Only one legal FP vector type!");
+      EltBits = FloatToBits(CN->getValue());
+    } else {
+      // Nonconstant element.
+      return true;
+    }
+    
+    VectorBits[PartNo] |= EltBits << (SlotNo*EltBitSize);
+  }
+  
+  //printf("%llx %llx  %llx %llx\n", 
+  //       VectorBits[0], VectorBits[1], UndefBits[0], UndefBits[1]);
+  return false;
+}
+
+// If this is a splat (repetition) of a value across the whole vector, return
+// the smallest size that splats it.  For example, "0x01010101010101..." is a
+// splat of 0x01, 0x0101, and 0x01010101.  We return SplatBits = 0x01 and 
+// SplatSize = 1 byte.
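+//
+// For example, the 128-bit pattern 0xFF00FF00...FF00 is reported as
+// SplatBits = 0xFF00 with SplatSize = 2.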
+static bool isConstantSplat(const uint64_t Bits128[2], 
+                            const uint64_t Undef128[2],
+                            unsigned &SplatBits, unsigned &SplatUndef,
+                            unsigned &SplatSize) {
+  
+  // Don't let undefs prevent splats from matching.  See if the top 64-bits are
+  // the same as the lower 64-bits, ignoring undefs.
+  if ((Bits128[0] & ~Undef128[1]) != (Bits128[1] & ~Undef128[0]))
+    return false;  // Can't be a splat if two pieces don't match.
+  
+  uint64_t Bits64  = Bits128[0] | Bits128[1];
+  uint64_t Undef64 = Undef128[0] & Undef128[1];
+  
+  // Check that the top 32-bits are the same as the lower 32-bits, ignoring
+  // undefs.
+  if ((Bits64 & (~Undef64 >> 32)) != ((Bits64 >> 32) & ~Undef64))
+    return false;  // Can't be a splat if two pieces don't match.
+
+  uint32_t Bits32  = uint32_t(Bits64) | uint32_t(Bits64 >> 32);
+  uint32_t Undef32 = uint32_t(Undef64) & uint32_t(Undef64 >> 32);
+
+  // If the top 16-bits are different than the lower 16-bits, ignoring
+  // undefs, we have an i32 splat.
+  if ((Bits32 & (~Undef32 >> 16)) != ((Bits32 >> 16) & ~Undef32)) {
+    SplatBits = Bits32;
+    SplatUndef = Undef32;
+    SplatSize = 4;
+    return true;
+  }
+  
+  uint16_t Bits16  = uint16_t(Bits32)  | uint16_t(Bits32 >> 16);
+  uint16_t Undef16 = uint16_t(Undef32) & uint16_t(Undef32 >> 16);
+
+  // If the top 8-bits are different than the lower 8-bits, ignoring
+  // undefs, we have an i16 splat.
+  if ((Bits16 & (uint16_t(~Undef16) >> 8)) != ((Bits16 >> 8) & ~Undef16)) {
+    SplatBits = Bits16;
+    SplatUndef = Undef16;
+    SplatSize = 2;
+    return true;
+  }
+  
+  // Otherwise, we have an 8-bit splat.
+  SplatBits  = uint8_t(Bits16)  | uint8_t(Bits16 >> 8);
+  SplatUndef = uint8_t(Undef16) & uint8_t(Undef16 >> 8);
+  SplatSize = 1;
+  return true;
+}
+
+/// BuildSplatI - Build a canonical splati of Val with an element size of
+/// SplatSize.  Cast the result to VT.
+static SDOperand BuildSplatI(int Val, unsigned SplatSize, MVT::ValueType VT,
+                             SelectionDAG &DAG) {
+  assert(Val >= -16 && Val <= 15 && "vsplti is out of range!");
+
+  static const MVT::ValueType VTys[] = { // canonical VT to use for each size.
+    MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
+  };
+
+  MVT::ValueType ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
+  
+  // Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
+  if (Val == -1)
+    SplatSize = 1;
+  
+  MVT::ValueType CanonicalVT = VTys[SplatSize-1];
+  
+  // Build a canonical splat for this value.
+  SDOperand Elt = DAG.getConstant(Val, MVT::getVectorElementType(CanonicalVT));
+  SmallVector<SDOperand, 8> Ops;
+  Ops.assign(MVT::getVectorNumElements(CanonicalVT), Elt);
+  SDOperand Res = DAG.getNode(ISD::BUILD_VECTOR, CanonicalVT,
+                              &Ops[0], Ops.size());
+  return DAG.getNode(ISD::BIT_CONVERT, ReqVT, Res);
+}
+
+/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDOperand BuildIntrinsicOp(unsigned IID, SDOperand LHS, SDOperand RHS,
+                                  SelectionDAG &DAG, 
+                                  MVT::ValueType DestVT = MVT::Other) {
+  if (DestVT == MVT::Other) DestVT = LHS.getValueType();
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DestVT,
+                     DAG.getConstant(IID, MVT::i32), LHS, RHS);
+}
+
+/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDOperand BuildIntrinsicOp(unsigned IID, SDOperand Op0, SDOperand Op1,
+                                  SDOperand Op2, SelectionDAG &DAG, 
+                                  MVT::ValueType DestVT = MVT::Other) {
+  if (DestVT == MVT::Other) DestVT = Op0.getValueType();
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DestVT,
+                     DAG.getConstant(IID, MVT::i32), Op0, Op1, Op2);
+}
+
+
+/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
+/// amount.  The result has the specified value type.
+static SDOperand BuildVSLDOI(SDOperand LHS, SDOperand RHS, unsigned Amt,
+                             MVT::ValueType VT, SelectionDAG &DAG) {
+  // Force LHS/RHS to be the right type.
+  LHS = DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, LHS);
+  RHS = DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, RHS);
+  
+  SDOperand Ops[16];
+  for (unsigned i = 0; i != 16; ++i)
+    Ops[i] = DAG.getConstant(i+Amt, MVT::i32);
+  SDOperand T = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v16i8, LHS, RHS,
+                            DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8, Ops,16));
+  return DAG.getNode(ISD::BIT_CONVERT, VT, T);
+}
+
+// If this is a case we can't handle, return null and let the default
+// expansion code take care of it.  If we CAN select this case, and if it
+// selects to a single instruction, return Op.  Otherwise, if we can codegen
+// this case more efficiently than a constant pool load, lower it to the
+// sequence of ops that should be used.
+static SDOperand LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
+  // If this is a vector of constants or undefs, get the bits.  A bit in
+  // UndefBits is set if the corresponding element of the vector is an 
+  // ISD::UNDEF value.  For undefs, the corresponding VectorBits values are
+  // zero. 
+  uint64_t VectorBits[2];
+  uint64_t UndefBits[2];
+  if (GetConstantBuildVectorBits(Op.Val, VectorBits, UndefBits))
+    return SDOperand();   // Not a constant vector.
+  
+  // If this is a splat (repetition) of a value across the whole vector, return
+  // the smallest size that splats it.  For example, "0x01010101010101..." is a
+  // splat of 0x01, 0x0101, and 0x01010101.  We return SplatBits = 0x01 and 
+  // SplatSize = 1 byte.
+  unsigned SplatBits, SplatUndef, SplatSize;
+  if (isConstantSplat(VectorBits, UndefBits, SplatBits, SplatUndef, SplatSize)){
+    bool HasAnyUndefs = (UndefBits[0] | UndefBits[1]) != 0;
+    
+    // First, handle single instruction cases.
+    
+    // All zeros?
+    if (SplatBits == 0) {
+      // Canonicalize all zero vectors to be v4i32.
+      if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
+        SDOperand Z = DAG.getConstant(0, MVT::i32);
+        Z = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Z, Z, Z, Z);
+        Op = DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Z);
+      }
+      return Op;
+    }
+
+    // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
+    int32_t SextVal= int32_t(SplatBits << (32-8*SplatSize)) >> (32-8*SplatSize);
+    if (SextVal >= -16 && SextVal <= 15)
+      return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG);
+    
+    
+    // Two instruction sequences.
+    
+    // If this value is in the range [-32,30] and is even, use:
+    //    tmp = VSPLTI[bhw], result = add tmp, tmp
+    if (SextVal >= -32 && SextVal <= 30 && (SextVal & 1) == 0) {
+      Op = BuildSplatI(SextVal >> 1, SplatSize, Op.getValueType(), DAG);
+      return DAG.getNode(ISD::ADD, Op.getValueType(), Op, Op);
+    }
+    
+    // If this is 0x8000_0000 x 4, turn into vspltisw + vslw.  If it is 
+    // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000).  This is important
+    // for fneg/fabs.
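+    // Concretely: vspltisw -1 gives 0xFFFFFFFF per element, vslw by that
+    // same vector shifts each element left by 31 to give 0x8000_0000, and
+    // the final xor with the all-ones vector yields 0x7FFF_FFFF.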
+    if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
+      // Make -1 and vspltisw -1:
+      SDOperand OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG);
+      
+      // Make the VSLW intrinsic, computing 0x8000_0000.
+      SDOperand Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV, 
+                                       OnesV, DAG);
+      
+      // xor by OnesV to invert it.
+      Res = DAG.getNode(ISD::XOR, MVT::v4i32, Res, OnesV);
+      return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Res);
+    }
+
+    // Check to see if this is a wide variety of vsplti*, binop self cases.
+    unsigned SplatBitSize = SplatSize*8;
+    static const signed char SplatCsts[] = {
+      -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
+      -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
+    };
+    
+    for (unsigned idx = 0; idx < sizeof(SplatCsts)/sizeof(SplatCsts[0]); ++idx){
+      // Indirect through the SplatCsts array so that we favor 'vsplti -1' for
+      // cases which are ambiguous (e.g. formation of 0x8000_0000).
+      int i = SplatCsts[idx];
+      
+      // Figure out what shift amount will be used by altivec if shifted by i in
+      // this splat size.
+      unsigned TypeShiftAmt = i & (SplatBitSize-1);
+      
+      // vsplti + shl self.
+      if (SextVal == (i << (int)TypeShiftAmt)) {
+        SDOperand Res = BuildSplatI(i, SplatSize, MVT::Other, DAG);
+        static const unsigned IIDs[] = { // Intrinsic to use for each size.
+          Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
+          Intrinsic::ppc_altivec_vslw
+        };
+        Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG);
+        return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Res);
+      }
+      
+      // vsplti + srl self.
+      if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
+        SDOperand Res = BuildSplatI(i, SplatSize, MVT::Other, DAG);
+        static const unsigned IIDs[] = { // Intrinsic to use for each size.
+          Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
+          Intrinsic::ppc_altivec_vsrw
+        };
+        Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG);
+        return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Res);
+      }
+      
+      // vsplti + sra self.
+      if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
+        SDOperand Res = BuildSplatI(i, SplatSize, MVT::Other, DAG);
+        static const unsigned IIDs[] = { // Intrinsic to use for each size.
+          Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
+          Intrinsic::ppc_altivec_vsraw
+        };
+        Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG);
+        return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Res);
+      }
+      
+      // vsplti + rol self.
+      if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
+                           ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
+        SDOperand Res = BuildSplatI(i, SplatSize, MVT::Other, DAG);
+        static const unsigned IIDs[] = { // Intrinsic to use for each size.
+          Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
+          Intrinsic::ppc_altivec_vrlw
+        };
+        Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG);
+        return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Res);
+      }
+
+      // t = vsplti c, result = vsldoi t, t, 1
+      if (SextVal == ((i << 8) | (i >> (TypeShiftAmt-8)))) {
+        SDOperand T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG);
+        return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG);
+      }
+      // t = vsplti c, result = vsldoi t, t, 2
+      if (SextVal == ((i << 16) | (i >> (TypeShiftAmt-16)))) {
+        SDOperand T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG);
+        return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG);
+      }
+      // t = vsplti c, result = vsldoi t, t, 3
+      if (SextVal == ((i << 24) | (i >> (TypeShiftAmt-24)))) {
+        SDOperand T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG);
+        return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG);
+      }
+    }
+    
+    // Three instruction sequences.
+    
+    // Odd, in range [17,31]:  (vsplti C)-(vsplti -16).
+    if (SextVal >= 0 && SextVal <= 31) {
+      SDOperand LHS = BuildSplatI(SextVal-16, SplatSize, MVT::Other, DAG);
+      SDOperand RHS = BuildSplatI(-16, SplatSize, MVT::Other, DAG);
+      LHS = DAG.getNode(ISD::SUB, Op.getValueType(), LHS, RHS);
+      return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), LHS);
+    }
+    // Odd, in range [-31,-17]:  (vsplti C)+(vsplti -16).
+    if (SextVal >= -31 && SextVal <= 0) {
+      SDOperand LHS = BuildSplatI(SextVal+16, SplatSize, MVT::Other, DAG);
+      SDOperand RHS = BuildSplatI(-16, SplatSize, MVT::Other, DAG);
+      LHS = DAG.getNode(ISD::ADD, Op.getValueType(), LHS, RHS);
+      return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), LHS);
+    }
+  }
+    
+  return SDOperand();
+}
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDOperand GeneratePerfectShuffle(unsigned PFEntry, SDOperand LHS,
+                                        SDOperand RHS, SelectionDAG &DAG) {
+  unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  unsigned LHSID  = (PFEntry >> 13) & ((1 << 13)-1);
+  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
+  
+  enum {
+    OP_COPY = 0,  // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+    OP_VMRGHW,
+    OP_VMRGLW,
+    OP_VSPLTISW0,
+    OP_VSPLTISW1,
+    OP_VSPLTISW2,
+    OP_VSPLTISW3,
+    OP_VSLDOI4,
+    OP_VSLDOI8,
+    OP_VSLDOI12
+  };
+  
+  if (OpNum == OP_COPY) {
+    if (LHSID == (1*9+2)*9+3) return LHS;
+    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
+    return RHS;
+  }
+  
+  SDOperand OpLHS, OpRHS;
+  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG);
+  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG);
+  
+  unsigned ShufIdxs[16];
+  switch (OpNum) {
+  default: assert(0 && "Unknown i32 permute!");
+  case OP_VMRGHW:
+    ShufIdxs[ 0] =  0; ShufIdxs[ 1] =  1; ShufIdxs[ 2] =  2; ShufIdxs[ 3] =  3;
+    ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
+    ShufIdxs[ 8] =  4; ShufIdxs[ 9] =  5; ShufIdxs[10] =  6; ShufIdxs[11] =  7;
+    ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
+    break;
+  case OP_VMRGLW:
+    ShufIdxs[ 0] =  8; ShufIdxs[ 1] =  9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
+    ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
+    ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
+    ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
+    break;
+  case OP_VSPLTISW0:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+0;
+    break;
+  case OP_VSPLTISW1:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+4;
+    break;
+  case OP_VSPLTISW2:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+8;
+    break;
+  case OP_VSPLTISW3:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+12;
+    break;
+  case OP_VSLDOI4:
+    return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG);
+  case OP_VSLDOI8:
+    return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG);
+  case OP_VSLDOI12:
+    return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG);
+  }
+  SDOperand Ops[16];
+  for (unsigned i = 0; i != 16; ++i)
+    Ops[i] = DAG.getConstant(ShufIdxs[i], MVT::i32);
+  
+  return DAG.getNode(ISD::VECTOR_SHUFFLE, OpLHS.getValueType(), OpLHS, OpRHS,
+                     DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8, Ops, 16));
+}
+
+/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE.  If this
+/// is a shuffle we can handle in a single instruction, return it.  Otherwise,
+/// return the code it can be lowered into.  Worst case, it can always be
+/// lowered into a vperm.
+static SDOperand LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand V1 = Op.getOperand(0);
+  SDOperand V2 = Op.getOperand(1);
+  SDOperand PermMask = Op.getOperand(2);
+  
+  // Cases that are handled by instructions that take permute immediates
+  // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
+  // selected by the instruction selector.
+  if (V2.getOpcode() == ISD::UNDEF) {
+    if (PPC::isSplatShuffleMask(PermMask.Val, 1) ||
+        PPC::isSplatShuffleMask(PermMask.Val, 2) ||
+        PPC::isSplatShuffleMask(PermMask.Val, 4) ||
+        PPC::isVPKUWUMShuffleMask(PermMask.Val, true) ||
+        PPC::isVPKUHUMShuffleMask(PermMask.Val, true) ||
+        PPC::isVSLDOIShuffleMask(PermMask.Val, true) != -1 ||
+        PPC::isVMRGLShuffleMask(PermMask.Val, 1, true) ||
+        PPC::isVMRGLShuffleMask(PermMask.Val, 2, true) ||
+        PPC::isVMRGLShuffleMask(PermMask.Val, 4, true) ||
+        PPC::isVMRGHShuffleMask(PermMask.Val, 1, true) ||
+        PPC::isVMRGHShuffleMask(PermMask.Val, 2, true) ||
+        PPC::isVMRGHShuffleMask(PermMask.Val, 4, true)) {
+      return Op;
+    }
+  }
+  
+  // Altivec has a variety of "shuffle immediates" that take two vector inputs
+  // and produce a fixed permutation.  If any of these match, do not lower to
+  // VPERM.
+  if (PPC::isVPKUWUMShuffleMask(PermMask.Val, false) ||
+      PPC::isVPKUHUMShuffleMask(PermMask.Val, false) ||
+      PPC::isVSLDOIShuffleMask(PermMask.Val, false) != -1 ||
+      PPC::isVMRGLShuffleMask(PermMask.Val, 1, false) ||
+      PPC::isVMRGLShuffleMask(PermMask.Val, 2, false) ||
+      PPC::isVMRGLShuffleMask(PermMask.Val, 4, false) ||
+      PPC::isVMRGHShuffleMask(PermMask.Val, 1, false) ||
+      PPC::isVMRGHShuffleMask(PermMask.Val, 2, false) ||
+      PPC::isVMRGHShuffleMask(PermMask.Val, 4, false))
+    return Op;
+  
+  // Check to see if this is a shuffle of 4-byte values.  If so, we can use our
+  // perfect shuffle table to emit an optimal matching sequence.
+  unsigned PFIndexes[4];
+  bool isFourElementShuffle = true;
+  for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
+    unsigned EltNo = 8;   // Start out undef.
+    for (unsigned j = 0; j != 4; ++j) {  // Intra-element byte.
+      if (PermMask.getOperand(i*4+j).getOpcode() == ISD::UNDEF)
+        continue;   // Undef, ignore it.
+      
+      unsigned ByteSource = 
+        cast<ConstantSDNode>(PermMask.getOperand(i*4+j))->getValue();
+      if ((ByteSource & 3) != j) {
+        isFourElementShuffle = false;
+        break;
+      }
+      
+      if (EltNo == 8) {
+        EltNo = ByteSource/4;
+      } else if (EltNo != ByteSource/4) {
+        isFourElementShuffle = false;
+        break;
+      }
+    }
+    PFIndexes[i] = EltNo;
+  }
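+  // For example (illustrative), the byte mask
+  //   <4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11>
+  // is a shuffle of 4-byte values with PFIndexes = {1,0,3,2}, giving a
+  // perfect shuffle table index of 1*9*9*9 + 0*9*9 + 3*9 + 2 = 758 below.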
+    
+  // If this shuffle can be expressed as a shuffle of 4-byte elements, use the 
+  // perfect shuffle vector to determine if it is cost effective to do this as
+  // discrete instructions, or whether we should use a vperm.
+  if (isFourElementShuffle) {
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex = 
+      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+    
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost  = (PFEntry >> 30);
+    
+    // Determining when to avoid vperm is tricky.  Many things affect the cost
+    // of vperm, particularly how many times the perm mask needs to be computed.
+    // For example, if the perm mask can be hoisted out of a loop or is already
+    // used (perhaps because there are multiple permutes with the same shuffle
+    // mask?) the vperm has a cost of 1.  OTOH, hoisting the permute mask out of
+    // the loop requires an extra register.
+    //
+    // As a compromise, we only emit discrete instructions if the shuffle can be
+    // generated in 3 or fewer operations.  When we have loop information 
+    // available, if this block is within a loop, we should avoid using vperm
+    // for 3-operation perms and use a constant pool load instead.
+    if (Cost < 3) 
+      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG);
+  }
+  
+  // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
+  // vector that will get spilled to the constant pool.
+  if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
+  
+  // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
+  // that it is in input element units, not in bytes.  Convert now.
+  MVT::ValueType EltVT = MVT::getVectorElementType(V1.getValueType());
+  unsigned BytesPerElement = MVT::getSizeInBits(EltVT)/8;
+  
+  SmallVector<SDOperand, 16> ResultMask;
+  for (unsigned i = 0, e = PermMask.getNumOperands(); i != e; ++i) {
+    unsigned SrcElt;
+    if (PermMask.getOperand(i).getOpcode() == ISD::UNDEF)
+      SrcElt = 0;
+    else 
+      SrcElt = cast<ConstantSDNode>(PermMask.getOperand(i))->getValue();
+    
+    for (unsigned j = 0; j != BytesPerElement; ++j)
+      ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,
+                                           MVT::i8));
+  }
+  
+  SDOperand VPermMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8,
+                                    &ResultMask[0], ResultMask.size());
+  return DAG.getNode(PPCISD::VPERM, V1.getValueType(), V1, V2, VPermMask);
+}
+
+/// getAltivecCompareInfo - Given an intrinsic, return false if it is not an
+/// altivec comparison.  If it is, return true and fill in CompareOpc/isDot
+/// with information about the intrinsic.
+static bool getAltivecCompareInfo(SDOperand Intrin, int &CompareOpc,
+                                  bool &isDot) {
+  unsigned IntrinsicID = cast<ConstantSDNode>(Intrin.getOperand(0))->getValue();
+  CompareOpc = -1;
+  isDot = false;
+  switch (IntrinsicID) {
+  default: return false;
+    // Comparison predicates.
+  case Intrinsic::ppc_altivec_vcmpbfp_p:  CompareOpc = 966; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc =   6; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc =  70; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break;
+    
+    // Normal Comparisons.
+  case Intrinsic::ppc_altivec_vcmpbfp:    CompareOpc = 966; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpeqfp:   CompareOpc = 198; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpequb:   CompareOpc =   6; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpequh:   CompareOpc =  70; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpequw:   CompareOpc = 134; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgefp:   CompareOpc = 454; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtfp:   CompareOpc = 710; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtsb:   CompareOpc = 774; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtsh:   CompareOpc = 838; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtsw:   CompareOpc = 902; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtub:   CompareOpc = 518; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtuh:   CompareOpc = 582; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtuw:   CompareOpc = 646; isDot = 0; break;
+  }
+  return true;
+}
+
+/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
+/// lower, do it, otherwise return null.
+static SDOperand LowerINTRINSIC_WO_CHAIN(SDOperand Op, SelectionDAG &DAG) {
+  // If this is a lowered altivec predicate compare, CompareOpc is set to the
+  // opcode number of the comparison.
+  int CompareOpc;
+  bool isDot;
+  if (!getAltivecCompareInfo(Op, CompareOpc, isDot))
+    return SDOperand();    // Don't custom lower most intrinsics.
+  
+  // If this is a non-dot comparison, make the VCMP node and we are done.
+  if (!isDot) {
+    SDOperand Tmp = DAG.getNode(PPCISD::VCMP, Op.getOperand(2).getValueType(),
+                                Op.getOperand(1), Op.getOperand(2),
+                                DAG.getConstant(CompareOpc, MVT::i32));
+    return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Tmp);
+  }
+  
+  // Create the PPCISD altivec 'dot' comparison node.
+  SDOperand Ops[] = {
+    Op.getOperand(2),  // LHS
+    Op.getOperand(3),  // RHS
+    DAG.getConstant(CompareOpc, MVT::i32)
+  };
+  std::vector<MVT::ValueType> VTs;
+  VTs.push_back(Op.getOperand(2).getValueType());
+  VTs.push_back(MVT::Flag);
+  SDOperand CompNode = DAG.getNode(PPCISD::VCMPo, VTs, Ops, 3);
+  
+  // Now that we have the comparison, emit a copy from the CR to a GPR.
+  // This is flagged to the above dot comparison.
+  SDOperand Flags = DAG.getNode(PPCISD::MFCR, MVT::i32,
+                                DAG.getRegister(PPC::CR6, MVT::i32),
+                                CompNode.getValue(1)); 
+  
+  // Unpack the result based on how the target uses it.
+  unsigned BitNo;   // Bit # of CR6.
+  bool InvertBit;   // Invert result?
+  switch (cast<ConstantSDNode>(Op.getOperand(1))->getValue()) {
+  default:  // Can't happen, don't crash on invalid number though.
+  case 0:   // Return the value of the EQ bit of CR6.
+    BitNo = 0; InvertBit = false;
+    break;
+  case 1:   // Return the inverted value of the EQ bit of CR6.
+    BitNo = 0; InvertBit = true;
+    break;
+  case 2:   // Return the value of the LT bit of CR6.
+    BitNo = 2; InvertBit = false;
+    break;
+  case 3:   // Return the inverted value of the LT bit of CR6.
+    BitNo = 2; InvertBit = true;
+    break;
+  }
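+  // With this encoding, BitNo = 0 (EQ) shifts right by 8-(3-0) = 5 and
+  // BitNo = 2 (LT) shifts right by 8-(3-2) = 7 before masking with 1 below.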
+  
+  // Shift the bit into the low position.
+  Flags = DAG.getNode(ISD::SRL, MVT::i32, Flags,
+                      DAG.getConstant(8-(3-BitNo), MVT::i32));
+  // Isolate the bit.
+  Flags = DAG.getNode(ISD::AND, MVT::i32, Flags,
+                      DAG.getConstant(1, MVT::i32));
+  
+  // If we are supposed to, toggle the bit.
+  if (InvertBit)
+    Flags = DAG.getNode(ISD::XOR, MVT::i32, Flags,
+                        DAG.getConstant(1, MVT::i32));
+  return Flags;
+}
+
+static SDOperand LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG) {
+  // Create a stack slot that is 16-byte aligned.
+  MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+  int FrameIdx = FrameInfo->CreateStackObject(16, 16);
+  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  SDOperand FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+  
+  // Store the input value into Value#0 of the stack slot.
+  SDOperand Store = DAG.getStore(DAG.getEntryNode(),
+                                 Op.getOperand(0), FIdx, NULL, 0);
+  // Load it out.
+  return DAG.getLoad(Op.getValueType(), Store, FIdx, NULL, 0);
+}
+
+static SDOperand LowerMUL(SDOperand Op, SelectionDAG &DAG) {
+  if (Op.getValueType() == MVT::v4i32) {
+    SDOperand LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+    
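+    // The sequence below computes each 32-bit product as
+    //   lo(LHS)*lo(RHS) + ((hi(LHS)*lo(RHS) + lo(LHS)*hi(RHS)) << 16),
+    // which equals LHS*RHS modulo 2^32; the rotate by 16 pairs each halfword
+    // of LHS with the opposite halfword of RHS for the multiply-sum.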
+    SDOperand Zero  = BuildSplatI(  0, 1, MVT::v4i32, DAG);
+    SDOperand Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG); // +16 as shift amt.
+    
+    SDOperand RHSSwap =   // = vrlw RHS, 16
+      BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG);
+    
+    // Shrinkify inputs to v8i16.
+    LHS = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, LHS);
+    RHS = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, RHS);
+    RHSSwap = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, RHSSwap);
+    
+    // Low parts multiplied together, generating 32-bit results (we ignore the
+    // top parts).
+    SDOperand LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
+                                        LHS, RHS, DAG, MVT::v4i32);
+    
+    SDOperand HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
+                                        LHS, RHSSwap, Zero, DAG, MVT::v4i32);
+    // Shift the high parts up 16 bits.
+    HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd, Neg16, DAG);
+    return DAG.getNode(ISD::ADD, MVT::v4i32, LoProd, HiProd);
+  } else if (Op.getValueType() == MVT::v8i16) {
+    SDOperand LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+    
+    SDOperand Zero = BuildSplatI(0, 1, MVT::v8i16, DAG);
+
+    return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
+                            LHS, RHS, Zero, DAG);
+  } else if (Op.getValueType() == MVT::v16i8) {
+    SDOperand LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+    
+    // Multiply the even 8-bit parts, producing 16-bit sums.
+    SDOperand EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
+                                           LHS, RHS, DAG, MVT::v8i16);
+    EvenParts = DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, EvenParts);
+    
+    // Multiply the odd 8-bit parts, producing 16-bit sums.
+    SDOperand OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
+                                          LHS, RHS, DAG, MVT::v8i16);
+    OddParts = DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, OddParts);
+    
+    // Merge the results together.
+    SDOperand Ops[16];
+    for (unsigned i = 0; i != 8; ++i) {
+      Ops[i*2  ] = DAG.getConstant(2*i+1, MVT::i8);
+      Ops[i*2+1] = DAG.getConstant(2*i+1+16, MVT::i8);
+    }
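+    // Each 16-bit product keeps its low byte in the odd byte position of its
+    // lane (big-endian layout), so taking bytes 2*i+1 and 2*i+1+16 interleaves
+    // the truncated 8-bit products back into their original byte order.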
+    return DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v16i8, EvenParts, OddParts,
+                       DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8, Ops, 16));
+  } else {
+    assert(0 && "Unknown mul to lower!");
+    abort();
+  }
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDOperand PPCTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
+  switch (Op.getOpcode()) {
+  default: assert(0 && "Wasn't expecting to be able to lower this!"); 
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
+  case ISD::SETCC:              return LowerSETCC(Op, DAG);
+  case ISD::VASTART:            
+    return LowerVASTART(Op, DAG, VarArgsFrameIndex, VarArgsStackOffset,
+                        VarArgsNumGPR, VarArgsNumFPR, PPCSubTarget);
+  
+  case ISD::VAARG:            
+    return LowerVAARG(Op, DAG, VarArgsFrameIndex, VarArgsStackOffset,
+                      VarArgsNumGPR, VarArgsNumFPR, PPCSubTarget);
+
+  case ISD::FORMAL_ARGUMENTS:
+    return LowerFORMAL_ARGUMENTS(Op, DAG, VarArgsFrameIndex, 
+                                 VarArgsStackOffset, VarArgsNumGPR,
+                                 VarArgsNumFPR, PPCSubTarget);
+
+  case ISD::CALL:               return LowerCALL(Op, DAG, PPCSubTarget);
+  case ISD::RET:                return LowerRET(Op, DAG, getTargetMachine());
+  case ISD::STACKRESTORE:       return LowerSTACKRESTORE(Op, DAG, PPCSubTarget);
+  case ISD::DYNAMIC_STACKALLOC:
+    return LowerDYNAMIC_STACKALLOC(Op, DAG, PPCSubTarget);
+    
+  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
+  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
+  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
+
+  // Lower 64-bit shifts.
+  case ISD::SHL_PARTS:          return LowerSHL_PARTS(Op, DAG);
+  case ISD::SRL_PARTS:          return LowerSRL_PARTS(Op, DAG);
+  case ISD::SRA_PARTS:          return LowerSRA_PARTS(Op, DAG);
+
+  // Vector-related lowering.
+  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
+  case ISD::MUL:                return LowerMUL(Op, DAG);
+  
+  // Frame & Return address.  RETURNADDR is currently unimplemented.
+  case ISD::RETURNADDR:         break;
+  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+  }
+  return SDOperand();
+}
+
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+PPCTargetLowering::InsertAtEndOfBasicBlock(MachineInstr *MI,
+                                           MachineBasicBlock *BB) {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  assert((MI->getOpcode() == PPC::SELECT_CC_I4 ||
+          MI->getOpcode() == PPC::SELECT_CC_I8 ||
+          MI->getOpcode() == PPC::SELECT_CC_F4 ||
+          MI->getOpcode() == PPC::SELECT_CC_F8 ||
+          MI->getOpcode() == PPC::SELECT_CC_VRRC) &&
+         "Unexpected instr type to insert");
+  
+  // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  ilist<MachineBasicBlock>::iterator It = BB;
+  ++It;
+  
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   cmpTY ccX, r1, r2
+  //   bCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineBasicBlock *copy0MBB = new MachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = new MachineBasicBlock(LLVM_BB);
+  unsigned SelectPred = MI->getOperand(4).getImm();
+  BuildMI(BB, TII->get(PPC::BCC))
+    .addImm(SelectPred).addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
+  MachineFunction *F = BB->getParent();
+  F->getBasicBlockList().insert(It, copy0MBB);
+  F->getBasicBlockList().insert(It, sinkMBB);
+  // Update machine-CFG edges by first adding all successors of the current
+  // block to the new block which will contain the Phi node for the select.
+  for(MachineBasicBlock::succ_iterator i = BB->succ_begin(), 
+      e = BB->succ_end(); i != e; ++i)
+    sinkMBB->addSuccessor(*i);
+  // Next, remove all successors of the current block, and add the true
+  // and fallthrough blocks as its successors.
+  while(!BB->succ_empty())
+    BB->removeSuccessor(BB->succ_begin());
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+  
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+  
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+  
+  //  sinkMBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BB = sinkMBB;
+  BuildMI(BB, TII->get(PPC::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB)
+    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+  delete MI;   // The pseudo instruction is gone now.
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+SDOperand PPCTargetLowering::PerformDAGCombine(SDNode *N, 
+                                               DAGCombinerInfo &DCI) const {
+  TargetMachine &TM = getTargetMachine();
+  SelectionDAG &DAG = DCI.DAG;
+  switch (N->getOpcode()) {
+  default: break;
+  case PPCISD::SHL:
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+      if (C->getValue() == 0)   // 0 << V -> 0.
+        return N->getOperand(0);
+    }
+    break;
+  case PPCISD::SRL:
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+      if (C->getValue() == 0)   // 0 >>u V -> 0.
+        return N->getOperand(0);
+    }
+    break;
+  case PPCISD::SRA:
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+      if (C->getValue() == 0 ||   //  0 >>s V -> 0.
+          C->isAllOnesValue())    // -1 >>s V -> -1.
+        return N->getOperand(0);
+    }
+    break;
+    
+  case ISD::SINT_TO_FP:
+    if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
+      if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) {
+        // Turn (sint_to_fp (fp_to_sint X)) -> fctidz/fcfid without load/stores.
+        // We allow the src/dst to be either f32/f64, but the intermediate
+        // type must be i64.
+        if (N->getOperand(0).getValueType() == MVT::i64) {
+          SDOperand Val = N->getOperand(0).getOperand(0);
+          if (Val.getValueType() == MVT::f32) {
+            Val = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Val);
+            DCI.AddToWorklist(Val.Val);
+          }
+            
+          Val = DAG.getNode(PPCISD::FCTIDZ, MVT::f64, Val);
+          DCI.AddToWorklist(Val.Val);
+          Val = DAG.getNode(PPCISD::FCFID, MVT::f64, Val);
+          DCI.AddToWorklist(Val.Val);
+          if (N->getValueType(0) == MVT::f32) {
+            Val = DAG.getNode(ISD::FP_ROUND, MVT::f32, Val);
+            DCI.AddToWorklist(Val.Val);
+          }
+          return Val;
+        } else if (N->getOperand(0).getValueType() == MVT::i32) {
+          // If the intermediate type is i32, we can avoid the load/store here
+          // too.
+        }
+      }
+    }
+    break;
+  case ISD::STORE:
+    // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
+    if (TM.getSubtarget<PPCSubtarget>().hasSTFIWX() &&
+        N->getOperand(1).getOpcode() == ISD::FP_TO_SINT &&
+        N->getOperand(1).getValueType() == MVT::i32) {
+      SDOperand Val = N->getOperand(1).getOperand(0);
+      if (Val.getValueType() == MVT::f32) {
+        Val = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Val);
+        DCI.AddToWorklist(Val.Val);
+      }
+      Val = DAG.getNode(PPCISD::FCTIWZ, MVT::f64, Val);
+      DCI.AddToWorklist(Val.Val);
+
+      Val = DAG.getNode(PPCISD::STFIWX, MVT::Other, N->getOperand(0), Val,
+                        N->getOperand(2), N->getOperand(3));
+      DCI.AddToWorklist(Val.Val);
+      return Val;
+    }
+    
+    // Turn STORE (BSWAP) -> sthbrx/stwbrx.
+    if (N->getOperand(1).getOpcode() == ISD::BSWAP &&
+        N->getOperand(1).Val->hasOneUse() &&
+        (N->getOperand(1).getValueType() == MVT::i32 ||
+         N->getOperand(1).getValueType() == MVT::i16)) {
+      SDOperand BSwapOp = N->getOperand(1).getOperand(0);
+      // Do an any-extend to 32-bits if this is a half-word input.
+      if (BSwapOp.getValueType() == MVT::i16)
+        BSwapOp = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, BSwapOp);
+
+      return DAG.getNode(PPCISD::STBRX, MVT::Other, N->getOperand(0), BSwapOp,
+                         N->getOperand(2), N->getOperand(3),
+                         DAG.getValueType(N->getOperand(1).getValueType()));
+    }
+    break;
+  case ISD::BSWAP:
+    // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
+    if (ISD::isNON_EXTLoad(N->getOperand(0).Val) &&
+        N->getOperand(0).hasOneUse() &&
+        (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16)) {
+      SDOperand Load = N->getOperand(0);
+      LoadSDNode *LD = cast<LoadSDNode>(Load);
+      // Create the byte-swapping load.
+      std::vector<MVT::ValueType> VTs;
+      VTs.push_back(MVT::i32);
+      VTs.push_back(MVT::Other);
+      SDOperand SV = DAG.getSrcValue(LD->getSrcValue(), LD->getSrcValueOffset());
+      SDOperand Ops[] = {
+        LD->getChain(),    // Chain
+        LD->getBasePtr(),  // Ptr
+        SV,                // SrcValue
+        DAG.getValueType(N->getValueType(0)) // VT
+      };
+      SDOperand BSLoad = DAG.getNode(PPCISD::LBRX, VTs, Ops, 4);
+
+      // If this is an i16 load, insert the truncate.  
+      SDOperand ResVal = BSLoad;
+      if (N->getValueType(0) == MVT::i16)
+        ResVal = DAG.getNode(ISD::TRUNCATE, MVT::i16, BSLoad);
+      
+      // First, combine the bswap away.  This makes the value produced by the
+      // load dead.
+      DCI.CombineTo(N, ResVal);
+
+      // Next, combine the load away; we give it a bogus result value but a real
+      // chain result.  The result value is dead because the bswap is dead.
+      DCI.CombineTo(Load.Val, ResVal, BSLoad.getValue(1));
+      
+      // Return N so it doesn't get rechecked!
+      return SDOperand(N, 0);
+    }
+    
+    break;
+  case PPCISD::VCMP: {
+    // If a VCMPo node already exists with exactly the same operands as this
+    // node, use its result instead of this node (VCMPo computes both a CR6 and
+    // a normal output).
+    //
+    if (!N->getOperand(0).hasOneUse() &&
+        !N->getOperand(1).hasOneUse() &&
+        !N->getOperand(2).hasOneUse()) {
+      
+      // Scan all of the users of the LHS, looking for VCMPo's that match.
+      SDNode *VCMPoNode = 0;
+      
+      SDNode *LHSN = N->getOperand(0).Val;
+      for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
+           UI != E; ++UI)
+        if ((*UI)->getOpcode() == PPCISD::VCMPo &&
+            (*UI)->getOperand(1) == N->getOperand(1) &&
+            (*UI)->getOperand(2) == N->getOperand(2) &&
+            (*UI)->getOperand(0) == N->getOperand(0)) {
+          VCMPoNode = *UI;
+          break;
+        }
+      
+      // If there is no VCMPo node, or if its flag result (value #1) is unused,
+      // don't transform this.
+      if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1))
+        break;
+        
+      // Look at the (necessarily single) use of the flag value.  If it has a 
+      // chain, this transformation is more complex.  Note that multiple things
+      // could use the value result, which we should ignore.
+      SDNode *FlagUser = 0;
+      for (SDNode::use_iterator UI = VCMPoNode->use_begin(); 
+           FlagUser == 0; ++UI) {
+        assert(UI != VCMPoNode->use_end() && "Didn't find user!");
+        SDNode *User = *UI;
+        for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
+          if (User->getOperand(i) == SDOperand(VCMPoNode, 1)) {
+            FlagUser = User;
+            break;
+          }
+        }
+      }
+      
+      // If the user is a MFCR instruction, we know this is safe.  Otherwise we
+      // give up for right now.
+      if (FlagUser->getOpcode() == PPCISD::MFCR)
+        return SDOperand(VCMPoNode, 0);
+    }
+    break;
+  }
+  case ISD::BR_CC: {
+    // If this is a branch on an altivec predicate comparison, lower this so
+    // that we don't have to do a MFCR: instead, branch directly on CR6.  This
+    // lowering is done pre-legalize, because the legalizer lowers the predicate
+    // compare down to code that is difficult to reassemble.
+    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+    SDOperand LHS = N->getOperand(2), RHS = N->getOperand(3);
+    int CompareOpc;
+    bool isDot;
+    
+    if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+        isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) &&
+        getAltivecCompareInfo(LHS, CompareOpc, isDot)) {
+      assert(isDot && "Can't compare against a vector result!");
+      
+      // If this is a comparison against something other than 0/1, then we know
+      // that the condition is never/always true.
+      unsigned Val = cast<ConstantSDNode>(RHS)->getValue();
+      if (Val != 0 && Val != 1) {
+        if (CC == ISD::SETEQ)      // Cond never true, remove branch.
+          return N->getOperand(0);
+        // Always !=, turn it into an unconditional branch.
+        return DAG.getNode(ISD::BR, MVT::Other, 
+                           N->getOperand(0), N->getOperand(4));
+      }
+    
+      bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
+      
+      // Create the PPCISD altivec 'dot' comparison node.
+      std::vector<MVT::ValueType> VTs;
+      SDOperand Ops[] = {
+        LHS.getOperand(2),  // LHS of compare
+        LHS.getOperand(3),  // RHS of compare
+        DAG.getConstant(CompareOpc, MVT::i32)
+      };
+      VTs.push_back(LHS.getOperand(2).getValueType());
+      VTs.push_back(MVT::Flag);
+      SDOperand CompNode = DAG.getNode(PPCISD::VCMPo, VTs, Ops, 3);
+      
+      // Unpack the result based on how the target uses it.
+      PPC::Predicate CompOpc;
+      switch (cast<ConstantSDNode>(LHS.getOperand(1))->getValue()) {
+      default:  // Can't happen, don't crash on invalid number though.
+      case 0:   // Branch on the value of the EQ bit of CR6.
+        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
+        break;
+      case 1:   // Branch on the inverted value of the EQ bit of CR6.
+        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
+        break;
+      case 2:   // Branch on the value of the LT bit of CR6.
+        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
+        break;
+      case 3:   // Branch on the inverted value of the LT bit of CR6.
+        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
+        break;
+      }
+
+      return DAG.getNode(PPCISD::COND_BRANCH, MVT::Other, N->getOperand(0),
+                         DAG.getConstant(CompOpc, MVT::i32),
+                         DAG.getRegister(PPC::CR6, MVT::i32),
+                         N->getOperand(4), CompNode.getValue(1));
+    }
+    break;
+  }
+  }
+  
+  return SDOperand();
+}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+void PPCTargetLowering::computeMaskedBitsForTargetNode(const SDOperand Op,
+                                                       uint64_t Mask,
+                                                       uint64_t &KnownZero, 
+                                                       uint64_t &KnownOne,
+                                                       const SelectionDAG &DAG,
+                                                       unsigned Depth) const {
+  KnownZero = 0;
+  KnownOne = 0;
+  switch (Op.getOpcode()) {
+  default: break;
+  case PPCISD::LBRX: {
+    // lhbrx is known to have the top bits cleared out.
+    if (cast<VTSDNode>(Op.getOperand(3))->getVT() == MVT::i16)
+      KnownZero = 0xFFFF0000;
+    break;
+  }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    switch (cast<ConstantSDNode>(Op.getOperand(0))->getValue()) {
+    default: break;
+    case Intrinsic::ppc_altivec_vcmpbfp_p:
+    case Intrinsic::ppc_altivec_vcmpeqfp_p:
+    case Intrinsic::ppc_altivec_vcmpequb_p:
+    case Intrinsic::ppc_altivec_vcmpequh_p:
+    case Intrinsic::ppc_altivec_vcmpequw_p:
+    case Intrinsic::ppc_altivec_vcmpgefp_p:
+    case Intrinsic::ppc_altivec_vcmpgtfp_p:
+    case Intrinsic::ppc_altivec_vcmpgtsb_p:
+    case Intrinsic::ppc_altivec_vcmpgtsh_p:
+    case Intrinsic::ppc_altivec_vcmpgtsw_p:
+    case Intrinsic::ppc_altivec_vcmpgtub_p:
+    case Intrinsic::ppc_altivec_vcmpgtuh_p:
+    case Intrinsic::ppc_altivec_vcmpgtuw_p:
+      KnownZero = ~1U;  // All bits but the low one are known to be zero.
+      break;
+    }        
+  }
+  }
+}
+
+
+/// getConstraintType - Given a constraint, return the type of
+/// constraint it is for this target.
+PPCTargetLowering::ConstraintType 
+PPCTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default: break;
+    case 'b':
+    case 'r':
+    case 'f':
+    case 'v':
+    case 'y':
+      return C_RegisterClass;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass*> 
+PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                MVT::ValueType VT) const {
+  if (Constraint.size() == 1) {
+    // GCC RS6000 Constraint Letters
+    switch (Constraint[0]) {
+    case 'b':   // R1-R31
+    case 'r':   // R0-R31
+      if (VT == MVT::i64 && PPCSubTarget.isPPC64())
+        return std::make_pair(0U, PPC::G8RCRegisterClass);
+      return std::make_pair(0U, PPC::GPRCRegisterClass);
+    case 'f':
+      if (VT == MVT::f32)
+        return std::make_pair(0U, PPC::F4RCRegisterClass);
+      else if (VT == MVT::f64)
+        return std::make_pair(0U, PPC::F8RCRegisterClass);
+      break;
+    case 'v': 
+      return std::make_pair(0U, PPC::VRRCRegisterClass);
+    case 'y':   // crrc
+      return std::make_pair(0U, PPC::CRRCRegisterClass);
+    }
+  }
+  
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+
+// isOperandValidForConstraint
+SDOperand PPCTargetLowering::
+isOperandValidForConstraint(SDOperand Op, char Letter, SelectionDAG &DAG) {
+  switch (Letter) {
+  default: break;
+  case 'I':
+  case 'J':
+  case 'K':
+  case 'L':
+  case 'M':
+  case 'N':
+  case 'O':
+  case 'P': {
+    ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
+    if (!CST) return SDOperand(0, 0); // Must be an immediate to match.
+    unsigned Value = CST->getValue();
+    switch (Letter) {
+    default: assert(0 && "Unknown constraint letter!");
+    case 'I':  // "I" is a signed 16-bit constant.
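+      // e.g. Value = 0xFFFF8000 is -32768 both as (short) and as (int), so it
+      // matches 'I'; Value = 40000 becomes -25536 as a short and does not.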
+      if ((short)Value == (int)Value)
+        return DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'J':  // "J" is a constant with only the high-order 16 bits nonzero.
+    case 'L':  // "L" is a signed 16-bit constant shifted left 16 bits.
+      if ((short)Value == 0)
+        return DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'K':  // "K" is a constant with only the low-order 16 bits nonzero.
+      if ((Value >> 16) == 0)
+        return DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'M':  // "M" is a constant that is greater than 31.
+      if (Value > 31)
+        return DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'N':  // "N" is a positive constant that is an exact power of two.
+      if ((int)Value > 0 && isPowerOf2_32(Value))
+        return DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'O':  // "O" is the constant zero. 
+      if (Value == 0)
+        return DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'P':  // "P" is a constant whose negation is a signed 16-bit constant.
+      if ((short)-Value == (int)-Value)
+        return DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    }
+    break;
+  }
+  }
+  
+  // Handle standard constraint letters.
+  return TargetLowering::isOperandValidForConstraint(Op, Letter, DAG);
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM, 
+                                              const Type *Ty) const {
+  // FIXME: PPC does not allow r+i addressing modes for vectors!
+  
+  // PPC allows a sign-extended 16-bit immediate field.
+  if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
+    return false;
+  
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+  
+  // PPC only supports r+r scaling.
+  switch (AM.Scale) {
+  case 0:  // "r+i" or just "i", depending on HasBaseReg.
+    break;
+  case 1:
+    if (AM.HasBaseReg && AM.BaseOffs)  // "r+r+i" is not allowed.
+      return false;
+    // Otherwise we have r+r or r+i.
+    break;
+  case 2:
+    if (AM.HasBaseReg || AM.BaseOffs)  // 2*r+r  or  2*r+i is not allowed.
+      return false;
+    // Allow 2*r as r+r.
+    break;
+  default:
+    // No other scales are supported.
+    return false;
+  }
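+  // Reaching this point means the mode is [imm], [r], [r+imm16], [r+r], or
+  // [2*r] (treated as r+r), all of which PPC can handle.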
+  
+  return true;
+}
+
+/// isLegalAddressImmediate - Return true if the integer value can be used
+/// as the offset of the target addressing mode for load / store of the
+/// given type.
+bool PPCTargetLowering::isLegalAddressImmediate(int64_t V,const Type *Ty) const{
+  // PPC allows a sign-extended 16-bit immediate field.
+  return (V > -(1 << 16) && V < (1 << 16)-1);
+}
+
+bool PPCTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const {
+  return false; 
+}
+
+SDOperand PPCTargetLowering::LowerFRAMEADDR(SDOperand Op, SelectionDAG &DAG)
+{
+  // Depths > 0 not supported yet! 
+  if (cast<ConstantSDNode>(Op.getOperand(0))->getValue() > 0)
+    return SDOperand();
+  
+  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = PtrVT == MVT::i64;
+  
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  bool is31 = (NoFramePointerElim || MFI->hasVarSizedObjects()) 
+                  && MFI->getStackSize();
+
+  if (isPPC64)
+    return DAG.getCopyFromReg(DAG.getEntryNode(), is31 ? PPC::X31 : PPC::X1,
+      MVT::i64);
+  else
+    return DAG.getCopyFromReg(DAG.getEntryNode(), is31 ? PPC::R31 : PPC::R1,
+      MVT::i32);
+}
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
new file mode 100644
index 0000000..0581865
--- /dev/null
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -0,0 +1,263 @@
+//===-- PPCISelLowering.h - PPC32 DAG Lowering Interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that PPC uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
+#define LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "PPC.h"
+#include "PPCSubtarget.h"
+
+namespace llvm {
+  namespace PPCISD {
+    enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END+PPC::INSTRUCTION_LIST_END,
+
+      /// FSEL - Traditional three-operand fsel node.
+      ///
+      FSEL,
+      
+      /// FCFID - The FCFID instruction, taking an f64 operand and producing
+      /// and f64 value containing the FP representation of the integer that
+      /// was temporarily in the f64 operand.
+      FCFID,
+      
+      /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64 
+      /// operand, producing an f64 value containing the integer representation
+      /// of that FP value.
+      FCTIDZ, FCTIWZ,
+      
+      /// STFIWX - The STFIWX instruction.  The first operand is an input token
+      /// chain, then an f64 value to store, then an address to store it to,
+      /// then a SRCVALUE for the address.
+      STFIWX,
+      
+      // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
+      // three v4f32 operands and producing a v4f32 result.
+      VMADDFP, VNMSUBFP,
+      
+      /// VPERM - The PPC VPERM Instruction.
+      ///
+      VPERM,
+      
+      /// Hi/Lo - These represent the high and low 16-bit parts of a global
+      /// address respectively.  These nodes have two operands, the first of
+      /// which must be a TargetGlobalAddress, and the second of which must be a
+      /// Constant.  Selected naively, these turn into 'lis G+C' and 'li G+C',
+      /// though these are usually folded into other nodes.
+      Hi, Lo,
+      
+      /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX)
+      /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
+      /// compute an allocation on the stack.
+      DYNALLOC,
+      
+      /// GlobalBaseReg - On Darwin, this node represents the result of the mflr
+      /// at function entry, used for PIC code.
+      GlobalBaseReg,
+      
+      /// These nodes represent the 32-bit PPC shifts that operate on 6-bit
+      /// shift amounts.  These nodes are generated by the multi-precision shift
+      /// code.
+      SRL, SRA, SHL,
+      
+      /// EXTSW_32 - This is the EXTSW instruction for use with "32-bit"
+      /// registers.
+      EXTSW_32,
+
+      /// STD_32 - This is the STD instruction for use with "32-bit" registers.
+      STD_32,
+      
+      /// CALL - A direct function call.
+      CALL_Macho, CALL_ELF,
+      
+      /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
+      /// MTCTR instruction.
+      MTCTR,
+      
+      /// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a
+      /// BCTRL instruction.
+      BCTRL_Macho, BCTRL_ELF,
+      
+      /// Return with a flag operand, matched by 'blr'
+      RET_FLAG,
+      
+      /// R32 = MFCR(CRREG, INFLAG) - Represents the MFCR/MFOCRF instructions.
+      /// This copies the bits corresponding to the specified CRREG into the
+      /// resultant GPR.  Bits corresponding to other CR regs are undefined.
+      MFCR,
+
+      /// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP*
+      /// instructions.  For lack of a better number, we use the opcode number
+      /// encoding for the OPC field to identify the compare.  For example, 838
+      /// is VCMPGTSH.
+      VCMP,
+      
+      /// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the
+      /// altivec VCMP*o instructions.  For lack of a better number, we use the
+      /// opcode number encoding for the OPC field to identify the compare.  For
+      /// example, 838 is VCMPGTSH.
+      VCMPo,
+      
+      /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This
+      /// corresponds to the COND_BRANCH pseudo instruction.  CRRC is the
+      /// condition register to branch on, OPC is the branch opcode to use (e.g.
+      /// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is
+      /// an optional input flag argument.
+      COND_BRANCH,
+      
+      /// CHAIN = STBRX CHAIN, GPRC, Ptr, SRCVALUE, Type - This is a 
+      /// byte-swapping store instruction.  It byte-swaps the low "Type" bits of
+      /// the GPRC input, then stores it through Ptr.  Type can be either i16 or
+      /// i32.
+      STBRX, 
+      
+      /// GPRC, CHAIN = LBRX CHAIN, Ptr, SRCVALUE, Type - This is a 
+      /// byte-swapping load instruction.  It loads "Type" bits, byte swaps it,
+      /// then puts it in the bottom bits of the GPRC.  TYPE can be either i16
+      /// or i32.
+      LBRX
+    };
+  }
+
+  /// Define some predicates that are used for node matching.
+  namespace PPC {
+    /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
+    /// VPKUHUM instruction.
+    bool isVPKUHUMShuffleMask(SDNode *N, bool isUnary);
+    
+    /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
+    /// VPKUWUM instruction.
+    bool isVPKUWUMShuffleMask(SDNode *N, bool isUnary);
+
+    /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
+    /// a VMRGL* instruction with the specified unit size (1, 2 or 4 bytes).
+    bool isVMRGLShuffleMask(SDNode *N, unsigned UnitSize, bool isUnary);
+
+    /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
+    /// a VMRGH* instruction with the specified unit size (1, 2 or 4 bytes).
+    bool isVMRGHShuffleMask(SDNode *N, unsigned UnitSize, bool isUnary);
+    
+    /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
+    /// amount, otherwise return -1.
+    int isVSLDOIShuffleMask(SDNode *N, bool isUnary);
+    
+    /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
+    /// specifies a splat of a single element that is suitable for input to
+    /// VSPLTB/VSPLTH/VSPLTW.
+    bool isSplatShuffleMask(SDNode *N, unsigned EltSize);
+    
+    /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
+    /// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
+    unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize);
+    
+    /// get_VSPLTI_elt - If this is a build_vector of constants which can be
+    /// formed by using a vspltis[bhw] instruction of the specified element
+    /// size, return the constant being splatted.  The ByteSize field indicates
+    /// the number of bytes of each element [124] -> [bhw].
+    SDOperand get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
+  }
+  
+  class PPCTargetLowering : public TargetLowering {
+    int VarArgsFrameIndex;            // FrameIndex for start of varargs area.
+    int VarArgsStackOffset;           // StackOffset for start of stack
+                                      // arguments.
+    unsigned VarArgsNumGPR;           // Index of the first unused integer
+                                      // register for parameter passing.
+    unsigned VarArgsNumFPR;           // Index of the first unused double
+                                      // register for parameter passing.
+    int ReturnAddrIndex;              // FrameIndex for return slot.
+    const PPCSubtarget &PPCSubTarget;
+  public:
+    PPCTargetLowering(PPCTargetMachine &TM);
+    
+    /// getTargetNodeName() - This method returns the name of a target specific
+    /// DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    /// getPreIndexedAddressParts - Returns true, and sets the base pointer,
+    /// offset, and addressing mode by reference, if the node's address can be
+    /// legally represented as a pre-indexed load / store address.
+    virtual bool getPreIndexedAddressParts(SDNode *N, SDOperand &Base,
+                                           SDOperand &Offset,
+                                           ISD::MemIndexedMode &AM,
+                                           SelectionDAG &DAG);
+    
+    /// SelectAddressRegReg - Given the specified address, check to see if it
+    /// can be represented as an indexed [r+r] operation.  Returns false if it
+    /// can be more efficiently represented with [r+imm].
+    bool SelectAddressRegReg(SDOperand N, SDOperand &Base, SDOperand &Index,
+                             SelectionDAG &DAG);
+    
+    /// SelectAddressRegImm - Returns true if the address N can be represented
+    /// by a base register plus a signed 16-bit displacement [r+imm], and if it
+    /// is not better represented as reg+reg.
+    bool SelectAddressRegImm(SDOperand N, SDOperand &Disp, SDOperand &Base,
+                             SelectionDAG &DAG);
+    
+    /// SelectAddressRegRegOnly - Given the specified address, force it to be
+    /// represented as an indexed [r+r] operation.
+    bool SelectAddressRegRegOnly(SDOperand N, SDOperand &Base, SDOperand &Index,
+                                 SelectionDAG &DAG);
+
+    /// SelectAddressRegImmShift - Returns true if the address N can be
+    /// represented by a base register plus a signed 14-bit displacement
+    /// [r+imm*4].  Suitable for use by STD and friends.
+    bool SelectAddressRegImmShift(SDOperand N, SDOperand &Disp, SDOperand &Base,
+                                  SelectionDAG &DAG);
+
+    
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    ///
+    virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
+    
+    virtual SDOperand PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+    
+    virtual void computeMaskedBitsForTargetNode(const SDOperand Op,
+                                                uint64_t Mask,
+                                                uint64_t &KnownZero, 
+                                                uint64_t &KnownOne,
+                                                const SelectionDAG &DAG,
+                                                unsigned Depth = 0) const;
+
+    virtual MachineBasicBlock *InsertAtEndOfBasicBlock(MachineInstr *MI,
+                                                       MachineBasicBlock *MBB);
+    
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+    std::pair<unsigned, const TargetRegisterClass*> 
+      getRegForInlineAsmConstraint(const std::string &Constraint,
+                                   MVT::ValueType VT) const;
+    SDOperand isOperandValidForConstraint(SDOperand Op, char ConstraintLetter,
+                                          SelectionDAG &DAG);
+
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+    
+    /// isLegalAddressImmediate - Return true if the integer value can be used
+    /// as the offset of the target addressing mode for load / store of the
+    /// given type.
+    virtual bool isLegalAddressImmediate(int64_t V, const Type *Ty) const;
+
+    /// isLegalAddressImmediate - Return true if the GlobalValue can be used as
+    /// the offset of the target addressing mode.
+    virtual bool isLegalAddressImmediate(GlobalValue *GV) const;
+
+    SDOperand LowerFRAMEADDR(SDOperand Op, SelectionDAG &DAG);
+  };
+}
+
+#endif   // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
new file mode 100644
index 0000000..a7e25cf
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -0,0 +1,590 @@
+//===- PPCInstr64Bit.td - The PowerPC 64-bit Support -------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PowerPC 64-bit instructions.  These patterns are used
+// both when in ppc64 mode and when in "use 64-bit extensions in 32-bit" mode.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 64-bit operands.
+//
+def s16imm64 : Operand<i64> {
+  let PrintMethod = "printS16ImmOperand";
+}
+def u16imm64 : Operand<i64> {
+  let PrintMethod = "printU16ImmOperand";
+}
+def symbolHi64 : Operand<i64> {
+  let PrintMethod = "printSymbolHi";
+}
+def symbolLo64 : Operand<i64> {
+  let PrintMethod = "printSymbolLo";
+}
+
+//===----------------------------------------------------------------------===//
+// 64-bit transformation functions.
+//
+
+def SHL64 : SDNodeXForm<imm, [{
+  // Transformation function: 63 - imm
+  return getI32Imm(63 - N->getValue());
+}]>;
+
+def SRL64 : SDNodeXForm<imm, [{
+  // Transformation function: 64 - imm
+  return N->getValue() ? getI32Imm(64 - N->getValue()) : getI32Imm(0);
+}]>;
+
+def HI32_48 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return getI32Imm((unsigned short)(N->getValue() >> 32));
+}]>;
+
+def HI48_64 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return getI32Imm((unsigned short)(N->getValue() >> 48));
+}]>;
+
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions.
+//
+
+def IMPLICIT_DEF_G8RC : Pseudo<(ops G8RC:$rD), "; IMPLICIT_DEF_G8RC $rD",
+                              [(set G8RC:$rD, (undef))]>;
+
+
+//===----------------------------------------------------------------------===//
+// Calls.
+//
+
+let Defs = [LR8] in
+  def MovePCtoLR8 : Pseudo<(ops piclabel:$label), "bl $label", []>,
+                    PPC970_Unit_BRU;
+
+// Macho ABI Calls.
+let isCall = 1, noResults = 1, PPC970_Unit = 7, 
+  // All calls clobber the PPC64 non-callee saved registers.
+  Defs = [X0,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,
+          F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,
+          V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,
+          LR8,CTR8,
+          CR0,CR1,CR5,CR6,CR7] in {
+  // Convenient aliases for call instructions
+  def BL8_Macho  : IForm<18, 0, 1,
+                         (ops calltarget:$func, variable_ops), 
+                         "bl $func", BrB, []>;  // See Pat patterns below.
+                            
+  def BLA8_Macho : IForm<18, 1, 1,
+                         (ops aaddr:$func, variable_ops),
+                         "bla $func", BrB, [(PPCcall_Macho (i64 imm:$func))]>;
+}
+
+// ELF 64 ABI Calls = Macho ABI Calls
+// Used to define BL8_ELF and BLA8_ELF
+let isCall = 1, noResults = 1, PPC970_Unit = 7, 
+  // All calls clobber the PPC64 non-callee saved registers.
+  Defs = [X0,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,
+          F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,
+          V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,
+          LR8,CTR8,
+          CR0,CR1,CR5,CR6,CR7] in {
+  // Convenient aliases for call instructions
+  def BL8_ELF  : IForm<18, 0, 1,
+                       (ops calltarget:$func, variable_ops), 
+                       "bl $func", BrB, []>;  // See Pat patterns below.
+                            
+  def BLA8_ELF : IForm<18, 1, 1,
+                       (ops aaddr:$func, variable_ops),
+                       "bla $func", BrB, [(PPCcall_ELF (i64 imm:$func))]>;
+}
+
+
+// Calls
+def : Pat<(PPCcall_Macho (i64 tglobaladdr:$dst)),
+          (BL8_Macho tglobaladdr:$dst)>;
+def : Pat<(PPCcall_Macho (i64 texternalsym:$dst)),
+          (BL8_Macho texternalsym:$dst)>;
+
+def : Pat<(PPCcall_ELF (i64 tglobaladdr:$dst)),
+          (BL8_ELF tglobaladdr:$dst)>;
+def : Pat<(PPCcall_ELF (i64 texternalsym:$dst)),
+          (BL8_ELF texternalsym:$dst)>;
+
+//===----------------------------------------------------------------------===//
+// 64-bit SPR manipulation instrs.
+
+def MFCTR8 : XFXForm_1_ext<31, 339, 9, (ops G8RC:$rT), "mfctr $rT", SprMFSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+let Pattern = [(PPCmtctr G8RC:$rS)] in {
+def MTCTR8 : XFXForm_7_ext<31, 467, 9, (ops G8RC:$rS), "mtctr $rS", SprMTSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
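+// DYNALLOC8 implements dynamic stack allocation; Imp<[X1],[X1]> marks it as
+// both using and redefining X1, the stack pointer.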
+def DYNALLOC8 : Pseudo<(ops G8RC:$result, G8RC:$negsize, memri:$fpsi),
+                       "${:comment} DYNALLOC8 $result, $negsize, $fpsi",
+                       [(set G8RC:$result,
+                             (PPCdynalloc G8RC:$negsize, iaddr:$fpsi))]>,
+                        Imp<[X1],[X1]>;
+
+def MTLR8  : XFXForm_7_ext<31, 467, 8, (ops G8RC:$rS), "mtlr $rS", SprMTSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+def MFLR8  : XFXForm_1_ext<31, 339, 8, (ops G8RC:$rT), "mflr $rT", SprMFSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+
+
+//===----------------------------------------------------------------------===//
+// Fixed point instructions.
+//
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+
+// Copies, extends, truncates.
+def OR4To8  : XForm_6<31, 444, (ops G8RC:$rA, GPRC:$rS, GPRC:$rB),
+                   "or $rA, $rS, $rB", IntGeneral,
+                   []>;
+def OR8To4  : XForm_6<31, 444, (ops GPRC:$rA, G8RC:$rS, G8RC:$rB),
+                   "or $rA, $rS, $rB", IntGeneral,
+                   []>;
+
+def LI8  : DForm_2_r0<14, (ops G8RC:$rD, symbolLo64:$imm),
+                      "li $rD, $imm", IntGeneral,
+                      [(set G8RC:$rD, immSExt16:$imm)]>;
+def LIS8 : DForm_2_r0<15, (ops G8RC:$rD, symbolHi64:$imm),
+                      "lis $rD, $imm", IntGeneral,
+                      [(set G8RC:$rD, imm16ShiftedSExt:$imm)]>;
+
+// Logical ops.
+def NAND8: XForm_6<31, 476, (ops G8RC:$rA, G8RC:$rS, G8RC:$rB),
+                   "nand $rA, $rS, $rB", IntGeneral,
+                   [(set G8RC:$rA, (not (and G8RC:$rS, G8RC:$rB)))]>;
+def AND8 : XForm_6<31,  28, (ops G8RC:$rA, G8RC:$rS, G8RC:$rB),
+                   "and $rA, $rS, $rB", IntGeneral,
+                   [(set G8RC:$rA, (and G8RC:$rS, G8RC:$rB))]>;
+def ANDC8: XForm_6<31,  60, (ops G8RC:$rA, G8RC:$rS, G8RC:$rB),
+                   "andc $rA, $rS, $rB", IntGeneral,
+                   [(set G8RC:$rA, (and G8RC:$rS, (not G8RC:$rB)))]>;
+def OR8  : XForm_6<31, 444, (ops G8RC:$rA, G8RC:$rS, G8RC:$rB),
+                   "or $rA, $rS, $rB", IntGeneral,
+                   [(set G8RC:$rA, (or G8RC:$rS, G8RC:$rB))]>;
+def NOR8 : XForm_6<31, 124, (ops G8RC:$rA, G8RC:$rS, G8RC:$rB),
+                   "nor $rA, $rS, $rB", IntGeneral,
+                   [(set G8RC:$rA, (not (or G8RC:$rS, G8RC:$rB)))]>;
+def ORC8 : XForm_6<31, 412, (ops G8RC:$rA, G8RC:$rS, G8RC:$rB),
+                   "orc $rA, $rS, $rB", IntGeneral,
+                   [(set G8RC:$rA, (or G8RC:$rS, (not G8RC:$rB)))]>;
+def EQV8 : XForm_6<31, 284, (ops G8RC:$rA, G8RC:$rS, G8RC:$rB),
+                   "eqv $rA, $rS, $rB", IntGeneral,
+                   [(set G8RC:$rA, (not (xor G8RC:$rS, G8RC:$rB)))]>;
+def XOR8 : XForm_6<31, 316, (ops G8RC:$rA, G8RC:$rS, G8RC:$rB),
+                   "xor $rA, $rS, $rB", IntGeneral,
+                   [(set G8RC:$rA, (xor G8RC:$rS, G8RC:$rB))]>;
+
+// Logical ops with immediate.
+def ANDIo8  : DForm_4<28, (ops G8RC:$dst, G8RC:$src1, u16imm:$src2),
+                      "andi. $dst, $src1, $src2", IntGeneral,
+                      [(set G8RC:$dst, (and G8RC:$src1, immZExt16:$src2))]>,
+                      isDOT;
+def ANDISo8 : DForm_4<29, (ops G8RC:$dst, G8RC:$src1, u16imm:$src2),
+                     "andis. $dst, $src1, $src2", IntGeneral,
+                    [(set G8RC:$dst, (and G8RC:$src1,imm16ShiftedZExt:$src2))]>,
+                     isDOT;
+def ORI8    : DForm_4<24, (ops G8RC:$dst, G8RC:$src1, u16imm:$src2),
+                      "ori $dst, $src1, $src2", IntGeneral,
+                      [(set G8RC:$dst, (or G8RC:$src1, immZExt16:$src2))]>;
+def ORIS8   : DForm_4<25, (ops G8RC:$dst, G8RC:$src1, u16imm:$src2),
+                      "oris $dst, $src1, $src2", IntGeneral,
+                    [(set G8RC:$dst, (or G8RC:$src1, imm16ShiftedZExt:$src2))]>;
+def XORI8   : DForm_4<26, (ops G8RC:$dst, G8RC:$src1, u16imm:$src2),
+                      "xori $dst, $src1, $src2", IntGeneral,
+                      [(set G8RC:$dst, (xor G8RC:$src1, immZExt16:$src2))]>;
+def XORIS8  : DForm_4<27, (ops G8RC:$dst, G8RC:$src1, u16imm:$src2),
+                      "xoris $dst, $src1, $src2", IntGeneral,
+                   [(set G8RC:$dst, (xor G8RC:$src1, imm16ShiftedZExt:$src2))]>;
+
+def ADD8  : XOForm_1<31, 266, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB),
+                     "add $rT, $rA, $rB", IntGeneral,
+                     [(set G8RC:$rT, (add G8RC:$rA, G8RC:$rB))]>;
+                     
+def ADDC8 : XOForm_1<31, 10, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB),
+                     "addc $rT, $rA, $rB", IntGeneral,
+                     [(set G8RC:$rT, (addc G8RC:$rA, G8RC:$rB))]>,
+                     PPC970_DGroup_Cracked;
+def ADDE8 : XOForm_1<31, 138, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB),
+                     "adde $rT, $rA, $rB", IntGeneral,
+                     [(set G8RC:$rT, (adde G8RC:$rA, G8RC:$rB))]>;
+                     
+def ADDI8  : DForm_2<14, (ops G8RC:$rD, G8RC:$rA, s16imm64:$imm),
+                     "addi $rD, $rA, $imm", IntGeneral,
+                     [(set G8RC:$rD, (add G8RC:$rA, immSExt16:$imm))]>;
+def ADDIS8 : DForm_2<15, (ops G8RC:$rD, G8RC:$rA, symbolHi64:$imm),
+                     "addis $rD, $rA, $imm", IntGeneral,
+                     [(set G8RC:$rD, (add G8RC:$rA, imm16ShiftedSExt:$imm))]>;
+
+def SUBFIC8: DForm_2< 8, (ops G8RC:$rD, G8RC:$rA, s16imm64:$imm),
+                     "subfic $rD, $rA, $imm", IntGeneral,
+                     [(set G8RC:$rD, (subc immSExt16:$imm, G8RC:$rA))]>;
+def SUBF8 : XOForm_1<31, 40, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB),
+                     "subf $rT, $rA, $rB", IntGeneral,
+                     [(set G8RC:$rT, (sub G8RC:$rB, G8RC:$rA))]>;
+
+def SUBFC8 : XOForm_1<31, 8, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB),
+                      "subfc $rT, $rA, $rB", IntGeneral,
+                      [(set G8RC:$rT, (subc G8RC:$rB, G8RC:$rA))]>,
+                      PPC970_DGroup_Cracked;
+
+def SUBFE8 : XOForm_1<31, 136, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB),
+                      "subfe $rT, $rA, $rB", IntGeneral,
+                      [(set G8RC:$rT, (sube G8RC:$rB, G8RC:$rA))]>;
+def ADDME8  : XOForm_3<31, 234, 0, (ops G8RC:$rT, G8RC:$rA),
+                       "addme $rT, $rA", IntGeneral,
+                       [(set G8RC:$rT, (adde G8RC:$rA, immAllOnes))]>;
+def ADDZE8  : XOForm_3<31, 202, 0, (ops G8RC:$rT, G8RC:$rA),
+                       "addze $rT, $rA", IntGeneral,
+                       [(set G8RC:$rT, (adde G8RC:$rA, 0))]>;
+def NEG8    : XOForm_3<31, 104, 0, (ops G8RC:$rT, G8RC:$rA),
+                       "neg $rT, $rA", IntGeneral,
+                       [(set G8RC:$rT, (ineg G8RC:$rA))]>;
+def SUBFME8 : XOForm_3<31, 232, 0, (ops G8RC:$rT, G8RC:$rA),
+                       "subfme $rT, $rA", IntGeneral,
+                       [(set G8RC:$rT, (sube immAllOnes, G8RC:$rA))]>;
+def SUBFZE8 : XOForm_3<31, 200, 0, (ops G8RC:$rT, G8RC:$rA),
+                       "subfze $rT, $rA", IntGeneral,
+                       [(set G8RC:$rT, (sube 0, G8RC:$rA))]>;
+
+
+
+def MULHD : XOForm_1<31, 73, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB),
+                     "mulhd $rT, $rA, $rB", IntMulHW,
+                     [(set G8RC:$rT, (mulhs G8RC:$rA, G8RC:$rB))]>;
+def MULHDU : XOForm_1<31, 9, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB),
+                     "mulhdu $rT, $rA, $rB", IntMulHWU,
+                     [(set G8RC:$rT, (mulhu G8RC:$rA, G8RC:$rB))]>;
+
+def CMPD   : XForm_16_ext<31, 0, (ops CRRC:$crD, G8RC:$rA, G8RC:$rB),
+                          "cmpd $crD, $rA, $rB", IntCompare>, isPPC64;
+def CMPLD  : XForm_16_ext<31, 32, (ops CRRC:$crD, G8RC:$rA, G8RC:$rB),
+                          "cmpld $crD, $rA, $rB", IntCompare>, isPPC64;
+def CMPDI  : DForm_5_ext<11, (ops CRRC:$crD, G8RC:$rA, s16imm:$imm),
+                         "cmpdi $crD, $rA, $imm", IntCompare>, isPPC64;
+def CMPLDI : DForm_6_ext<10, (ops CRRC:$dst, G8RC:$src1, u16imm:$src2),
+                         "cmpldi $dst, $src1, $src2", IntCompare>, isPPC64;
+
+def SLD  : XForm_6<31,  27, (ops G8RC:$rA, G8RC:$rS, GPRC:$rB),
+                   "sld $rA, $rS, $rB", IntRotateD,
+                   [(set G8RC:$rA, (shl G8RC:$rS, GPRC:$rB))]>, isPPC64;
+def SRD  : XForm_6<31, 539, (ops G8RC:$rA, G8RC:$rS, GPRC:$rB),
+                   "srd $rA, $rS, $rB", IntRotateD,
+                   [(set G8RC:$rA, (srl G8RC:$rS, GPRC:$rB))]>, isPPC64;
+def SRAD : XForm_6<31, 794, (ops G8RC:$rA, G8RC:$rS, GPRC:$rB),
+                   "srad $rA, $rS, $rB", IntRotateD,
+                   [(set G8RC:$rA, (sra G8RC:$rS, GPRC:$rB))]>, isPPC64;
+                   
+def EXTSB8 : XForm_11<31, 954, (ops G8RC:$rA, G8RC:$rS),
+                      "extsb $rA, $rS", IntGeneral,
+                      [(set G8RC:$rA, (sext_inreg G8RC:$rS, i8))]>;
+def EXTSH8 : XForm_11<31, 922, (ops G8RC:$rA, G8RC:$rS),
+                      "extsh $rA, $rS", IntGeneral,
+                      [(set G8RC:$rA, (sext_inreg G8RC:$rS, i16))]>;
+
+def EXTSW  : XForm_11<31, 986, (ops G8RC:$rA, G8RC:$rS),
+                      "extsw $rA, $rS", IntGeneral,
+                      [(set G8RC:$rA, (sext_inreg G8RC:$rS, i32))]>, isPPC64;
+/// EXTSW_32 - Just like EXTSW, but works on '32-bit' registers.
+def EXTSW_32 : XForm_11<31, 986, (ops GPRC:$rA, GPRC:$rS),
+                      "extsw $rA, $rS", IntGeneral,
+                      [(set GPRC:$rA, (PPCextsw_32 GPRC:$rS))]>, isPPC64;
+def EXTSW_32_64 : XForm_11<31, 986, (ops G8RC:$rA, GPRC:$rS),
+                      "extsw $rA, $rS", IntGeneral,
+                      [(set G8RC:$rA, (sext GPRC:$rS))]>, isPPC64;
+
+def SRADI  : XSForm_1<31, 413, (ops G8RC:$rA, G8RC:$rS, u6imm:$SH),
+                      "sradi $rA, $rS, $SH", IntRotateD,
+                      [(set G8RC:$rA, (sra G8RC:$rS, (i32 imm:$SH)))]>, isPPC64;
+def CNTLZD : XForm_11<31, 58, (ops G8RC:$rA, G8RC:$rS),
+                      "cntlzd $rA, $rS", IntGeneral,
+                      [(set G8RC:$rA, (ctlz G8RC:$rS))]>;
+
+def DIVD  : XOForm_1<31, 489, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB),
+                     "divd $rT, $rA, $rB", IntDivD,
+                     [(set G8RC:$rT, (sdiv G8RC:$rA, G8RC:$rB))]>, isPPC64,
+                     PPC970_DGroup_First, PPC970_DGroup_Cracked;
+def DIVDU : XOForm_1<31, 457, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB),
+                     "divdu $rT, $rA, $rB", IntDivD,
+                     [(set G8RC:$rT, (udiv G8RC:$rA, G8RC:$rB))]>, isPPC64,
+                     PPC970_DGroup_First, PPC970_DGroup_Cracked;
+def MULLD : XOForm_1<31, 233, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB),
+                     "mulld $rT, $rA, $rB", IntMulHD,
+                     [(set G8RC:$rT, (mul G8RC:$rA, G8RC:$rB))]>, isPPC64;
+
+
+let isCommutable = 1 in {
+def RLDIMI : MDForm_1<30, 3,
+                      (ops G8RC:$rA, G8RC:$rSi, G8RC:$rS, u6imm:$SH, u6imm:$MB),
+                      "rldimi $rA, $rS, $SH, $MB", IntRotateD,
+                      []>, isPPC64, RegConstraint<"$rSi = $rA">,
+                      NoEncode<"$rSi">;
+}
+
+// Rotate instructions.
+def RLDICL : MDForm_1<30, 0,
+                      (ops G8RC:$rA, G8RC:$rS, u6imm:$SH, u6imm:$MB),
+                      "rldicl $rA, $rS, $SH, $MB", IntRotateD,
+                      []>, isPPC64;
+def RLDICR : MDForm_1<30, 1,
+                      (ops G8RC:$rA, G8RC:$rS, u6imm:$SH, u6imm:$ME),
+                      "rldicr $rA, $rS, $SH, $ME", IntRotateD,
+                      []>, isPPC64;
+}  // End FXU Operations.
+
+
+//===----------------------------------------------------------------------===//
+// Load/Store instructions.
+//
+
+
+// Sign extending loads.
+let isLoad = 1, PPC970_Unit = 2 in {
+def LHA8: DForm_1<42, (ops G8RC:$rD, memri:$src),
+                  "lha $rD, $src", LdStLHA,
+                  [(set G8RC:$rD, (sextloadi16 iaddr:$src))]>,
+                  PPC970_DGroup_Cracked;
+def LWA  : DSForm_1<58, 2, (ops G8RC:$rD, memrix:$src),
+                    "lwa $rD, $src", LdStLWA,
+                    [(set G8RC:$rD, (sextloadi32 ixaddr:$src))]>, isPPC64,
+                    PPC970_DGroup_Cracked;
+def LHAX8: XForm_1<31, 343, (ops G8RC:$rD, memrr:$src),
+                   "lhax $rD, $src", LdStLHA,
+                   [(set G8RC:$rD, (sextloadi16 xaddr:$src))]>,
+                   PPC970_DGroup_Cracked;
+def LWAX : XForm_1<31, 341, (ops G8RC:$rD, memrr:$src),
+                   "lwax $rD, $src", LdStLHA,
+                   [(set G8RC:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
+                   PPC970_DGroup_Cracked;
+
+// Update forms.
+def LHAU8 : DForm_1<43, (ops G8RC:$rD, ptr_rc:$ea_result, symbolLo:$disp,
+                            ptr_rc:$rA),
+                    "lhau $rD, $disp($rA)", LdStGeneral,
+                    []>, RegConstraint<"$rA = $ea_result">,
+                    NoEncode<"$ea_result">;
+// NO LWAU!
+
+}
+
+// Zero extending loads.
+let isLoad = 1, PPC970_Unit = 2 in {
+def LBZ8 : DForm_1<34, (ops G8RC:$rD, memri:$src),
+                  "lbz $rD, $src", LdStGeneral,
+                  [(set G8RC:$rD, (zextloadi8 iaddr:$src))]>;
+def LHZ8 : DForm_1<40, (ops G8RC:$rD, memri:$src),
+                  "lhz $rD, $src", LdStGeneral,
+                  [(set G8RC:$rD, (zextloadi16 iaddr:$src))]>;
+def LWZ8 : DForm_1<32, (ops G8RC:$rD, memri:$src),
+                  "lwz $rD, $src", LdStGeneral,
+                  [(set G8RC:$rD, (zextloadi32 iaddr:$src))]>, isPPC64;
+
+def LBZX8 : XForm_1<31,  87, (ops G8RC:$rD, memrr:$src),
+                   "lbzx $rD, $src", LdStGeneral,
+                   [(set G8RC:$rD, (zextloadi8 xaddr:$src))]>;
+def LHZX8 : XForm_1<31, 279, (ops G8RC:$rD, memrr:$src),
+                   "lhzx $rD, $src", LdStGeneral,
+                   [(set G8RC:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX8 : XForm_1<31,  23, (ops G8RC:$rD, memrr:$src),
+                   "lwzx $rD, $src", LdStGeneral,
+                   [(set G8RC:$rD, (zextloadi32 xaddr:$src))]>;
+                   
+                   
+// Update forms.
+def LBZU8 : DForm_1<35, (ops G8RC:$rD, ptr_rc:$ea_result, memri:$addr),
+                    "lbzu $rD, $addr", LdStGeneral,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+def LHZU8 : DForm_1<41, (ops G8RC:$rD, ptr_rc:$ea_result, memri:$addr),
+                    "lhzu $rD, $addr", LdStGeneral,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+def LWZU8 : DForm_1<33, (ops G8RC:$rD, ptr_rc:$ea_result, memri:$addr),
+                    "lwzu $rD, $addr", LdStGeneral,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+}
+
+
+// Full 8-byte loads.
+let isLoad = 1, PPC970_Unit = 2 in {
+def LD   : DSForm_1<58, 0, (ops G8RC:$rD, memrix:$src),
+                    "ld $rD, $src", LdStLD,
+                    [(set G8RC:$rD, (load ixaddr:$src))]>, isPPC64;
+def LDX  : XForm_1<31,  21, (ops G8RC:$rD, memrr:$src),
+                   "ldx $rD, $src", LdStLD,
+                   [(set G8RC:$rD, (load xaddr:$src))]>, isPPC64;
+                   
+def LDU  : DSForm_1<58, 1, (ops G8RC:$rD, ptr_rc:$ea_result, memrix:$addr),
+                    "ldu $rD, $addr", LdStLD,
+                    []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
+                    NoEncode<"$ea_result">;
+
+}
+
+let isStore = 1, noResults = 1, PPC970_Unit = 2 in {
+// Truncating stores.                       
+def STB8 : DForm_1<38, (ops G8RC:$rS, memri:$src),
+                   "stb $rS, $src", LdStGeneral,
+                   [(truncstorei8 G8RC:$rS, iaddr:$src)]>;
+def STH8 : DForm_1<44, (ops G8RC:$rS, memri:$src),
+                   "sth $rS, $src", LdStGeneral,
+                   [(truncstorei16 G8RC:$rS, iaddr:$src)]>;
+def STW8 : DForm_1<36, (ops G8RC:$rS, memri:$src),
+                   "stw $rS, $src", LdStGeneral,
+                   [(truncstorei32 G8RC:$rS, iaddr:$src)]>;
+def STBX8 : XForm_8<31, 215, (ops G8RC:$rS, memrr:$dst),
+                   "stbx $rS, $dst", LdStGeneral,
+                   [(truncstorei8 G8RC:$rS, xaddr:$dst)]>, 
+                   PPC970_DGroup_Cracked;
+def STHX8 : XForm_8<31, 407, (ops G8RC:$rS, memrr:$dst),
+                   "sthx $rS, $dst", LdStGeneral,
+                   [(truncstorei16 G8RC:$rS, xaddr:$dst)]>, 
+                   PPC970_DGroup_Cracked;
+def STWX8 : XForm_8<31, 151, (ops G8RC:$rS, memrr:$dst),
+                   "stwx $rS, $dst", LdStGeneral,
+                   [(truncstorei32 G8RC:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+// Normal 8-byte stores.
+def STD  : DSForm_1<62, 0, (ops G8RC:$rS, memrix:$dst),
+                    "std $rS, $dst", LdStSTD,
+                    [(store G8RC:$rS, ixaddr:$dst)]>, isPPC64;
+def STDX  : XForm_8<31, 149, (ops G8RC:$rS, memrr:$dst),
+                   "stdx $rS, $dst", LdStSTD,
+                   [(store G8RC:$rS, xaddr:$dst)]>, isPPC64,
+                   PPC970_DGroup_Cracked;
+}
+
+let isStore = 1, PPC970_Unit = 2 in {
+
+def STBU8 : DForm_1<38, (ops ptr_rc:$ea_res, G8RC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stbu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res,
+                          (pre_truncsti8 G8RC:$rS, ptr_rc:$ptrreg, 
+                                         iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STHU8 : DForm_1<45, (ops ptr_rc:$ea_res, G8RC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "sthu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res,
+                        (pre_truncsti16 G8RC:$rS, ptr_rc:$ptrreg, 
+                                        iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STWU8 : DForm_1<37, (ops ptr_rc:$ea_res, G8RC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stwu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res, (pre_store G8RC:$rS, ptr_rc:$ptrreg, 
+                                                     iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+
+
+def STDU : DSForm_1<62, 1, (ops ptr_rc:$ea_res, G8RC:$rS,
+                                s16immX4:$ptroff, ptr_rc:$ptrreg),
+                    "stdu $rS, $ptroff($ptrreg)", LdStSTD,
+                    [(set ptr_rc:$ea_res, (pre_store G8RC:$rS, ptr_rc:$ptrreg, 
+                                                     iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    isPPC64;
+
+}
+
+let isStore = 1, noResults = 1, PPC970_Unit = 2 in {
+
+def STDUX : XForm_8<31, 181, (ops G8RC:$rS, memrr:$dst),
+                   "stdux $rS, $dst", LdStSTD,
+                   []>, isPPC64;
+                   
+
+// STD_32/STDX_32 - Just like STD/STDX, but uses a '32-bit' input register.
+def STD_32  : DSForm_1<62, 0, (ops GPRC:$rT, memrix:$dst),
+                       "std $rT, $dst", LdStSTD,
+                       [(PPCstd_32  GPRC:$rT, ixaddr:$dst)]>, isPPC64;
+def STDX_32  : XForm_8<31, 149, (ops GPRC:$rT, memrr:$dst),
+                       "stdx $rT, $dst", LdStSTD,
+                       [(PPCstd_32  GPRC:$rT, xaddr:$dst)]>, isPPC64,
+                       PPC970_DGroup_Cracked;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Floating point instructions.
+//
+
+
+let PPC970_Unit = 3 in {  // FPU Operations.
+def FCFID  : XForm_26<63, 846, (ops F8RC:$frD, F8RC:$frB),
+                      "fcfid $frD, $frB", FPGeneral,
+                      [(set F8RC:$frD, (PPCfcfid F8RC:$frB))]>, isPPC64;
+def FCTIDZ : XForm_26<63, 815, (ops F8RC:$frD, F8RC:$frB),
+                      "fctidz $frD, $frB", FPGeneral,
+                      [(set F8RC:$frD, (PPCfctidz F8RC:$frB))]>, isPPC64;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Patterns
+//
+
+// Extensions and truncates to/from 32-bit regs.
+def : Pat<(i64 (zext GPRC:$in)),
+          (RLDICL (OR4To8 GPRC:$in, GPRC:$in), 0, 32)>;
+def : Pat<(i64 (anyext GPRC:$in)),
+          (OR4To8 GPRC:$in, GPRC:$in)>;
+def : Pat<(i32 (trunc G8RC:$in)),
+          (OR8To4 G8RC:$in, G8RC:$in)>;
+
+// Extending loads with i64 targets.
+def : Pat<(zextloadi1 iaddr:$src),
+          (LBZ8 iaddr:$src)>;
+def : Pat<(zextloadi1 xaddr:$src),
+          (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi1 iaddr:$src),
+          (LBZ8 iaddr:$src)>;
+def : Pat<(extloadi1 xaddr:$src),
+          (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi8 iaddr:$src),
+          (LBZ8 iaddr:$src)>;
+def : Pat<(extloadi8 xaddr:$src),
+          (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi16 iaddr:$src),
+          (LHZ8 iaddr:$src)>;
+def : Pat<(extloadi16 xaddr:$src),
+          (LHZX8 xaddr:$src)>;
+def : Pat<(extloadi32 iaddr:$src),
+          (LWZ8 iaddr:$src)>;
+def : Pat<(extloadi32 xaddr:$src),
+          (LWZX8 xaddr:$src)>;
+
+// SHL/SRL
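+// e.g. a 64-bit shl by 5 selects "rldicr rA, rS, 5, 58" (SHL64 computes 63-5),
+// and a 64-bit srl by 5 selects "rldicl rA, rS, 59, 5" (SRL64 computes 64-5).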
+def : Pat<(shl G8RC:$in, (i32 imm:$imm)),
+          (RLDICR G8RC:$in, imm:$imm, (SHL64 imm:$imm))>;
+def : Pat<(srl G8RC:$in, (i32 imm:$imm)),
+          (RLDICL G8RC:$in, (SRL64 imm:$imm), imm:$imm)>;
+
+// Hi and Lo for Darwin Global Addresses.
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS8 tglobaladdr:$in)>;
+def : Pat<(PPClo tglobaladdr:$in, 0), (LI8  tglobaladdr:$in)>;
+def : Pat<(PPChi tconstpool:$in , 0), (LIS8 tconstpool:$in)>;
+def : Pat<(PPClo tconstpool:$in , 0), (LI8  tconstpool:$in)>;
+def : Pat<(PPChi tjumptable:$in , 0), (LIS8 tjumptable:$in)>;
+def : Pat<(PPClo tjumptable:$in , 0), (LI8  tjumptable:$in)>;
+def : Pat<(add G8RC:$in, (PPChi tglobaladdr:$g, 0)),
+          (ADDIS8 G8RC:$in, tglobaladdr:$g)>;
+def : Pat<(add G8RC:$in, (PPChi tconstpool:$g, 0)),
+          (ADDIS8 G8RC:$in, tconstpool:$g)>;
+def : Pat<(add G8RC:$in, (PPChi tjumptable:$g, 0)),
+          (ADDIS8 G8RC:$in, tjumptable:$g)>;
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
new file mode 100644
index 0000000..8a2f255
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -0,0 +1,622 @@
+//===- PPCInstrAltivec.td - The PowerPC Altivec Extension --*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Altivec extension to the PowerPC instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Altivec transformation functions and pattern fragments.
+//
+
+/// VPKUHUM_shuffle_mask/VPKUWUM_shuffle_mask - Return true if this is a valid
+/// shuffle mask for the VPKUHUM or VPKUWUM instructions.
+def VPKUHUM_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isVPKUHUMShuffleMask(N, false);
+}]>;
+def VPKUWUM_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isVPKUWUMShuffleMask(N, false);
+}]>;
+
+def VPKUHUM_unary_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isVPKUHUMShuffleMask(N, true);
+}]>;
+def VPKUWUM_unary_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isVPKUWUMShuffleMask(N, true);
+}]>;
+
+
+def VMRGLB_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isVMRGLShuffleMask(N, 1, false);
+}]>;
+def VMRGLH_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isVMRGLShuffleMask(N, 2, false);
+}]>;
+def VMRGLW_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isVMRGLShuffleMask(N, 4, false);
+}]>;
+def VMRGHB_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isVMRGHShuffleMask(N, 1, false);
+}]>;
+def VMRGHH_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isVMRGHShuffleMask(N, 2, false);
+}]>;
+def VMRGHW_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isVMRGHShuffleMask(N, 4, false);
+}]>;
+
+def VMRGLB_unary_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isVMRGLShuffleMask(N, 1, true);
+}]>;
+def VMRGLH_unary_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isVMRGLShuffleMask(N, 2, true);
+}]>;
+def VMRGLW_unary_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isVMRGLShuffleMask(N, 4, true);
+}]>;
+def VMRGHB_unary_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isVMRGHShuffleMask(N, 1, true);
+}]>;
+def VMRGHH_unary_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isVMRGHShuffleMask(N, 2, true);
+}]>;
+def VMRGHW_unary_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isVMRGHShuffleMask(N, 4, true);
+}]>;
+
+
+def VSLDOI_get_imm : SDNodeXForm<build_vector, [{
+  return getI32Imm(PPC::isVSLDOIShuffleMask(N, false));
+}]>;
+def VSLDOI_shuffle_mask :  PatLeaf<(build_vector), [{
+  return PPC::isVSLDOIShuffleMask(N, false) != -1;
+}], VSLDOI_get_imm>;
+
+/// VSLDOI_unary* - These are used to match vsldoi(X,X), which is turned into
+/// vector_shuffle(X,undef,mask) by the dag combiner.
+def VSLDOI_unary_get_imm : SDNodeXForm<build_vector, [{
+  return getI32Imm(PPC::isVSLDOIShuffleMask(N, true));
+}]>;
+def VSLDOI_unary_shuffle_mask :  PatLeaf<(build_vector), [{
+  return PPC::isVSLDOIShuffleMask(N, true) != -1;
+}], VSLDOI_unary_get_imm>;
+
+
+// VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm.
+def VSPLTB_get_imm : SDNodeXForm<build_vector, [{
+  return getI32Imm(PPC::getVSPLTImmediate(N, 1));
+}]>;
+def VSPLTB_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isSplatShuffleMask(N, 1);
+}], VSPLTB_get_imm>;
+def VSPLTH_get_imm : SDNodeXForm<build_vector, [{
+  return getI32Imm(PPC::getVSPLTImmediate(N, 2));
+}]>;
+def VSPLTH_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isSplatShuffleMask(N, 2);
+}], VSPLTH_get_imm>;
+def VSPLTW_get_imm : SDNodeXForm<build_vector, [{
+  return getI32Imm(PPC::getVSPLTImmediate(N, 4));
+}]>;
+def VSPLTW_shuffle_mask : PatLeaf<(build_vector), [{
+  return PPC::isSplatShuffleMask(N, 4);
+}], VSPLTW_get_imm>;
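+// For example, a splat of word element 3 shows up here as a v16i8 mask of
+// 12,13,14,15 repeated; isSplatShuffleMask(N, 4) accepts it and
+// getVSPLTImmediate(N, 4) returns 3 for the vspltw UIMM field.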
+
+
+// VSPLTISB_get_imm xform function: convert build_vector to VSPLTISB imm.
+def VSPLTISB_get_imm : SDNodeXForm<build_vector, [{
+  return PPC::get_VSPLTI_elt(N, 1, *CurDAG);
+}]>;
+def vecspltisb : PatLeaf<(build_vector), [{
+  return PPC::get_VSPLTI_elt(N, 1, *CurDAG).Val != 0;
+}], VSPLTISB_get_imm>;
+
+// VSPLTISH_get_imm xform function: convert build_vector to VSPLTISH imm.
+def VSPLTISH_get_imm : SDNodeXForm<build_vector, [{
+  return PPC::get_VSPLTI_elt(N, 2, *CurDAG);
+}]>;
+def vecspltish : PatLeaf<(build_vector), [{
+  return PPC::get_VSPLTI_elt(N, 2, *CurDAG).Val != 0;
+}], VSPLTISH_get_imm>;
+
+// VSPLTISW_get_imm xform function: convert build_vector to VSPLTISW imm.
+def VSPLTISW_get_imm : SDNodeXForm<build_vector, [{
+  return PPC::get_VSPLTI_elt(N, 4, *CurDAG);
+}]>;
+def vecspltisw : PatLeaf<(build_vector), [{
+  return PPC::get_VSPLTI_elt(N, 4, *CurDAG).Val != 0;
+}], VSPLTISW_get_imm>;
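+// These leaves match build_vectors whose value can be materialized by a single
+// vspltisb/vspltish/vspltisw of one 5-bit signed immediate (-16..15, splatted
+// across the vector); get_VSPLTI_elt returns that immediate node, or a null
+// SDOperand otherwise.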
+
+//===----------------------------------------------------------------------===//
+// Helpers for defining instructions that directly correspond to intrinsics.
+
+// VA1a_Int - A VAForm_1a intrinsic definition.
+class VA1a_Int<bits<6> xo, string opc, Intrinsic IntID>
+  : VAForm_1a<xo, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB, VRRC:$vC),
+              !strconcat(opc, " $vD, $vA, $vB, $vC"), VecFP,
+                       [(set VRRC:$vD, (IntID VRRC:$vA, VRRC:$vB, VRRC:$vC))]>;
+
+// VX1_Int - A VXForm_1 intrinsic definition.
+class VX1_Int<bits<11> xo, string opc, Intrinsic IntID>
+  : VXForm_1<xo, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+             !strconcat(opc, " $vD, $vA, $vB"), VecFP,
+             [(set VRRC:$vD, (IntID VRRC:$vA, VRRC:$vB))]>;
+
+// VX2_Int - A VXForm_2 intrinsic definition.
+class VX2_Int<bits<11> xo, string opc, Intrinsic IntID>
+  : VXForm_2<xo, (ops VRRC:$vD, VRRC:$vB),
+             !strconcat(opc, " $vD, $vB"), VecFP,
+             [(set VRRC:$vD, (IntID VRRC:$vB))]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+
+def IMPLICIT_DEF_VRRC : Pseudo<(ops VRRC:$rD), "; IMPLICIT_DEF_VRRC $rD",
+                               [(set VRRC:$rD, (v4i32 (undef)))]>;
+
+let noResults = 1 in {
+def DSS   : DSS_Form<822, (ops u5imm:$A, u5imm:$STRM,u5imm:$ZERO1,u5imm:$ZERO2),
+                     "dss $STRM, $A", LdStGeneral /*FIXME*/, []>;
+def DST   : DSS_Form<342, (ops u5imm:$T, u5imm:$STRM, GPRC:$rA, GPRC:$rB),
+                     "dst $rA, $rB, $STRM, $T", LdStGeneral /*FIXME*/, []>;
+def DSTST : DSS_Form<374, (ops u5imm:$T, u5imm:$STRM, GPRC:$rA, GPRC:$rB),
+                   "dstst $rA, $rB, $STRM, $T", LdStGeneral /*FIXME*/, []>;
+}
+
+def MFVSCR : VXForm_4<1540, (ops VRRC:$vD),
+                      "mfvscr $vD", LdStGeneral,
+                      [(set VRRC:$vD, (int_ppc_altivec_mfvscr))]>;
+def MTVSCR : VXForm_5<1604, (ops VRRC:$vB),
+                      "mtvscr $vB", LdStGeneral,
+                      [(int_ppc_altivec_mtvscr VRRC:$vB)]>;
+
+let isLoad = 1, PPC970_Unit = 2 in {  // Loads.
+def LVEBX: XForm_1<31,   7, (ops VRRC:$vD, memrr:$src),
+                   "lvebx $vD, $src", LdStGeneral,
+                   [(set VRRC:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>;
+def LVEHX: XForm_1<31,  39, (ops VRRC:$vD, memrr:$src),
+                   "lvehx $vD, $src", LdStGeneral,
+                   [(set VRRC:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>;
+def LVEWX: XForm_1<31,  71, (ops VRRC:$vD, memrr:$src),
+                   "lvewx $vD, $src", LdStGeneral,
+                   [(set VRRC:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>;
+def LVX  : XForm_1<31, 103, (ops VRRC:$vD, memrr:$src),
+                   "lvx $vD, $src", LdStGeneral,
+                   [(set VRRC:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>;
+def LVXL : XForm_1<31, 359, (ops VRRC:$vD, memrr:$src),
+                   "lvxl $vD, $src", LdStGeneral,
+                   [(set VRRC:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>;
+}
+
+def LVSL : XForm_1<31,   6, (ops VRRC:$vD, memrr:$src),
+                   "lvsl $vD, $src", LdStGeneral,
+                   [(set VRRC:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>,
+                   PPC970_Unit_LSU;
+def LVSR : XForm_1<31,  38, (ops VRRC:$vD, memrr:$src),
+                   "lvsr $vD, $src", LdStGeneral,
+                   [(set VRRC:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>,
+                   PPC970_Unit_LSU;
+
+let isStore = 1, noResults = 1, PPC970_Unit = 2 in {   // Stores.
+def STVEBX: XForm_8<31, 135, (ops VRRC:$rS, memrr:$dst),
+                   "stvebx $rS, $dst", LdStGeneral,
+                   [(int_ppc_altivec_stvebx VRRC:$rS, xoaddr:$dst)]>;
+def STVEHX: XForm_8<31, 167, (ops VRRC:$rS, memrr:$dst),
+                   "stvehx $rS, $dst", LdStGeneral,
+                   [(int_ppc_altivec_stvehx VRRC:$rS, xoaddr:$dst)]>;
+def STVEWX: XForm_8<31, 199, (ops VRRC:$rS, memrr:$dst),
+                   "stvewx $rS, $dst", LdStGeneral,
+                   [(int_ppc_altivec_stvewx VRRC:$rS, xoaddr:$dst)]>;
+def STVX  : XForm_8<31, 231, (ops VRRC:$rS, memrr:$dst),
+                   "stvx $rS, $dst", LdStGeneral,
+                   [(int_ppc_altivec_stvx VRRC:$rS, xoaddr:$dst)]>;
+def STVXL : XForm_8<31, 487, (ops VRRC:$rS, memrr:$dst),
+                   "stvxl $rS, $dst", LdStGeneral,
+                   [(int_ppc_altivec_stvxl VRRC:$rS, xoaddr:$dst)]>;
+}
+
+let PPC970_Unit = 5 in {  // VALU Operations.
+// VA-Form instructions.  3-input AltiVec ops.
+def VMADDFP : VAForm_1<46, (ops VRRC:$vD, VRRC:$vA, VRRC:$vC, VRRC:$vB),
+                       "vmaddfp $vD, $vA, $vC, $vB", VecFP,
+                       [(set VRRC:$vD, (fadd (fmul VRRC:$vA, VRRC:$vC),
+                                             VRRC:$vB))]>,
+                       Requires<[FPContractions]>;
+def VNMSUBFP: VAForm_1<47, (ops VRRC:$vD, VRRC:$vA, VRRC:$vC, VRRC:$vB),
+                       "vnmsubfp $vD, $vA, $vC, $vB", VecFP,
+                       [(set VRRC:$vD, (fneg (fsub (fmul VRRC:$vA, VRRC:$vC),
+                                                   VRRC:$vB)))]>,
+                       Requires<[FPContractions]>;
+
+def VMHADDSHS  : VA1a_Int<32, "vmhaddshs",  int_ppc_altivec_vmhaddshs>;
+def VMHRADDSHS : VA1a_Int<33, "vmhraddshs", int_ppc_altivec_vmhraddshs>;
+def VMLADDUHM  : VA1a_Int<34, "vmladduhm",  int_ppc_altivec_vmladduhm>;
+def VPERM      : VA1a_Int<43, "vperm",      int_ppc_altivec_vperm>;
+def VSEL       : VA1a_Int<42, "vsel",       int_ppc_altivec_vsel>;
+
+// Shuffles.
+def VSLDOI  : VAForm_2<44, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB, u5imm:$SH),
+                       "vsldoi $vD, $vA, $vB, $SH", VecFP,
+                       [(set VRRC:$vD, 
+                             (vector_shuffle (v16i8 VRRC:$vA), VRRC:$vB,
+                                             VSLDOI_shuffle_mask:$SH))]>;
+
+// VX-Form instructions.  AltiVec arithmetic ops.
+def VADDFP : VXForm_1<10, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                      "vaddfp $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (fadd VRRC:$vA, VRRC:$vB))]>;
+                      
+def VADDUBM : VXForm_1<0, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                      "vaddubm $vD, $vA, $vB", VecGeneral,
+                      [(set VRRC:$vD, (add (v16i8 VRRC:$vA), VRRC:$vB))]>;
+def VADDUHM : VXForm_1<64, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                      "vadduhm $vD, $vA, $vB", VecGeneral,
+                      [(set VRRC:$vD, (add (v8i16 VRRC:$vA), VRRC:$vB))]>;
+def VADDUWM : VXForm_1<128, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                      "vadduwm $vD, $vA, $vB", VecGeneral,
+                      [(set VRRC:$vD, (add (v4i32 VRRC:$vA), VRRC:$vB))]>;
+                      
+def VADDCUW : VX1_Int<384, "vaddcuw", int_ppc_altivec_vaddcuw>;
+def VADDSBS : VX1_Int<768, "vaddsbs", int_ppc_altivec_vaddsbs>;
+def VADDSHS : VX1_Int<832, "vaddshs", int_ppc_altivec_vaddshs>;
+def VADDSWS : VX1_Int<896, "vaddsws", int_ppc_altivec_vaddsws>;
+def VADDUBS : VX1_Int<512, "vaddubs", int_ppc_altivec_vaddubs>;
+def VADDUHS : VX1_Int<576, "vadduhs", int_ppc_altivec_vadduhs>;
+def VADDUWS : VX1_Int<640, "vadduws", int_ppc_altivec_vadduws>;
+                             
+                             
+def VAND : VXForm_1<1028, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                    "vand $vD, $vA, $vB", VecFP,
+                    [(set VRRC:$vD, (and (v4i32 VRRC:$vA), VRRC:$vB))]>;
+def VANDC : VXForm_1<1092, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                     "vandc $vD, $vA, $vB", VecFP,
+                     [(set VRRC:$vD, (and (v4i32 VRRC:$vA), (vnot VRRC:$vB)))]>;
+
+def VCFSX  : VXForm_1<842, (ops VRRC:$vD, u5imm:$UIMM, VRRC:$vB),
+                      "vcfsx $vD, $vB, $UIMM", VecFP,
+                      [(set VRRC:$vD,
+                             (int_ppc_altivec_vcfsx VRRC:$vB, imm:$UIMM))]>;
+def VCFUX  : VXForm_1<778, (ops VRRC:$vD, u5imm:$UIMM, VRRC:$vB),
+                      "vcfux $vD, $vB, $UIMM", VecFP,
+                      [(set VRRC:$vD,
+                             (int_ppc_altivec_vcfux VRRC:$vB, imm:$UIMM))]>;
+def VCTSXS : VXForm_1<970, (ops VRRC:$vD, u5imm:$UIMM, VRRC:$vB),
+                      "vctsxs $vD, $vB, $UIMM", VecFP,
+                      [(set VRRC:$vD,
+                             (int_ppc_altivec_vctsxs VRRC:$vB, imm:$UIMM))]>;
+def VCTUXS : VXForm_1<906, (ops VRRC:$vD, u5imm:$UIMM, VRRC:$vB),
+                      "vctuxs $vD, $vB, $UIMM", VecFP,
+                      [(set VRRC:$vD,
+                             (int_ppc_altivec_vctuxs VRRC:$vB, imm:$UIMM))]>;
+def VEXPTEFP : VX2_Int<394, "vexptefp", int_ppc_altivec_vexptefp>;
+def VLOGEFP  : VX2_Int<458, "vlogefp",  int_ppc_altivec_vlogefp>;
+
+def VAVGSB : VX1_Int<1282, "vavgsb", int_ppc_altivec_vavgsb>;
+def VAVGSH : VX1_Int<1346, "vavgsh", int_ppc_altivec_vavgsh>;
+def VAVGSW : VX1_Int<1410, "vavgsw", int_ppc_altivec_vavgsw>;
+def VAVGUB : VX1_Int<1026, "vavgub", int_ppc_altivec_vavgub>;
+def VAVGUH : VX1_Int<1090, "vavguh", int_ppc_altivec_vavguh>;
+def VAVGUW : VX1_Int<1154, "vavguw", int_ppc_altivec_vavguw>;
+
+def VMAXFP : VX1_Int<1034, "vmaxfp", int_ppc_altivec_vmaxfp>;
+def VMAXSB : VX1_Int< 258, "vmaxsb", int_ppc_altivec_vmaxsb>;
+def VMAXSH : VX1_Int< 322, "vmaxsh", int_ppc_altivec_vmaxsh>;
+def VMAXSW : VX1_Int< 386, "vmaxsw", int_ppc_altivec_vmaxsw>;
+def VMAXUB : VX1_Int<   2, "vmaxub", int_ppc_altivec_vmaxub>;
+def VMAXUH : VX1_Int<  66, "vmaxuh", int_ppc_altivec_vmaxuh>;
+def VMAXUW : VX1_Int< 130, "vmaxuw", int_ppc_altivec_vmaxuw>;
+def VMINFP : VX1_Int<1098, "vminfp", int_ppc_altivec_vminfp>;
+def VMINSB : VX1_Int< 770, "vminsb", int_ppc_altivec_vminsb>;
+def VMINSH : VX1_Int< 834, "vminsh", int_ppc_altivec_vminsh>;
+def VMINSW : VX1_Int< 898, "vminsw", int_ppc_altivec_vminsw>;
+def VMINUB : VX1_Int< 514, "vminub", int_ppc_altivec_vminub>;
+def VMINUH : VX1_Int< 578, "vminuh", int_ppc_altivec_vminuh>;
+def VMINUW : VX1_Int< 642, "vminuw", int_ppc_altivec_vminuw>;
+
+def VMRGHB : VXForm_1< 12, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                      "vmrghb $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vA),
+                                             VRRC:$vB, VMRGHB_shuffle_mask))]>;
+def VMRGHH : VXForm_1< 76, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                      "vmrghh $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vA),
+                                             VRRC:$vB, VMRGHH_shuffle_mask))]>;
+def VMRGHW : VXForm_1<140, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                      "vmrghw $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vA),
+                                             VRRC:$vB, VMRGHW_shuffle_mask))]>;
+def VMRGLB : VXForm_1<268, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                      "vmrglb $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vA),
+                                             VRRC:$vB, VMRGLB_shuffle_mask))]>;
+def VMRGLH : VXForm_1<332, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                      "vmrglh $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vA),
+                                             VRRC:$vB, VMRGLH_shuffle_mask))]>;
+def VMRGLW : VXForm_1<396, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                      "vmrglw $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vA),
+                                             VRRC:$vB, VMRGLW_shuffle_mask))]>;
+
+def VMSUMMBM : VA1a_Int<37, "vmsummbm", int_ppc_altivec_vmsummbm>;
+def VMSUMSHM : VA1a_Int<40, "vmsumshm", int_ppc_altivec_vmsumshm>;
+def VMSUMSHS : VA1a_Int<41, "vmsumshs", int_ppc_altivec_vmsumshs>;
+def VMSUMUBM : VA1a_Int<36, "vmsumubm", int_ppc_altivec_vmsumubm>;
+def VMSUMUHM : VA1a_Int<38, "vmsumuhm", int_ppc_altivec_vmsumuhm>;
+def VMSUMUHS : VA1a_Int<39, "vmsumuhs", int_ppc_altivec_vmsumuhs>;
+
+def VMULESB : VX1_Int<776, "vmulesb", int_ppc_altivec_vmulesb>;
+def VMULESH : VX1_Int<840, "vmulesh", int_ppc_altivec_vmulesh>;
+def VMULEUB : VX1_Int<520, "vmuleub", int_ppc_altivec_vmuleub>;
+def VMULEUH : VX1_Int<584, "vmuleuh", int_ppc_altivec_vmuleuh>;
+def VMULOSB : VX1_Int<264, "vmulosb", int_ppc_altivec_vmulosb>;
+def VMULOSH : VX1_Int<328, "vmulosh", int_ppc_altivec_vmulosh>;
+def VMULOUB : VX1_Int<  8, "vmuloub", int_ppc_altivec_vmuloub>;
+def VMULOUH : VX1_Int< 72, "vmulouh", int_ppc_altivec_vmulouh>;
+                       
+def VREFP     : VX2_Int<266, "vrefp",     int_ppc_altivec_vrefp>;
+def VRFIM     : VX2_Int<714, "vrfim",     int_ppc_altivec_vrfim>;
+def VRFIN     : VX2_Int<522, "vrfin",     int_ppc_altivec_vrfin>;
+def VRFIP     : VX2_Int<650, "vrfip",     int_ppc_altivec_vrfip>;
+def VRFIZ     : VX2_Int<586, "vrfiz",     int_ppc_altivec_vrfiz>;
+def VRSQRTEFP : VX2_Int<330, "vrsqrtefp", int_ppc_altivec_vrsqrtefp>;
+
+def VSUBCUW : VX1_Int<74, "vsubcuw", int_ppc_altivec_vsubcuw>;
+
+def VSUBFP  : VXForm_1<74, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                      "vsubfp $vD, $vA, $vB", VecGeneral,
+                      [(set VRRC:$vD, (fsub VRRC:$vA, VRRC:$vB))]>;
+def VSUBUBM : VXForm_1<1024, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                      "vsububm $vD, $vA, $vB", VecGeneral,
+                      [(set VRRC:$vD, (sub (v16i8 VRRC:$vA), VRRC:$vB))]>;
+def VSUBUHM : VXForm_1<1088, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                      "vsubuhm $vD, $vA, $vB", VecGeneral,
+                      [(set VRRC:$vD, (sub (v8i16 VRRC:$vA), VRRC:$vB))]>;
+def VSUBUWM : VXForm_1<1152, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                      "vsubuwm $vD, $vA, $vB", VecGeneral,
+                      [(set VRRC:$vD, (sub (v4i32 VRRC:$vA), VRRC:$vB))]>;
+                      
+def VSUBSBS : VX1_Int<1792, "vsubsbs" , int_ppc_altivec_vsubsbs>;
+def VSUBSHS : VX1_Int<1856, "vsubshs" , int_ppc_altivec_vsubshs>;
+def VSUBSWS : VX1_Int<1920, "vsubsws" , int_ppc_altivec_vsubsws>;
+def VSUBUBS : VX1_Int<1536, "vsububs" , int_ppc_altivec_vsububs>;
+def VSUBUHS : VX1_Int<1600, "vsubuhs" , int_ppc_altivec_vsubuhs>;
+def VSUBUWS : VX1_Int<1664, "vsubuws" , int_ppc_altivec_vsubuws>;
+def VSUMSWS : VX1_Int<1928, "vsumsws" , int_ppc_altivec_vsumsws>;
+def VSUM2SWS: VX1_Int<1672, "vsum2sws", int_ppc_altivec_vsum2sws>;
+def VSUM4SBS: VX1_Int<1800, "vsum4sbs", int_ppc_altivec_vsum4sbs>;
+def VSUM4SHS: VX1_Int<1608, "vsum4shs", int_ppc_altivec_vsum4shs>;
+def VSUM4UBS: VX1_Int<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs>;
+
+def VNOR : VXForm_1<1284, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                    "vnor $vD, $vA, $vB", VecFP,
+                    [(set VRRC:$vD, (vnot (or (v4i32 VRRC:$vA), VRRC:$vB)))]>;
+def VOR : VXForm_1<1156, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                      "vor $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (or (v4i32 VRRC:$vA), VRRC:$vB))]>;
+def VXOR : VXForm_1<1220, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                      "vxor $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (xor (v4i32 VRRC:$vA), VRRC:$vB))]>;
+
+def VRLB   : VX1_Int<   4, "vrlb", int_ppc_altivec_vrlb>;
+def VRLH   : VX1_Int<  68, "vrlh", int_ppc_altivec_vrlh>;
+def VRLW   : VX1_Int< 132, "vrlw", int_ppc_altivec_vrlw>;
+
+def VSL    : VX1_Int< 452, "vsl" , int_ppc_altivec_vsl >;
+def VSLO   : VX1_Int<1036, "vslo", int_ppc_altivec_vslo>;
+def VSLB   : VX1_Int< 260, "vslb", int_ppc_altivec_vslb>;
+def VSLH   : VX1_Int< 324, "vslh", int_ppc_altivec_vslh>;
+def VSLW   : VX1_Int< 388, "vslw", int_ppc_altivec_vslw>;
+
+def VSPLTB : VXForm_1<524, (ops VRRC:$vD, u5imm:$UIMM, VRRC:$vB),
+                      "vspltb $vD, $vB, $UIMM", VecPerm,
+                      [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vB), (undef),
+                                      VSPLTB_shuffle_mask:$UIMM))]>;
+def VSPLTH : VXForm_1<588, (ops VRRC:$vD, u5imm:$UIMM, VRRC:$vB),
+                      "vsplth $vD, $vB, $UIMM", VecPerm,
+                      [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vB), (undef),
+                                      VSPLTH_shuffle_mask:$UIMM))]>;
+def VSPLTW : VXForm_1<652, (ops VRRC:$vD, u5imm:$UIMM, VRRC:$vB),
+                      "vspltw $vD, $vB, $UIMM", VecPerm,
+                      [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vB), (undef),
+                                      VSPLTW_shuffle_mask:$UIMM))]>;
+
+def VSR    : VX1_Int< 708, "vsr"  , int_ppc_altivec_vsr>;
+def VSRO   : VX1_Int<1100, "vsro" , int_ppc_altivec_vsro>;
+def VSRAB  : VX1_Int< 772, "vsrab", int_ppc_altivec_vsrab>;
+def VSRAH  : VX1_Int< 836, "vsrah", int_ppc_altivec_vsrah>;
+def VSRAW  : VX1_Int< 900, "vsraw", int_ppc_altivec_vsraw>;
+def VSRB   : VX1_Int< 516, "vsrb" , int_ppc_altivec_vsrb>;
+def VSRH   : VX1_Int< 580, "vsrh" , int_ppc_altivec_vsrh>;
+def VSRW   : VX1_Int< 644, "vsrw" , int_ppc_altivec_vsrw>;
+
+
+def VSPLTISB : VXForm_3<780, (ops VRRC:$vD, s5imm:$SIMM),
+                       "vspltisb $vD, $SIMM", VecPerm,
+                       [(set VRRC:$vD, (v16i8 vecspltisb:$SIMM))]>;
+def VSPLTISH : VXForm_3<844, (ops VRRC:$vD, s5imm:$SIMM),
+                       "vspltish $vD, $SIMM", VecPerm,
+                       [(set VRRC:$vD, (v8i16 vecspltish:$SIMM))]>;
+def VSPLTISW : VXForm_3<908, (ops VRRC:$vD, s5imm:$SIMM),
+                       "vspltisw $vD, $SIMM", VecPerm,
+                       [(set VRRC:$vD, (v4i32 vecspltisw:$SIMM))]>;
+
+// Vector Pack.
+def VPKPX   : VX1_Int<782, "vpkpx", int_ppc_altivec_vpkpx>;
+def VPKSHSS : VX1_Int<398, "vpkshss", int_ppc_altivec_vpkshss>;
+def VPKSHUS : VX1_Int<270, "vpkshus", int_ppc_altivec_vpkshus>;
+def VPKSWSS : VX1_Int<462, "vpkswss", int_ppc_altivec_vpkswss>;
+def VPKSWUS : VX1_Int<334, "vpkswus", int_ppc_altivec_vpkswus>;
+def VPKUHUM : VXForm_1<14, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                       "vpkuhum $vD, $vA, $vB", VecFP,
+                       [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vA),
+                                             VRRC:$vB, VPKUHUM_shuffle_mask))]>;
+def VPKUHUS : VX1_Int<142, "vpkuhus", int_ppc_altivec_vpkuhus>;
+def VPKUWUM : VXForm_1<78, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB),
+                       "vpkuwum $vD, $vA, $vB", VecFP,
+                       [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vA),
+                                             VRRC:$vB, VPKUWUM_shuffle_mask))]>;
+def VPKUWUS : VX1_Int<206, "vpkuwus", int_ppc_altivec_vpkuwus>;
+
+// Vector Unpack.
+def VUPKHPX : VX2_Int<846, "vupkhpx", int_ppc_altivec_vupkhpx>;
+def VUPKHSB : VX2_Int<526, "vupkhsb", int_ppc_altivec_vupkhsb>;
+def VUPKHSH : VX2_Int<590, "vupkhsh", int_ppc_altivec_vupkhsh>;
+def VUPKLPX : VX2_Int<974, "vupklpx", int_ppc_altivec_vupklpx>;
+def VUPKLSB : VX2_Int<654, "vupklsb", int_ppc_altivec_vupklsb>;
+def VUPKLSH : VX2_Int<718, "vupklsh", int_ppc_altivec_vupklsh>;
+
+
+// Altivec Comparisons.
+
+class VCMP<bits<10> xo, string asmstr, ValueType Ty>
+  : VXRForm_1<xo, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), asmstr, VecFPCompare,
+              [(set VRRC:$vD, (Ty (PPCvcmp VRRC:$vA, VRRC:$vB, xo)))]>;
+class VCMPo<bits<10> xo, string asmstr, ValueType Ty>
+  : VXRForm_1<xo, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), asmstr, VecFPCompare,
+              [(set VRRC:$vD, (Ty (PPCvcmp_o VRRC:$vA, VRRC:$vB, xo)))]> {
+  let Defs = [CR6];
+  let RC = 1;
+}
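+// The record ('.') forms (RC = 1) additionally set CR field 6, so the result
+// of the comparison can be tested with a conditional branch.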
+
+// f32 element comparisons.
+def VCMPBFP   : VCMP <966, "vcmpbfp $vD, $vA, $vB"  , v4f32>;
+def VCMPBFPo  : VCMPo<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>;
+def VCMPEQFP  : VCMP <198, "vcmpeqfp $vD, $vA, $vB" , v4f32>;
+def VCMPEQFPo : VCMPo<198, "vcmpeqfp. $vD, $vA, $vB", v4f32>;
+def VCMPGEFP  : VCMP <454, "vcmpgefp $vD, $vA, $vB" , v4f32>;
+def VCMPGEFPo : VCMPo<454, "vcmpgefp. $vD, $vA, $vB", v4f32>;
+def VCMPGTFP  : VCMP <710, "vcmpgtfp $vD, $vA, $vB" , v4f32>;
+def VCMPGTFPo : VCMPo<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>;
+
+// i8 element comparisons.
+def VCMPEQUB  : VCMP <  6, "vcmpequb $vD, $vA, $vB" , v16i8>;
+def VCMPEQUBo : VCMPo<  6, "vcmpequb. $vD, $vA, $vB", v16i8>;
+def VCMPGTSB  : VCMP <774, "vcmpgtsb $vD, $vA, $vB" , v16i8>;
+def VCMPGTSBo : VCMPo<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>;
+def VCMPGTUB  : VCMP <518, "vcmpgtub $vD, $vA, $vB" , v16i8>;
+def VCMPGTUBo : VCMPo<518, "vcmpgtub. $vD, $vA, $vB", v16i8>;
+
+// i16 element comparisons.
+def VCMPEQUH  : VCMP < 70, "vcmpequh $vD, $vA, $vB" , v8i16>;
+def VCMPEQUHo : VCMPo< 70, "vcmpequh. $vD, $vA, $vB", v8i16>;
+def VCMPGTSH  : VCMP <838, "vcmpgtsh $vD, $vA, $vB" , v8i16>;
+def VCMPGTSHo : VCMPo<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>;
+def VCMPGTUH  : VCMP <582, "vcmpgtuh $vD, $vA, $vB" , v8i16>;
+def VCMPGTUHo : VCMPo<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>;
+
+// i32 element comparisons.
+def VCMPEQUW  : VCMP <134, "vcmpequw $vD, $vA, $vB" , v4i32>;
+def VCMPEQUWo : VCMPo<134, "vcmpequw. $vD, $vA, $vB", v4i32>;
+def VCMPGTSW  : VCMP <902, "vcmpgtsw $vD, $vA, $vB" , v4i32>;
+def VCMPGTSWo : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>;
+def VCMPGTUW  : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>;
+def VCMPGTUWo : VCMPo<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>;
+                      
+def V_SET0 : VXForm_setzero<1220, (ops VRRC:$vD),
+                      "vxor $vD, $vD, $vD", VecFP,
+                      [(set VRRC:$vD, (v4i32 immAllZerosV))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Additional Altivec Patterns
+//
+
+// DS* intrinsics.
+def : Pat<(int_ppc_altivec_dss imm:$STRM), (DSS 0, imm:$STRM, 0, 0)>;
+def : Pat<(int_ppc_altivec_dssall), (DSS 1, 0, 0, 0)>;
+def : Pat<(int_ppc_altivec_dst GPRC:$rA, GPRC:$rB, imm:$STRM),
+          (DST 0, imm:$STRM, GPRC:$rA, GPRC:$rB)>;
+def : Pat<(int_ppc_altivec_dstt GPRC:$rA, GPRC:$rB, imm:$STRM),
+          (DST 1, imm:$STRM, GPRC:$rA, GPRC:$rB)>;
+def : Pat<(int_ppc_altivec_dstst GPRC:$rA, GPRC:$rB, imm:$STRM),
+          (DSTST 0, imm:$STRM, GPRC:$rA, GPRC:$rB)>;
+def : Pat<(int_ppc_altivec_dststt GPRC:$rA, GPRC:$rB, imm:$STRM),
+          (DSTST 1, imm:$STRM, GPRC:$rA, GPRC:$rB)>;
+
+// Undef.
+def : Pat<(v16i8 (undef)), (IMPLICIT_DEF_VRRC)>;
+def : Pat<(v8i16 (undef)), (IMPLICIT_DEF_VRRC)>;
+def : Pat<(v4f32 (undef)), (IMPLICIT_DEF_VRRC)>;
+
+// Loads.
+def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>;
+
+// Stores.
+def : Pat<(store (v4i32 VRRC:$rS), xoaddr:$dst),
+          (STVX (v4i32 VRRC:$rS), xoaddr:$dst)>;
+
+// Bit conversions.
+def : Pat<(v16i8 (bitconvert (v8i16 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VRRC:$src))), (v16i8 VRRC:$src)>;
+
+def : Pat<(v8i16 (bitconvert (v16i8 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VRRC:$src))), (v8i16 VRRC:$src)>;
+
+def : Pat<(v4i32 (bitconvert (v16i8 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VRRC:$src))), (v4i32 VRRC:$src)>;
+
+def : Pat<(v4f32 (bitconvert (v16i8 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VRRC:$src))), (v4f32 VRRC:$src)>;
+
+// Shuffles.
+
+// Match vsldoi(x,x), vpkuwum(x,x), vpkuhum(x,x)
+def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef, VSLDOI_unary_shuffle_mask:$in),
+        (VSLDOI VRRC:$vA, VRRC:$vA, VSLDOI_unary_shuffle_mask:$in)>;
+def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef,VPKUWUM_unary_shuffle_mask:$in),
+        (VPKUWUM VRRC:$vA, VRRC:$vA)>;
+def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef,VPKUHUM_unary_shuffle_mask:$in),
+        (VPKUHUM VRRC:$vA, VRRC:$vA)>;
+
+// Match vmrg*(x,x)
+def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef, VMRGLB_unary_shuffle_mask:$in),
+        (VMRGLB VRRC:$vA, VRRC:$vA)>;
+def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef, VMRGLH_unary_shuffle_mask:$in),
+        (VMRGLH VRRC:$vA, VRRC:$vA)>;
+def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef, VMRGLW_unary_shuffle_mask:$in),
+        (VMRGLW VRRC:$vA, VRRC:$vA)>;
+def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef, VMRGHB_unary_shuffle_mask:$in),
+        (VMRGHB VRRC:$vA, VRRC:$vA)>;
+def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef, VMRGHH_unary_shuffle_mask:$in),
+        (VMRGHH VRRC:$vA, VRRC:$vA)>;
+def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef, VMRGHW_unary_shuffle_mask:$in),
+        (VMRGHW VRRC:$vA, VRRC:$vA)>;
+
+// Logical Operations
+def : Pat<(v4i32 (vnot VRRC:$vA)),      (VNOR VRRC:$vA, VRRC:$vA)>;
+def : Pat<(v4i32 (vnot_conv VRRC:$vA)), (VNOR VRRC:$vA, VRRC:$vA)>;
+
+def : Pat<(v4i32 (vnot_conv (or VRRC:$A, VRRC:$B))),
+          (VNOR VRRC:$A, VRRC:$B)>;
+def : Pat<(v4i32 (and VRRC:$A, (vnot_conv VRRC:$B))),
+          (VANDC VRRC:$A, VRRC:$B)>;
+
+def : Pat<(fmul VRRC:$vA, VRRC:$vB),
+          (VMADDFP VRRC:$vA, VRRC:$vB, (v4i32 (V_SET0)))>; 
+
+// Fused multiply add and multiply sub for packed float.  These are represented
+// separately from the real instructions above, for operations that must have
+// the additional precision, such as Newton-Raphson (used by divide, sqrt).
+def : Pat<(PPCvmaddfp VRRC:$A, VRRC:$B, VRRC:$C),
+          (VMADDFP VRRC:$A, VRRC:$B, VRRC:$C)>;
+def : Pat<(PPCvnmsubfp VRRC:$A, VRRC:$B, VRRC:$C),
+          (VNMSUBFP VRRC:$A, VRRC:$B, VRRC:$C)>;
+
+def : Pat<(int_ppc_altivec_vmaddfp VRRC:$A, VRRC:$B, VRRC:$C),
+          (VMADDFP VRRC:$A, VRRC:$B, VRRC:$C)>;
+def : Pat<(int_ppc_altivec_vnmsubfp VRRC:$A, VRRC:$B, VRRC:$C),
+          (VNMSUBFP VRRC:$A, VRRC:$B, VRRC:$C)>;
+
+def : Pat<(PPCvperm (v16i8 VRRC:$vA), VRRC:$vB, VRRC:$vC),
+          (VPERM VRRC:$vA, VRRC:$vB, VRRC:$vC)>;
diff --git a/lib/Target/PowerPC/PPCInstrBuilder.h b/lib/Target/PowerPC/PPCInstrBuilder.h
new file mode 100644
index 0000000..3861918
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrBuilder.h
@@ -0,0 +1,55 @@
+//===-- PPCInstrBuilder.h - Aids for building PPC insts ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to simplify generating frame and constant pool
+// references.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Dest Reg, Base Reg, and either Reg Index or Immediate
+// Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_INSTRBUILDER_H
+#define POWERPC_INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function.  This
+/// reference keeps the FrameIndex as its base register until it is resolved,
+/// and an optional constant offset may be specified as well.
+///
+inline const MachineInstrBuilder&
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0,
+                  bool mem = true) {
+  if (mem)
+    return MIB.addImm(Offset).addFrameIndex(FI);
+  else
+    return MIB.addFrameIndex(FI).addImm(Offset);
+}
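+// A typical use might look like (sketch; the opcode, registers, and exact
+// BuildMI overload here are illustrative only):
+//   addFrameReference(BuildMI(MBB, MI, TII.get(PPC::LWZ), DestReg), FrameIdx);
+// which appends the offset and frame index operands expected by a D-form load.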
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool.  The
+/// reference uses the ConstantPoolIndex as its base register, which is
+/// retained until either machine code emission or assembly output.  An
+/// optional constant offset may be added as well.
+///
+inline const MachineInstrBuilder&
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+                         int Offset = 0) {
+  return MIB.addImm(Offset).addConstantPoolIndex(CPI);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
new file mode 100644
index 0000000..6a4a59b
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -0,0 +1,800 @@
+//===- PPCInstrFormats.td - PowerPC Instruction Formats -----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// PowerPC instruction formats
+
+class I<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin>
+        : Instruction {
+  field bits<32> Inst;
+
+  bit PPC64 = 0;  // Default value, override with isPPC64
+
+  let Name = "";
+  let Namespace = "PPC";
+  let Inst{0-5} = opcode;
+  let OperandList = OL;
+  let AsmString = asmstr;
+  let Itinerary = itin;
+  
+  /// These fields correspond to the fields in PPCInstrInfo.h.  Any changes to
+  /// these must be reflected there!  See comments there for what these are.
+  bits<1> PPC970_First = 0;
+  bits<1> PPC970_Single = 0;
+  bits<1> PPC970_Cracked = 0;
+  bits<3> PPC970_Unit = 0;
+}
+
+class PPC970_DGroup_First   { bits<1> PPC970_First = 1;  }
+class PPC970_DGroup_Single  { bits<1> PPC970_Single = 1; }
+class PPC970_DGroup_Cracked { bits<1> PPC970_Cracked = 1; }
+class PPC970_MicroCode;
+
+class PPC970_Unit_Pseudo   { bits<3> PPC970_Unit = 0;   }
+class PPC970_Unit_FXU      { bits<3> PPC970_Unit = 1;   }
+class PPC970_Unit_LSU      { bits<3> PPC970_Unit = 2;   }
+class PPC970_Unit_FPU      { bits<3> PPC970_Unit = 3;   }
+class PPC970_Unit_CRU      { bits<3> PPC970_Unit = 4;   }
+class PPC970_Unit_VALU     { bits<3> PPC970_Unit = 5;   }
+class PPC970_Unit_VPERM    { bits<3> PPC970_Unit = 6;   }
+class PPC970_Unit_BRU      { bits<3> PPC970_Unit = 7;   }
+
+
+// 1.7.1 I-Form
+class IForm<bits<6> opcode, bit aa, bit lk, dag OL, string asmstr,
+            InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OL, asmstr, itin> {
+  let Pattern = pattern;
+  bits<24> LI;
+
+  let Inst{6-29}  = LI;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+// 1.7.2 B-Form
+class BForm<bits<6> opcode, bit aa, bit lk, dag OL, string asmstr>
+  : I<opcode, OL, asmstr, BrB> {
+  bits<7> BIBO;  // 2 bits of BI and 5 bits of BO.
+  bits<3>  CR;
+  bits<14> BD;
+
+  bits<5> BI;
+  let BI{0-1} = BIBO{5-6};
+  let BI{2-4} = CR{0-2};
+
+  let Inst{6-10}  = BIBO{4-0};
+  let Inst{11-15} = BI;
+  let Inst{16-29} = BD;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+
+// 1.7.4 D-Form
+class DForm_base<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin,
+                 list<dag> pattern> 
+  : I<opcode, OL, asmstr, itin> {
+  bits<5>  A;
+  bits<5>  B;
+  bits<16> C;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = B;
+  let Inst{16-31} = C;
+}
+
+class DForm_1<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin,
+              list<dag> pattern>
+  : I<opcode, OL, asmstr, itin> {
+  bits<5>  A;
+  bits<16> C;
+  bits<5>  B;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = B;
+  let Inst{16-31} = C;
+}
+
+class DForm_2<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin,
+              list<dag> pattern>
+  : DForm_base<opcode, OL, asmstr, itin, pattern>;
+
+class DForm_2_r0<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin,
+                 list<dag> pattern>
+  : I<opcode, OL, asmstr, itin> {
+  bits<5>  A;
+  bits<16> B;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = 0;
+  let Inst{16-31} = B;
+}
+
+class DForm_4<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin,
+              list<dag> pattern>
+  : I<opcode, OL, asmstr, itin> {
+  bits<5>  B;
+  bits<5>  A;
+  bits<16> C;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = B;
+  let Inst{16-31} = C;
+}
+              
+class DForm_4_zero<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin,
+                   list<dag> pattern>
+  : DForm_1<opcode, OL, asmstr, itin, pattern> {
+  let A = 0;
+  let B = 0;
+  let C = 0;
+}
+
+class DForm_5<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin>
+  : I<opcode, OL, asmstr, itin> {
+  bits<3>  BF;
+  bits<1>  L;
+  bits<5>  RA;
+  bits<16> I;
+
+  let Inst{6-8}   = BF;
+  let Inst{9}     = 0;
+  let Inst{10}    = L;
+  let Inst{11-15} = RA;
+  let Inst{16-31} = I;
+}
+
+class DForm_5_ext<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin>
+  : DForm_5<opcode, OL, asmstr, itin> {
+  let L = PPC64;
+}
+
+class DForm_6<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin> 
+  : DForm_5<opcode, OL, asmstr, itin>;
+
+class DForm_6_ext<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin>
+  : DForm_6<opcode, OL, asmstr, itin> {
+  let L = PPC64;
+}
+
+
+// 1.7.5 DS-Form
+class DSForm_1<bits<6> opcode, bits<2> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OL, asmstr, itin> {
+  bits<5>  RST;
+  bits<14> DS;
+  bits<5>  RA;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = RA;
+  let Inst{16-29} = DS;
+  let Inst{30-31} = xo;
+}
+
+// 1.7.6 X-Form
+class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OL, string asmstr, 
+                      InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OL, asmstr, itin> {
+  bits<5> RST;
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+// This is the same as XForm_base_r3xo, but the first two operands are swapped
+// when code is emitted.
+class XForm_base_r3xo_swapped
+        <bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+        InstrItinClass itin> 
+  : I<opcode, OL, asmstr, itin> {
+  bits<5> A;
+  bits<5> RST;
+  bits<5> B;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+
+class XForm_1<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OL, asmstr, itin, pattern>;
+
+class XForm_6<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo_swapped<opcode, xo, OL, asmstr, itin> {
+  let Pattern = pattern;
+}
+
+class XForm_8<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OL, asmstr, itin, pattern>;
+
+class XForm_10<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo_swapped<opcode, xo, OL, asmstr, itin> {
+    let Pattern = pattern;
+}
+
+class XForm_11<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo_swapped<opcode, xo, OL, asmstr, itin> {
+  let B = 0;
+  let Pattern = pattern;
+}
+
+class XForm_16<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OL, asmstr, itin> {
+  bits<3> BF;
+  bits<1> L; 
+  bits<5> RA;
+  bits<5> RB;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9}     = 0;
+  let Inst{10}    = L;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_16_ext<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+                   InstrItinClass itin>
+  : XForm_16<opcode, xo, OL, asmstr, itin> {
+  let L = PPC64;
+}
+
+class XForm_17<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OL, asmstr, itin> {
+  bits<3> BF;
+  bits<5> FRA;
+  bits<5> FRB;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9-10}  = 0;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_25<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OL, asmstr, itin, pattern> {
+}
+
+class XForm_26<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OL, asmstr, itin, pattern> {
+  let A = 0;
+}
+
+class XForm_28<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OL, asmstr, itin, pattern> {
+}
+
+// DCB_Form - Form X instruction, used for dcb* instructions.
+class DCB_Form<bits<10> xo, bits<5> immfield, dag OL, string asmstr, 
+                      InstrItinClass itin, list<dag> pattern>
+  : I<31, OL, asmstr, itin> {
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = immfield;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+
+// DSS_Form - Form X instruction, used for altivec dss* instructions.
+class DSS_Form<bits<10> xo, dag OL, string asmstr, 
+                      InstrItinClass itin, list<dag> pattern>
+  : I<31, OL, asmstr, itin> {
+  bits<1> T;
+  bits<2> STRM;
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  let Inst{6}     = T;
+  let Inst{7-8}   = 0;
+  let Inst{9-10}  = STRM;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+// 1.7.7 XL-Form
+class XLForm_1<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OL, asmstr, itin> {
+  bits<5> CRD;
+  bits<5> CRA;
+  bits<5> CRB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = CRD;
+  let Inst{11-15} = CRA;
+  let Inst{16-20} = CRB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XLForm_1_ext<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OL, asmstr, itin> {
+  bits<5> CRD;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = CRD;
+  let Inst{11-15} = CRD;
+  let Inst{16-20} = CRD;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XLForm_2<bits<6> opcode, bits<10> xo, bit lk, dag OL, string asmstr, 
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OL, asmstr, itin> {
+  bits<5> BO;
+  bits<5> BI;
+  bits<2> BH;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = BO;
+  let Inst{11-15} = BI;
+  let Inst{16-18} = 0;
+  let Inst{19-20} = BH;
+  let Inst{21-30} = xo;
+  let Inst{31}    = lk;
+}
+
+class XLForm_2_br<bits<6> opcode, bits<10> xo, bit lk,
+                  dag OL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XLForm_2<opcode, xo, lk, OL, asmstr, itin, pattern> {
+  bits<7> BIBO;  // 2 bits of BI and 5 bits of BO.
+  bits<3>  CR;
+  
+  let BO = BIBO{2-6};
+  let BI{0-1} = BIBO{0-1};
+  let BI{2-4} = CR;
+  let BH = 0;
+}
+
+
+class XLForm_2_ext<bits<6> opcode, bits<10> xo, bits<5> bo,  bits<5> bi, bit lk,
+                  dag OL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XLForm_2<opcode, xo, lk, OL, asmstr, itin, pattern> {
+  let BO = bo;
+  let BI = bi;
+  let BH = 0;
+}
+
+class XLForm_3<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OL, asmstr, itin> {
+  bits<3> BF;
+  bits<3> BFA;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9-10}  = 0;
+  let Inst{11-13} = BFA;
+  let Inst{14-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+// 1.7.8 XFX-Form
+class XFXForm_1<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OL, asmstr, itin> {
+  bits<5>  RT;
+  bits<10> SPR;
+
+  let Inst{6-10}  = RT;
+  let Inst{11}    = SPR{4};
+  let Inst{12}    = SPR{3};
+  let Inst{13}    = SPR{2};
+  let Inst{14}    = SPR{1};
+  let Inst{15}    = SPR{0};
+  let Inst{16}    = SPR{9};
+  let Inst{17}    = SPR{8};
+  let Inst{18}    = SPR{7};
+  let Inst{19}    = SPR{6};
+  let Inst{20}    = SPR{5};
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_1_ext<bits<6> opcode, bits<10> xo, bits<10> spr, 
+                   dag OL, string asmstr, InstrItinClass itin> 
+  : XFXForm_1<opcode, xo, OL, asmstr, itin> {
+  let SPR = spr;
+}
+
+class XFXForm_3<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OL, asmstr, itin> {
+  bits<5>  RT;
+   
+  let Inst{6-10}  = RT;
+  let Inst{11-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_5<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+                InstrItinClass itin> 
+  : I<opcode, OL, asmstr, itin> {
+  bits<8>  FXM;
+  bits<5>  ST;
+   
+  let Inst{6-10}  = ST;
+  let Inst{11}    = 0;
+  let Inst{12-19} = FXM;
+  let Inst{20}    = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_5a<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+                 InstrItinClass itin> 
+  : I<opcode, OL, asmstr, itin> {
+  bits<5>  ST;
+  bits<8>  FXM;
+   
+  let Inst{6-10}  = ST;
+  let Inst{11}    = 1;
+  let Inst{12-19} = FXM;
+  let Inst{20}    = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_7<bits<6> opcode, bits<10> xo, dag OL, string asmstr,
+                InstrItinClass itin>
+  : XFXForm_1<opcode, xo, OL, asmstr, itin>;
+
+class XFXForm_7_ext<bits<6> opcode, bits<10> xo, bits<10> spr, 
+                    dag OL, string asmstr, InstrItinClass itin> 
+  : XFXForm_7<opcode, xo, OL, asmstr, itin> {
+  let SPR = spr;
+}
+
+// 1.7.10 XS-Form - SRADI.
+class XSForm_1<bits<6> opcode, bits<9> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OL, asmstr, itin> {
+  bits<5> A;
+  bits<5> RS;
+  bits<6> SH;
+
+  bit RC = 0;    // set by isDOT
+  let Pattern = pattern;
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = A;
+  let Inst{16-20} = SH{4,3,2,1,0};
+  let Inst{21-29} = xo;
+  let Inst{30}    = SH{5};
+  let Inst{31}    = RC;
+}
+
+// 1.7.11 XO-Form
+class XOForm_1<bits<6> opcode, bits<9> xo, bit oe, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OL, asmstr, itin> {
+  bits<5> RT;
+  bits<5> RA;
+  bits<5> RB;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RT;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21}    = oe;
+  let Inst{22-30} = xo;
+  let Inst{31}    = RC;  
+}
+
+class XOForm_3<bits<6> opcode, bits<9> xo, bit oe, 
+               dag OL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XOForm_1<opcode, xo, oe, OL, asmstr, itin, pattern> {
+  let RB = 0;
+}
+
+// 1.7.12 A-Form
+class AForm_1<bits<6> opcode, bits<5> xo, dag OL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OL, asmstr, itin> {
+  bits<5> FRT;
+  bits<5> FRA;
+  bits<5> FRC;
+  bits<5> FRB;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = FRT;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-25} = FRC;
+  let Inst{26-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class AForm_2<bits<6> opcode, bits<5> xo, dag OL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : AForm_1<opcode, xo, OL, asmstr, itin, pattern> {
+  let FRC = 0;
+}
+
+class AForm_3<bits<6> opcode, bits<5> xo, dag OL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : AForm_1<opcode, xo, OL, asmstr, itin, pattern> {
+  let FRB = 0;
+}
+
+// 1.7.13 M-Form
+class MForm_1<bits<6> opcode, dag OL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OL, asmstr, itin> {
+  bits<5> RA;
+  bits<5> RS;
+  bits<5> RB;
+  bits<5> MB;
+  bits<5> ME;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-25} = MB;
+  let Inst{26-30} = ME;
+  let Inst{31}    = RC;
+}
+
+class MForm_2<bits<6> opcode, dag OL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : MForm_1<opcode, OL, asmstr, itin, pattern> {
+}
+
+// 1.7.14 MD-Form
+class MDForm_1<bits<6> opcode, bits<3> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OL, asmstr, itin> {
+  bits<5> RA;
+  bits<5> RS;
+  bits<6> SH;
+  bits<6> MBE;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = SH{4,3,2,1,0};
+  let Inst{21-26} = MBE{4,3,2,1,0,5};
+  let Inst{27-29} = xo;
+  let Inst{30}    = SH{5};
+  let Inst{31}    = RC;
+}
+
+
+
+// E-1 VA-Form
+
+// VAForm_1 - DACB ordering.
+class VAForm_1<bits<6> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VC;
+  bits<5> VB;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21-25} = VC;
+  let Inst{26-31} = xo;
+}
+
+// VAForm_1a - DABC ordering.
+class VAForm_1a<bits<6> xo, dag OL, string asmstr,
+                InstrItinClass itin, list<dag> pattern>
+    : I<4, OL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  bits<5> VC;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21-25} = VC;
+  let Inst{26-31} = xo;
+}
+
+class VAForm_2<bits<6> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  bits<4> SH;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21}    = 0;
+  let Inst{22-25} = SH;
+  let Inst{26-31} = xo;
+}
+
+// E-2 VX-Form
+class VXForm_1<bits<11> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+class VXForm_setzero<bits<11> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : VXForm_1<xo, OL, asmstr, itin, pattern> {
+  let VA = VD;
+  let VB = VD;
+}
+
+
+class VXForm_2<bits<11> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+class VXForm_3<bits<11> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> IMM;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = IMM;
+  let Inst{16-20} = 0;
+  let Inst{21-31} = xo;
+}
+
+/// VXForm_4 - VX instructions with "VD,0,0" register fields, like mfvscr.
+class VXForm_4<bits<11> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OL, asmstr, itin> {
+  bits<5> VD;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-31} = xo;
+}
+
+/// VXForm_5 - VX instructions with "0,0,VB" register fields, like mtvscr.
+class VXForm_5<bits<11> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OL, asmstr, itin> {
+  bits<5> VB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = 0;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+// E-4 VXR-Form
+class VXRForm_1<bits<10> xo, dag OL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  bit RC = 0;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21}    = RC;
+  let Inst{22-31} = xo;
+}
+
+//===----------------------------------------------------------------------===//
+class Pseudo<dag OL, string asmstr, list<dag> pattern>
+    : I<0, OL, asmstr, NoItinerary> {
+  let PPC64 = 0;
+  let Pattern = pattern;
+  let Inst{31-0} = 0;
+}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
new file mode 100644
index 0000000..d7ee5ed
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -0,0 +1,303 @@
+//===- PPCInstrInfo.cpp - PowerPC32 Instruction Information -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "PPCPredicates.h"
+#include "PPCGenInstrInfo.inc"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+using namespace llvm;
+
+PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm)
+  : TargetInstrInfo(PPCInsts, sizeof(PPCInsts)/sizeof(PPCInsts[0])), TM(tm),
+    RI(*TM.getSubtargetImpl(), *this) {}
+
+/// getPointerRegClass - Return the register class to use to hold pointers.
+/// This is used for addressing modes.
+const TargetRegisterClass *PPCInstrInfo::getPointerRegClass() const {
+  if (TM.getSubtargetImpl()->isPPC64())
+    return &PPC::G8RCRegClass;
+  else
+    return &PPC::GPRCRegClass;
+}
+
+
+bool PPCInstrInfo::isMoveInstr(const MachineInstr& MI,
+                               unsigned& sourceReg,
+                               unsigned& destReg) const {
+  MachineOpCode oc = MI.getOpcode();
+  if (oc == PPC::OR || oc == PPC::OR8 || oc == PPC::VOR ||
+      oc == PPC::OR4To8 || oc == PPC::OR8To4) {                // or r1, r2, r2
+    assert(MI.getNumOperands() >= 3 &&
+           MI.getOperand(0).isRegister() &&
+           MI.getOperand(1).isRegister() &&
+           MI.getOperand(2).isRegister() &&
+           "invalid PPC OR instruction!");
+    if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
+      sourceReg = MI.getOperand(1).getReg();
+      destReg = MI.getOperand(0).getReg();
+      return true;
+    }
+  } else if (oc == PPC::ADDI) {             // addi r1, r2, 0
+    assert(MI.getNumOperands() >= 3 &&
+           MI.getOperand(0).isRegister() &&
+           MI.getOperand(2).isImmediate() &&
+           "invalid PPC ADDI instruction!");
+    if (MI.getOperand(1).isRegister() && MI.getOperand(2).getImmedValue()==0) {
+      sourceReg = MI.getOperand(1).getReg();
+      destReg = MI.getOperand(0).getReg();
+      return true;
+    }
+  } else if (oc == PPC::ORI) {             // ori r1, r2, 0
+    assert(MI.getNumOperands() >= 3 &&
+           MI.getOperand(0).isRegister() &&
+           MI.getOperand(1).isRegister() &&
+           MI.getOperand(2).isImmediate() &&
+           "invalid PPC ORI instruction!");
+    if (MI.getOperand(2).getImmedValue()==0) {
+      sourceReg = MI.getOperand(1).getReg();
+      destReg = MI.getOperand(0).getReg();
+      return true;
+    }
+  } else if (oc == PPC::FMRS || oc == PPC::FMRD ||
+             oc == PPC::FMRSD) {      // fmr r1, r2
+    assert(MI.getNumOperands() >= 2 &&
+           MI.getOperand(0).isRegister() &&
+           MI.getOperand(1).isRegister() &&
+           "invalid PPC FMR instruction");
+    sourceReg = MI.getOperand(1).getReg();
+    destReg = MI.getOperand(0).getReg();
+    return true;
+  } else if (oc == PPC::MCRF) {             // mcrf cr1, cr2
+    assert(MI.getNumOperands() >= 2 &&
+           MI.getOperand(0).isRegister() &&
+           MI.getOperand(1).isRegister() &&
+           "invalid PPC MCRF instruction");
+    sourceReg = MI.getOperand(1).getReg();
+    destReg = MI.getOperand(0).getReg();
+    return true;
+  }
+  return false;
+}
+
+unsigned PPCInstrInfo::isLoadFromStackSlot(MachineInstr *MI, 
+                                           int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case PPC::LD:
+  case PPC::LWZ:
+  case PPC::LFS:
+  case PPC::LFD:
+    if (MI->getOperand(1).isImmediate() && !MI->getOperand(1).getImmedValue() &&
+        MI->getOperand(2).isFrameIndex()) {
+      FrameIndex = MI->getOperand(2).getFrameIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+unsigned PPCInstrInfo::isStoreToStackSlot(MachineInstr *MI, 
+                                          int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case PPC::STD:
+  case PPC::STW:
+  case PPC::STFS:
+  case PPC::STFD:
+    if (MI->getOperand(1).isImmediate() && !MI->getOperand(1).getImmedValue() &&
+        MI->getOperand(2).isFrameIndex()) {
+      FrameIndex = MI->getOperand(2).getFrameIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+// commuteInstruction - We can commute rlwimi instructions, but only if the
+// rotate amt is zero.  We also have to munge the immediates a bit.
+MachineInstr *PPCInstrInfo::commuteInstruction(MachineInstr *MI) const {
+  // Normal instructions can be commuted the obvious way.
+  if (MI->getOpcode() != PPC::RLWIMI)
+    return TargetInstrInfo::commuteInstruction(MI);
+  
+  // Cannot commute if it has a non-zero rotate count.
+  if (MI->getOperand(3).getImmedValue() != 0)
+    return 0;
+  
+  // If we have a zero rotate count, we have:
+  //   M = mask(MB,ME)
+  //   Op0 = (Op1 & ~M) | (Op2 & M)
+  // Change this to:
+  //   M = mask((ME+1)&31, (MB-1)&31)
+  //   Op0 = (Op2 & ~M) | (Op1 & M)
+
+  // Swap op1/op2
+  unsigned Reg1 = MI->getOperand(1).getReg();
+  unsigned Reg2 = MI->getOperand(2).getReg();
+  bool Reg1IsKill = MI->getOperand(1).isKill();
+  bool Reg2IsKill = MI->getOperand(2).isKill();
+  MI->getOperand(2).setReg(Reg1);
+  MI->getOperand(1).setReg(Reg2);
+  if (Reg1IsKill)
+    MI->getOperand(2).setIsKill();
+  else
+    MI->getOperand(2).unsetIsKill();
+  if (Reg2IsKill)
+    MI->getOperand(1).setIsKill();
+  else
+    MI->getOperand(1).unsetIsKill();
+  
+  // Swap the mask around.
+  unsigned MB = MI->getOperand(4).getImmedValue();
+  unsigned ME = MI->getOperand(5).getImmedValue();
+  MI->getOperand(4).setImmedValue((ME+1) & 31);
+  MI->getOperand(5).setImmedValue((MB-1) & 31);
+  return MI;
+}
+
+void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, 
+                              MachineBasicBlock::iterator MI) const {
+  BuildMI(MBB, MI, get(PPC::NOP));
+}
+
+
+// Branch analysis.
+bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 std::vector<MachineOperand> &Cond) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+  
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (LastInst->getOpcode() == PPC::B) {
+      TBB = LastInst->getOperand(0).getMachineBasicBlock();
+      return false;
+    } else if (LastInst->getOpcode() == PPC::BCC) {
+      // Block ends with fall-through condbranch.
+      TBB = LastInst->getOperand(2).getMachineBasicBlock();
+      Cond.push_back(LastInst->getOperand(0));
+      Cond.push_back(LastInst->getOperand(1));
+      return false;
+    }
+    // Otherwise, don't know what this is.
+    return true;
+  }
+  
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() &&
+      isUnpredicatedTerminator(--I))
+    return true;
+  
+  // If the block ends with PPC::B and PPC::BCC, handle it.
+  if (SecondLastInst->getOpcode() == PPC::BCC && 
+      LastInst->getOpcode() == PPC::B) {
+    TBB =  SecondLastInst->getOperand(2).getMachineBasicBlock();
+    Cond.push_back(SecondLastInst->getOperand(0));
+    Cond.push_back(SecondLastInst->getOperand(1));
+    FBB = LastInst->getOperand(0).getMachineBasicBlock();
+    return false;
+  }
+  
+  // If the block ends with two PPC::Bs, handle it.  The second one is not
+  // executed, so remove it.
+  if (SecondLastInst->getOpcode() == PPC::B && 
+      LastInst->getOpcode() == PPC::B) {
+    TBB = SecondLastInst->getOperand(0).getMachineBasicBlock();
+    I = LastInst;
+    I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
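+
+// Note (illustrative, not part of the original patch): for a conditional
+// branch the Cond vector produced above holds two entries, Cond[0] being the
+// PPC::Predicate immediate and Cond[1] the CR register; InsertBranch below
+// feeds them straight back into a BCC via addImm()/addReg().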
+
+unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC)
+    return 0;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (I->getOpcode() != PPC::BCC)
+    return 1;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+unsigned
+PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                           MachineBasicBlock *FBB,
+                           const std::vector<MachineOperand> &Cond) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) && 
+         "PPC branch conditions have two components!");
+  
+  // One-way branch.
+  if (FBB == 0) {
+    if (Cond.empty())   // Unconditional branch
+      BuildMI(&MBB, get(PPC::B)).addMBB(TBB);
+    else                // Conditional branch
+      BuildMI(&MBB, get(PPC::BCC))
+        .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+    return 1;
+  }
+  
+  // Two-way Conditional Branch.
+  BuildMI(&MBB, get(PPC::BCC))
+    .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+  BuildMI(&MBB, get(PPC::B)).addMBB(FBB);
+  return 2;
+}
+
+bool PPCInstrInfo::BlockHasNoFallThrough(MachineBasicBlock &MBB) const {
+  if (MBB.empty()) return false;
+  
+  switch (MBB.back().getOpcode()) {
+  case PPC::BLR:   // Return.
+  case PPC::B:     // Uncond branch.
+  case PPC::BCTR:  // Indirect branch.
+    return true;
+  default: return false;
+  }
+}
+
+bool PPCInstrInfo::
+ReverseBranchCondition(std::vector<MachineOperand> &Cond) const {
+  assert(Cond.size() == 2 && "Invalid PPC branch opcode!");
+  // Leave the CR# the same, but invert the condition.
+  Cond[0].setImm(PPC::InvertPredicate((PPC::Predicate)Cond[0].getImm()));
+  return false;
+}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
new file mode 100644
index 0000000..498a8e5
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -0,0 +1,112 @@
+//===- PPCInstrInfo.h - PowerPC Instruction Information ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC32_INSTRUCTIONINFO_H
+#define POWERPC32_INSTRUCTIONINFO_H
+
+#include "PPC.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "PPCRegisterInfo.h"
+
+namespace llvm {
+
+/// PPCII - This namespace holds all of the PowerPC target-specific
+/// per-instruction flags.  These must match the corresponding definitions in
+/// PPC.td and PPCInstrFormats.td.
+namespace PPCII {
+enum {
+  // PPC970 Instruction Flags.  These flags describe the characteristics of the
+  // PowerPC 970 (aka G5) dispatch groups and how they are formed out of
+  // raw machine instructions.
+
+  /// PPC970_First - This instruction starts a new dispatch group, so it will
+  /// always be the first one in the group.
+  PPC970_First = 0x1,
+  
+  /// PPC970_Single - This instruction starts a new dispatch group and
+  /// terminates it, so it will be the sole instruction in the group.
+  PPC970_Single = 0x2,
+
+  /// PPC970_Cracked - This instruction is cracked into two pieces, requiring
+  /// two dispatch pipes to be available to issue.
+  PPC970_Cracked = 0x4,
+  
+  /// PPC970_Mask/Shift - This is a bitmask that selects the pipeline type that
+  /// an instruction is issued to.
+  PPC970_Shift = 3,
+  PPC970_Mask = 0x07 << PPC970_Shift
+};
+enum PPC970_Unit {
+  /// These are the various PPC970 execution unit pipelines.  Each instruction
+  /// is one of these.
+  PPC970_Pseudo = 0 << PPC970_Shift,   // Pseudo instruction
+  PPC970_FXU    = 1 << PPC970_Shift,   // Fixed Point (aka Integer/ALU) Unit
+  PPC970_LSU    = 2 << PPC970_Shift,   // Load Store Unit
+  PPC970_FPU    = 3 << PPC970_Shift,   // Floating Point Unit
+  PPC970_CRU    = 4 << PPC970_Shift,   // Control Register Unit
+  PPC970_VALU   = 5 << PPC970_Shift,   // Vector ALU
+  PPC970_VPERM  = 6 << PPC970_Shift,   // Vector Permute Unit
+  PPC970_BRU    = 7 << PPC970_Shift    // Branch Unit
+};
+}
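+
+// Illustrative sketch (not part of the original patch): a scheduler or hazard
+// recognizer can query these flags from an instruction's TSFlags, e.g.
+//   unsigned Unit = TSFlags & PPCII::PPC970_Mask;        // one of PPC970_Unit
+//   bool StartsGroup = (TSFlags & PPCII::PPC970_First) != 0;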
+  
+  
+class PPCInstrInfo : public TargetInstrInfo {
+  PPCTargetMachine &TM;
+  const PPCRegisterInfo RI;
+public:
+  PPCInstrInfo(PPCTargetMachine &TM);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const MRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// getPointerRegClass - Return the register class to use to hold pointers.
+  /// This is used for addressing modes.
+  virtual const TargetRegisterClass *getPointerRegClass() const;  
+
+  // Return true if the instruction is a register to register move and
+  // leave the source and dest operands in the passed parameters.
+  //
+  virtual bool isMoveInstr(const MachineInstr& MI,
+                           unsigned& sourceReg,
+                           unsigned& destReg) const;
+
+  unsigned isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const;
+  unsigned isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const;
+
+  // commuteInstruction - We can commute rlwimi instructions, but only if the
+  // rotate amt is zero.  We also have to munge the immediates a bit.
+  virtual MachineInstr *commuteInstruction(MachineInstr *MI) const;
+  
+  virtual void insertNoop(MachineBasicBlock &MBB, 
+                          MachineBasicBlock::iterator MI) const;
+
+
+  // Branch analysis.
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             std::vector<MachineOperand> &Cond) const;
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                                const std::vector<MachineOperand> &Cond) const;
+  virtual bool BlockHasNoFallThrough(MachineBasicBlock &MBB) const;
+  virtual bool ReverseBranchCondition(std::vector<MachineOperand> &Cond) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
new file mode 100644
index 0000000..fe18978
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -0,0 +1,1164 @@
+//===- PPCInstrInfo.td - The PowerPC Instruction Set -------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the subset of the 32-bit PowerPC instruction set, as used
+// by the PowerPC instruction selector.
+//
+//===----------------------------------------------------------------------===//
+
+include "PPCInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific type constraints.
+//
+def SDT_PPCstfiwx : SDTypeProfile<0, 2, [ // stfiwx
+  SDTCisVT<0, f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCShiftOp : SDTypeProfile<1, 2, [   // PPCshl, PPCsra, PPCsrl
+  SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>
+]>;
+def SDT_PPCCallSeq : SDTypeProfile<0, 1, [ SDTCisVT<0, i32> ]>;
+
+def SDT_PPCvperm   : SDTypeProfile<1, 3, [
+  SDTCisVT<3, v16i8>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>
+]>;
+
+def SDT_PPCvcmp : SDTypeProfile<1, 3, [
+  SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>
+]>;
+
+def SDT_PPCcondbr : SDTypeProfile<0, 3, [
+  SDTCisVT<0, i32>, SDTCisVT<2, OtherVT>
+]>;
+
+def SDT_PPClbrx : SDTypeProfile<1, 3, [
+  SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>, SDTCisVT<3, OtherVT>
+]>;
+def SDT_PPCstbrx : SDTypeProfile<0, 4, [
+  SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>, SDTCisVT<3, OtherVT>
+]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific DAG Nodes.
+//
+
+def PPCfcfid  : SDNode<"PPCISD::FCFID" , SDTFPUnaryOp, []>;
+def PPCfctidz : SDNode<"PPCISD::FCTIDZ", SDTFPUnaryOp, []>;
+def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>;
+def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx, [SDNPHasChain]>;
+
+def PPCfsel   : SDNode<"PPCISD::FSEL",  
+   // Type constraint for fsel.
+   SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, 
+                        SDTCisFP<0>, SDTCisVT<1, f64>]>, []>;
+
+def PPChi       : SDNode<"PPCISD::Hi", SDTIntBinOp, []>;
+def PPClo       : SDNode<"PPCISD::Lo", SDTIntBinOp, []>;
+def PPCvmaddfp  : SDNode<"PPCISD::VMADDFP", SDTFPTernaryOp, []>;
+def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>;
+
+def PPCvperm    : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
+
+// These nodes represent the 32-bit PPC shifts that operate on 6-bit shift
+// amounts.  These nodes are generated by the multi-precision shift code.
+def PPCsrl        : SDNode<"PPCISD::SRL"       , SDT_PPCShiftOp>;
+def PPCsra        : SDNode<"PPCISD::SRA"       , SDT_PPCShiftOp>;
+def PPCshl        : SDNode<"PPCISD::SHL"       , SDT_PPCShiftOp>;
+
+def PPCextsw_32   : SDNode<"PPCISD::EXTSW_32"  , SDTIntUnaryOp>;
+def PPCstd_32     : SDNode<"PPCISD::STD_32"    , SDTStore, [SDNPHasChain]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeq,
+                           [SDNPHasChain, SDNPOutFlag]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_PPCCallSeq,
+                           [SDNPHasChain, SDNPOutFlag]>;
+
+def SDT_PPCCall   : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
+def PPCcall_Macho : SDNode<"PPCISD::CALL_Macho", SDT_PPCCall,
+                           [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def PPCcall_ELF   : SDNode<"PPCISD::CALL_ELF", SDT_PPCCall,
+                           [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def PPCmtctr      : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
+                           [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def PPCbctrl_Macho  : SDNode<"PPCISD::BCTRL_Macho", SDTRet,
+	                   [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def PPCbctrl_ELF  : SDNode<"PPCISD::BCTRL_ELF", SDTRet,
+	                   [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def retflag       : SDNode<"PPCISD::RET_FLAG", SDTRet,
+	                   [SDNPHasChain, SDNPOptInFlag]>;
+
+def PPCvcmp       : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>;
+def PPCvcmp_o     : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutFlag]>;
+
+def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr,
+                           [SDNPHasChain, SDNPOptInFlag]>;
+
+def PPClbrx       : SDNode<"PPCISD::LBRX", SDT_PPClbrx, [SDNPHasChain]>;
+def PPCstbrx      : SDNode<"PPCISD::STBRX", SDT_PPCstbrx, [SDNPHasChain]>;
+
+// Instructions to support dynamic alloca.
+def SDTDynOp  : SDTypeProfile<1, 2, []>;
+def PPCdynalloc   : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific transformation functions and pattern fragments.
+//
+
+def SHL32 : SDNodeXForm<imm, [{
+  // Transformation function: 31 - imm
+  return getI32Imm(31 - N->getValue());
+}]>;
+
+def SRL32 : SDNodeXForm<imm, [{
+  // Transformation function: 32 - imm
+  return N->getValue() ? getI32Imm(32 - N->getValue()) : getI32Imm(0);
+}]>;
+
+def LO16 : SDNodeXForm<imm, [{
+  // Transformation function: get the low 16 bits.
+  return getI32Imm((unsigned short)N->getValue());
+}]>;
+
+def HI16 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return getI32Imm((unsigned)N->getValue() >> 16);
+}]>;
+
+def HA16 : SDNodeXForm<imm, [{
+  // Transformation function: return the high 16 bits, adjusted to compensate
+  // for sign extension of the low half (the value used with 'addis').
+  signed int Val = N->getValue();
+  return getI32Imm((Val - (signed short)Val) >> 16);
+}]>;
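+// Worked example for HA16 (illustrative): for 0x1234FFFF the low half sign-
+// extends to -1, so (Val - (signed short)Val) >> 16 yields 0x1235; then
+// "addis r, r, 0x1235" followed by a low-half add of 0xFFFF reconstructs the
+// original 32-bit value.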
+def MB : SDNodeXForm<imm, [{
+  // Transformation function: get the start bit of a mask
+  unsigned mb, me;
+  (void)isRunOfOnes((unsigned)N->getValue(), mb, me);
+  return getI32Imm(mb);
+}]>;
+
+def ME : SDNodeXForm<imm, [{
+  // Transformation function: get the end bit of a mask
+  unsigned mb, me;
+  (void)isRunOfOnes((unsigned)N->getValue(), mb, me);
+  return getI32Imm(me);
+}]>;
+def maskimm32 : PatLeaf<(imm), [{
+  // maskimm32 predicate - True if the immediate is a run of ones.
+  unsigned mb, me;
+  if (N->getValueType(0) == MVT::i32)
+    return isRunOfOnes((unsigned)N->getValue(), mb, me);
+  else
+    return false;
+}]>;
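+
+// Example (illustrative): 0x00FFFF00 is a run of ones, so maskimm32 matches it
+// and MB/ME return 8 and 23 respectively (IBM bit numbering, bit 0 = MSB).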
+
+def immSExt16  : PatLeaf<(imm), [{
+  // immSExt16 predicate - True if the immediate fits in a 16-bit sign extended
+  // field.  Used by instructions like 'addi'.
+  if (N->getValueType(0) == MVT::i32)
+    return (int32_t)N->getValue() == (short)N->getValue();
+  else
+    return (int64_t)N->getValue() == (short)N->getValue();
+}]>;
+def immZExt16  : PatLeaf<(imm), [{
+  // immZExt16 predicate - True if the immediate fits in a 16-bit zero extended
+  // field.  Used by instructions like 'ori'.
+  return (uint64_t)N->getValue() == (unsigned short)N->getValue();
+}], LO16>;
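+
+// Example (illustrative): on a 32-bit target, -2 satisfies immSExt16 (addi can
+// encode it directly), while 0xFFFE satisfies immZExt16 (ori) but not
+// immSExt16, since it does not sign-extend back to the same 32-bit value.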
+
+// imm16Shifted* - These match immediates where the low 16-bits are zero.  There
+// are two forms: imm16ShiftedSExt and imm16ShiftedZExt.  These two forms are
+// identical in 32-bit mode, but in 64-bit mode, they return true if the
+// immediate fits into a sign/zero extended 32-bit immediate (with the low bits
+// clear).
+def imm16ShiftedZExt : PatLeaf<(imm), [{
+  // imm16ShiftedZExt predicate - True if only bits in the top 16-bits of the
+  // immediate are set.  Used by instructions like 'xoris'.
+  return (N->getValue() & ~uint64_t(0xFFFF0000)) == 0;
+}], HI16>;
+
+def imm16ShiftedSExt : PatLeaf<(imm), [{
+  // imm16ShiftedSExt predicate - True if only bits in the top 16-bits of the
+  // immediate are set.  Used by instructions like 'addis'.  Identical to 
+  // imm16ShiftedZExt in 32-bit mode.
+  if (N->getValue() & 0xFFFF) return false;
+  if (N->getValueType(0) == MVT::i32)
+    return true;
+  // For 64-bit, make sure it is sext right.
+  return N->getValue() == (uint64_t)(int)N->getValue();
+}], HI16>;
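+
+// Example (illustrative): 0x7FFF0000 satisfies both predicates in either mode;
+// in 64-bit mode 0xABCD0000 still satisfies imm16ShiftedZExt but fails
+// imm16ShiftedSExt, because sign-extending the 32-bit value would set the
+// upper 32 bits.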
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Flag Definitions.
+
+class isPPC64 { bit PPC64 = 1; }
+class isDOT   {
+  list<Register> Defs = [CR0];
+  bit RC  = 1;
+}
+
+class RegConstraint<string C> {
+  string Constraints = C;
+}
+class NoEncode<string E> {
+  string DisableEncoding = E;
+}
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Operand Definitions.
+
+def s5imm   : Operand<i32> {
+  let PrintMethod = "printS5ImmOperand";
+}
+def u5imm   : Operand<i32> {
+  let PrintMethod = "printU5ImmOperand";
+}
+def u6imm   : Operand<i32> {
+  let PrintMethod = "printU6ImmOperand";
+}
+def s16imm  : Operand<i32> {
+  let PrintMethod = "printS16ImmOperand";
+}
+def u16imm  : Operand<i32> {
+  let PrintMethod = "printU16ImmOperand";
+}
+def s16immX4  : Operand<i32> {   // Multiply imm by 4 before printing.
+  let PrintMethod = "printS16X4ImmOperand";
+}
+def target : Operand<OtherVT> {
+  let PrintMethod = "printBranchOperand";
+}
+def calltarget : Operand<iPTR> {
+  let PrintMethod = "printCallOperand";
+}
+def aaddr : Operand<iPTR> {
+  let PrintMethod = "printAbsAddrOperand";
+}
+def piclabel: Operand<iPTR> {
+  let PrintMethod = "printPICLabel";
+}
+def symbolHi: Operand<i32> {
+  let PrintMethod = "printSymbolHi";
+}
+def symbolLo: Operand<i32> {
+  let PrintMethod = "printSymbolLo";
+}
+def crbitm: Operand<i8> {
+  let PrintMethod = "printcrbitm";
+}
+// Address operands
+def memri : Operand<iPTR> {
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops i32imm:$imm, ptr_rc:$reg);
+}
+def memrr : Operand<iPTR> {
+  let PrintMethod = "printMemRegReg";
+  let MIOperandInfo = (ops ptr_rc, ptr_rc);
+}
+def memrix : Operand<iPTR> {   // memri where the imm is shifted 2 bits.
+  let PrintMethod = "printMemRegImmShifted";
+  let MIOperandInfo = (ops i32imm:$imm, ptr_rc:$reg);
+}
+
+// PowerPC Predicate operand.  20 = (0<<5)|20 = always, CR0 is a dummy reg
+// that doesn't matter.
+def pred : PredicateOperand<OtherVT, (ops imm, CRRC),
+                                     (ops (i32 20), CR0)> {
+  let PrintMethod = "printPredicateOperand";
+}
+
+// Define PowerPC specific addressing mode.
+def iaddr  : ComplexPattern<iPTR, 2, "SelectAddrImm",    [], []>;
+def xaddr  : ComplexPattern<iPTR, 2, "SelectAddrIdx",    [], []>;
+def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>;
+def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmShift", [], []>; // "std"
+
+/// This is just the offset part of iaddr, used for preinc.
+def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Predicate Definitions.
+def FPContractions : Predicate<"!NoExcessFPPrecision">;
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Definitions.
+
+// Pseudo-instructions:
+
+let hasCtrlDep = 1 in {
+def ADJCALLSTACKDOWN : Pseudo<(ops u16imm:$amt),
+                              "${:comment} ADJCALLSTACKDOWN",
+                              [(callseq_start imm:$amt)]>, Imp<[R1],[R1]>;
+def ADJCALLSTACKUP   : Pseudo<(ops u16imm:$amt),
+                              "${:comment} ADJCALLSTACKUP",
+                              [(callseq_end imm:$amt)]>, Imp<[R1],[R1]>;
+
+def UPDATE_VRSAVE    : Pseudo<(ops GPRC:$rD, GPRC:$rS),
+                              "UPDATE_VRSAVE $rD, $rS", []>;
+}
+
+def DYNALLOC : Pseudo<(ops GPRC:$result, GPRC:$negsize, memri:$fpsi),
+                       "${:comment} DYNALLOC $result, $negsize, $fpsi",
+                       [(set GPRC:$result,
+                             (PPCdynalloc GPRC:$negsize, iaddr:$fpsi))]>,
+                        Imp<[R1],[R1]>;
+                         
+def IMPLICIT_DEF_GPRC: Pseudo<(ops GPRC:$rD),"${:comment} IMPLICIT_DEF_GPRC $rD",
+                              [(set GPRC:$rD, (undef))]>;
+def IMPLICIT_DEF_F8  : Pseudo<(ops F8RC:$rD), "${:comment} IMPLICIT_DEF_F8 $rD",
+                              [(set F8RC:$rD, (undef))]>;
+def IMPLICIT_DEF_F4  : Pseudo<(ops F4RC:$rD), "${:comment} IMPLICIT_DEF_F4 $rD",
+                              [(set F4RC:$rD, (undef))]>;
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded by the
+// scheduler into a branch sequence.
+let usesCustomDAGSchedInserter = 1,    // Expanded by the scheduler.
+    PPC970_Single = 1 in {
+  def SELECT_CC_I4 : Pseudo<(ops GPRC:$dst, CRRC:$cond, GPRC:$T, GPRC:$F,
+                              i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!",
+                              []>;
+  def SELECT_CC_I8 : Pseudo<(ops G8RC:$dst, CRRC:$cond, G8RC:$T, G8RC:$F,
+                              i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!",
+                              []>;
+  def SELECT_CC_F4  : Pseudo<(ops F4RC:$dst, CRRC:$cond, F4RC:$T, F4RC:$F,
+                              i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!",
+                              []>;
+  def SELECT_CC_F8  : Pseudo<(ops F8RC:$dst, CRRC:$cond, F8RC:$T, F8RC:$F,
+                              i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!",
+                              []>;
+  def SELECT_CC_VRRC: Pseudo<(ops VRRC:$dst, CRRC:$cond, VRRC:$T, VRRC:$F,
+                              i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!",
+                              []>;
+}
+
+let isTerminator = 1, isBarrier = 1, noResults = 1, PPC970_Unit = 7 in {
+  let isReturn = 1 in
+    def BLR : XLForm_2_br<19, 16, 0, (ops pred:$p),
+                          "b${p:cc}lr ${p:reg}", BrB, 
+                          [(retflag)]>;
+  def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (ops), "bctr", BrB, []>;
+}
+
+
+
+let Defs = [LR] in
+  def MovePCtoLR : Pseudo<(ops piclabel:$label), "bl $label", []>,
+                   PPC970_Unit_BRU;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, 
+    noResults = 1, PPC970_Unit = 7 in {
+  let isBarrier = 1 in {
+  def B   : IForm<18, 0, 0, (ops target:$dst),
+                  "b $dst", BrB,
+                  [(br bb:$dst)]>;
+  }
+
+  // BCC represents an arbitrary conditional branch on a predicate.
+  // FIXME: should be able to write a pattern for PPCcondbranch, but can't use
+  // a two-value operand where a dag node expects two operands. :( 
+  def BCC : BForm<16, 0, 0, (ops pred:$cond, target:$dst),
+                  "b${cond:cc} ${cond:reg}, $dst"
+                  /*[(PPCcondbranch CRRC:$crS, imm:$opc, bb:$dst)]*/>;
+}
+
+// Macho ABI Calls.
+let isCall = 1, noResults = 1, PPC970_Unit = 7, 
+  // All calls clobber the non-callee saved registers...
+  Defs = [R0,R2,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,
+          F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,
+          V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,
+          LR,CTR,
+          CR0,CR1,CR5,CR6,CR7] in {
+  // Convenient aliases for call instructions
+  def BL_Macho  : IForm<18, 0, 1,
+                        (ops calltarget:$func, variable_ops), 
+                        "bl $func", BrB, []>;  // See Pat patterns below.
+  def BLA_Macho : IForm<18, 1, 1, 
+                        (ops aaddr:$func, variable_ops),
+                        "bla $func", BrB, [(PPCcall_Macho (i32 imm:$func))]>;
+  def BCTRL_Macho : XLForm_2_ext<19, 528, 20, 0, 1, 
+                                 (ops variable_ops),
+                                 "bctrl", BrB,
+                                 [(PPCbctrl_Macho)]>;
+}
+
+// ELF ABI Calls.
+let isCall = 1, noResults = 1, PPC970_Unit = 7, 
+  // All calls clobber the non-callee saved registers...
+  Defs = [R0,R2,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,
+          F0,F1,F2,F3,F4,F5,F6,F7,F8,
+          V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,
+          LR,CTR,
+          CR0,CR1,CR5,CR6,CR7] in {
+  // Convenient aliases for call instructions
+  def BL_ELF  : IForm<18, 0, 1,
+                      (ops calltarget:$func, variable_ops), 
+                      "bl $func", BrB, []>;  // See Pat patterns below.
+  def BLA_ELF : IForm<18, 1, 1,
+                      (ops aaddr:$func, variable_ops),
+                      "bla $func", BrB,
+                      [(PPCcall_ELF (i32 imm:$func))]>;
+  def BCTRL_ELF : XLForm_2_ext<19, 528, 20, 0, 1,
+                               (ops variable_ops),
+                               "bctrl", BrB,
+                               [(PPCbctrl_ELF)]>;
+}
+
+// DCB* instructions.
+def DCBA   : DCB_Form<758, 0, (ops memrr:$dst),
+                      "dcba $dst", LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBF   : DCB_Form<86, 0, (ops memrr:$dst),
+                      "dcbf $dst", LdStDCBF, [(int_ppc_dcbf xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBI   : DCB_Form<470, 0, (ops memrr:$dst),
+                      "dcbi $dst", LdStDCBF, [(int_ppc_dcbi xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBST  : DCB_Form<54, 0, (ops memrr:$dst),
+                      "dcbst $dst", LdStDCBF, [(int_ppc_dcbst xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBT   : DCB_Form<278, 0, (ops memrr:$dst),
+                      "dcbt $dst", LdStDCBF, [(int_ppc_dcbt xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBTST : DCB_Form<246, 0, (ops memrr:$dst),
+                      "dcbtst $dst", LdStDCBF, [(int_ppc_dcbtst xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBZ   : DCB_Form<1014, 0, (ops memrr:$dst),
+                      "dcbz $dst", LdStDCBF, [(int_ppc_dcbz xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBZL  : DCB_Form<1014, 1, (ops memrr:$dst),
+                      "dcbzl $dst", LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Load Instructions.
+//
+
+// Unindexed (r+i) Loads. 
+let isLoad = 1, PPC970_Unit = 2 in {
+def LBZ : DForm_1<34, (ops GPRC:$rD, memri:$src),
+                  "lbz $rD, $src", LdStGeneral,
+                  [(set GPRC:$rD, (zextloadi8 iaddr:$src))]>;
+def LHA : DForm_1<42, (ops GPRC:$rD, memri:$src),
+                  "lha $rD, $src", LdStLHA,
+                  [(set GPRC:$rD, (sextloadi16 iaddr:$src))]>,
+                  PPC970_DGroup_Cracked;
+def LHZ : DForm_1<40, (ops GPRC:$rD, memri:$src),
+                  "lhz $rD, $src", LdStGeneral,
+                  [(set GPRC:$rD, (zextloadi16 iaddr:$src))]>;
+def LWZ : DForm_1<32, (ops GPRC:$rD, memri:$src),
+                  "lwz $rD, $src", LdStGeneral,
+                  [(set GPRC:$rD, (load iaddr:$src))]>;
+
+def LFS : DForm_1<48, (ops F4RC:$rD, memri:$src),
+                  "lfs $rD, $src", LdStLFDU,
+                  [(set F4RC:$rD, (load iaddr:$src))]>;
+def LFD : DForm_1<50, (ops F8RC:$rD, memri:$src),
+                  "lfd $rD, $src", LdStLFD,
+                  [(set F8RC:$rD, (load iaddr:$src))]>;
+
+
+// Unindexed (r+i) Loads with Update (preinc).
+def LBZU : DForm_1<35, (ops GPRC:$rD, ptr_rc:$ea_result, memri:$addr),
+                   "lbzu $rD, $addr", LdStGeneral,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHAU : DForm_1<43, (ops GPRC:$rD, ptr_rc:$ea_result, memri:$addr),
+                   "lhau $rD, $addr", LdStGeneral,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHZU : DForm_1<41, (ops GPRC:$rD, ptr_rc:$ea_result, memri:$addr),
+                   "lhzu $rD, $addr", LdStGeneral,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LWZU : DForm_1<33, (ops GPRC:$rD, ptr_rc:$ea_result, memri:$addr),
+                   "lwzu $rD, $addr", LdStGeneral,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LFSU : DForm_1<49, (ops F4RC:$rD, ptr_rc:$ea_result, memri:$addr),
+                  "lfs $rD, $addr", LdStLFDU,
+                  []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LFDU : DForm_1<51, (ops F8RC:$rD, ptr_rc:$ea_result, memri:$addr),
+                  "lfd $rD, $addr", LdStLFD,
+                  []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+}
+
+// Indexed (r+r) Loads.
+//
+let isLoad = 1, PPC970_Unit = 2 in {
+def LBZX : XForm_1<31,  87, (ops GPRC:$rD, memrr:$src),
+                   "lbzx $rD, $src", LdStGeneral,
+                   [(set GPRC:$rD, (zextloadi8 xaddr:$src))]>;
+def LHAX : XForm_1<31, 343, (ops GPRC:$rD, memrr:$src),
+                   "lhax $rD, $src", LdStLHA,
+                   [(set GPRC:$rD, (sextloadi16 xaddr:$src))]>,
+                   PPC970_DGroup_Cracked;
+def LHZX : XForm_1<31, 279, (ops GPRC:$rD, memrr:$src),
+                   "lhzx $rD, $src", LdStGeneral,
+                   [(set GPRC:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX : XForm_1<31,  23, (ops GPRC:$rD, memrr:$src),
+                   "lwzx $rD, $src", LdStGeneral,
+                   [(set GPRC:$rD, (load xaddr:$src))]>;
+                   
+                   
+def LHBRX : XForm_1<31, 790, (ops GPRC:$rD, memrr:$src),
+                   "lhbrx $rD, $src", LdStGeneral,
+                   [(set GPRC:$rD, (PPClbrx xoaddr:$src, srcvalue:$sv, i16))]>;
+def LWBRX : XForm_1<31,  534, (ops GPRC:$rD, memrr:$src),
+                   "lwbrx $rD, $src", LdStGeneral,
+                   [(set GPRC:$rD, (PPClbrx xoaddr:$src, srcvalue:$sv, i32))]>;
+
+def LFSX   : XForm_25<31, 535, (ops F4RC:$frD, memrr:$src),
+                      "lfsx $frD, $src", LdStLFDU,
+                      [(set F4RC:$frD, (load xaddr:$src))]>;
+def LFDX   : XForm_25<31, 599, (ops F8RC:$frD, memrr:$src),
+                      "lfdx $frD, $src", LdStLFDU,
+                      [(set F8RC:$frD, (load xaddr:$src))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// PPC32 Store Instructions.
+//
+
+// Unindexed (r+i) Stores.
+let isStore = 1, noResults = 1, PPC970_Unit = 2 in {
+def STB  : DForm_1<38, (ops GPRC:$rS, memri:$src),
+                   "stb $rS, $src", LdStGeneral,
+                   [(truncstorei8 GPRC:$rS, iaddr:$src)]>;
+def STH  : DForm_1<44, (ops GPRC:$rS, memri:$src),
+                   "sth $rS, $src", LdStGeneral,
+                   [(truncstorei16 GPRC:$rS, iaddr:$src)]>;
+def STW  : DForm_1<36, (ops GPRC:$rS, memri:$src),
+                   "stw $rS, $src", LdStGeneral,
+                   [(store GPRC:$rS, iaddr:$src)]>;
+def STFS : DForm_1<52, (ops F4RC:$rS, memri:$dst),
+                   "stfs $rS, $dst", LdStUX,
+                   [(store F4RC:$rS, iaddr:$dst)]>;
+def STFD : DForm_1<54, (ops F8RC:$rS, memri:$dst),
+                   "stfd $rS, $dst", LdStUX,
+                   [(store F8RC:$rS, iaddr:$dst)]>;
+}
+
+// Unindexed (r+i) Stores with Update (preinc).
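+// As with the pre-increment loads above, $ea_res is the updated base address;
+// it is tied to $ptrreg and excluded from the encoding.  Unlike the loads,
+// these do carry pre_store / pre_truncst* selection patterns.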
+let isStore = 1, PPC970_Unit = 2 in {
+def STBU  : DForm_1<39, (ops ptr_rc:$ea_res, GPRC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stbu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res,
+                          (pre_truncsti8 GPRC:$rS, ptr_rc:$ptrreg, 
+                                         iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STHU  : DForm_1<45, (ops ptr_rc:$ea_res, GPRC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "sthu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res,
+                        (pre_truncsti16 GPRC:$rS, ptr_rc:$ptrreg, 
+                                        iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STWU  : DForm_1<37, (ops ptr_rc:$ea_res, GPRC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stwu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res, (pre_store GPRC:$rS, ptr_rc:$ptrreg, 
+                                                     iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STFSU : DForm_1<53, (ops ptr_rc:$ea_res, F4RC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stfsu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res, (pre_store F4RC:$rS,  ptr_rc:$ptrreg, 
+                                          iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STFDU : DForm_1<55, (ops ptr_rc:$ea_res, F8RC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stfdu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res, (pre_store F8RC:$rS, ptr_rc:$ptrreg, 
+                                          iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+}
+
+
+// Indexed (r+r) Stores.
+//
+let isStore = 1, noResults = 1, PPC970_Unit = 2 in {
+def STBX  : XForm_8<31, 215, (ops GPRC:$rS, memrr:$dst),
+                   "stbx $rS, $dst", LdStGeneral,
+                   [(truncstorei8 GPRC:$rS, xaddr:$dst)]>, 
+                   PPC970_DGroup_Cracked;
+def STHX  : XForm_8<31, 407, (ops GPRC:$rS, memrr:$dst),
+                   "sthx $rS, $dst", LdStGeneral,
+                   [(truncstorei16 GPRC:$rS, xaddr:$dst)]>, 
+                   PPC970_DGroup_Cracked;
+def STWX  : XForm_8<31, 151, (ops GPRC:$rS, memrr:$dst),
+                   "stwx $rS, $dst", LdStGeneral,
+                   [(store GPRC:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STWUX : XForm_8<31, 183, (ops GPRC:$rS, GPRC:$rA, GPRC:$rB),
+                   "stwux $rS, $rA, $rB", LdStGeneral,
+                   []>;
+def STHBRX: XForm_8<31, 918, (ops GPRC:$rS, memrr:$dst),
+                   "sthbrx $rS, $dst", LdStGeneral,
+                   [(PPCstbrx GPRC:$rS, xoaddr:$dst, srcvalue:$dummy, i16)]>, 
+                   PPC970_DGroup_Cracked;
+def STWBRX: XForm_8<31, 662, (ops GPRC:$rS, memrr:$dst),
+                   "stwbrx $rS, $dst", LdStGeneral,
+                   [(PPCstbrx GPRC:$rS, xoaddr:$dst, srcvalue:$dummy, i32)]>,
+                   PPC970_DGroup_Cracked;
+
+def STFIWX: XForm_28<31, 983, (ops F8RC:$frS, memrr:$dst),
+                     "stfiwx $frS, $dst", LdStUX,
+                     [(PPCstfiwx F8RC:$frS, xoaddr:$dst)]>;
+def STFSX : XForm_28<31, 663, (ops F4RC:$frS, memrr:$dst),
+                     "stfsx $frS, $dst", LdStUX,
+                     [(store F4RC:$frS, xaddr:$dst)]>;
+def STFDX : XForm_28<31, 727, (ops F8RC:$frS, memrr:$dst),
+                     "stfdx $frS, $dst", LdStUX,
+                     [(store F8RC:$frS, xaddr:$dst)]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// PPC32 Arithmetic Instructions.
+//
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+def ADDI   : DForm_2<14, (ops GPRC:$rD, GPRC:$rA, s16imm:$imm),
+                     "addi $rD, $rA, $imm", IntGeneral,
+                     [(set GPRC:$rD, (add GPRC:$rA, immSExt16:$imm))]>;
+def ADDIC  : DForm_2<12, (ops GPRC:$rD, GPRC:$rA, s16imm:$imm),
+                     "addic $rD, $rA, $imm", IntGeneral,
+                     [(set GPRC:$rD, (addc GPRC:$rA, immSExt16:$imm))]>,
+                     PPC970_DGroup_Cracked;
+def ADDICo : DForm_2<13, (ops GPRC:$rD, GPRC:$rA, s16imm:$imm),
+                     "addic. $rD, $rA, $imm", IntGeneral,
+                     []>;
+def ADDIS  : DForm_2<15, (ops GPRC:$rD, GPRC:$rA, symbolHi:$imm),
+                     "addis $rD, $rA, $imm", IntGeneral,
+                     [(set GPRC:$rD, (add GPRC:$rA, imm16ShiftedSExt:$imm))]>;
+def LA     : DForm_2<14, (ops GPRC:$rD, GPRC:$rA, symbolLo:$sym),
+                     "la $rD, $sym($rA)", IntGeneral,
+                     [(set GPRC:$rD, (add GPRC:$rA,
+                                          (PPClo tglobaladdr:$sym, 0)))]>;
+def MULLI  : DForm_2< 7, (ops GPRC:$rD, GPRC:$rA, s16imm:$imm),
+                     "mulli $rD, $rA, $imm", IntMulLI,
+                     [(set GPRC:$rD, (mul GPRC:$rA, immSExt16:$imm))]>;
+def SUBFIC : DForm_2< 8, (ops GPRC:$rD, GPRC:$rA, s16imm:$imm),
+                     "subfic $rD, $rA, $imm", IntGeneral,
+                     [(set GPRC:$rD, (subc immSExt16:$imm, GPRC:$rA))]>;
+def LI  : DForm_2_r0<14, (ops GPRC:$rD, symbolLo:$imm),
+                     "li $rD, $imm", IntGeneral,
+                     [(set GPRC:$rD, immSExt16:$imm)]>;
+def LIS : DForm_2_r0<15, (ops GPRC:$rD, symbolHi:$imm),
+                     "lis $rD, $imm", IntGeneral,
+                     [(set GPRC:$rD, imm16ShiftedSExt:$imm)]>;
+}
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+def ANDIo : DForm_4<28, (ops GPRC:$dst, GPRC:$src1, u16imm:$src2),
+                    "andi. $dst, $src1, $src2", IntGeneral,
+                    [(set GPRC:$dst, (and GPRC:$src1, immZExt16:$src2))]>,
+                    isDOT;
+def ANDISo : DForm_4<29, (ops GPRC:$dst, GPRC:$src1, u16imm:$src2),
+                    "andis. $dst, $src1, $src2", IntGeneral,
+                    [(set GPRC:$dst, (and GPRC:$src1,imm16ShiftedZExt:$src2))]>,
+                    isDOT;
+def ORI   : DForm_4<24, (ops GPRC:$dst, GPRC:$src1, u16imm:$src2),
+                    "ori $dst, $src1, $src2", IntGeneral,
+                    [(set GPRC:$dst, (or GPRC:$src1, immZExt16:$src2))]>;
+def ORIS  : DForm_4<25, (ops GPRC:$dst, GPRC:$src1, u16imm:$src2),
+                    "oris $dst, $src1, $src2", IntGeneral,
+                    [(set GPRC:$dst, (or GPRC:$src1, imm16ShiftedZExt:$src2))]>;
+def XORI  : DForm_4<26, (ops GPRC:$dst, GPRC:$src1, u16imm:$src2),
+                    "xori $dst, $src1, $src2", IntGeneral,
+                    [(set GPRC:$dst, (xor GPRC:$src1, immZExt16:$src2))]>;
+def XORIS : DForm_4<27, (ops GPRC:$dst, GPRC:$src1, u16imm:$src2),
+                    "xoris $dst, $src1, $src2", IntGeneral,
+                    [(set GPRC:$dst, (xor GPRC:$src1,imm16ShiftedZExt:$src2))]>;
+def NOP   : DForm_4_zero<24, (ops), "nop", IntGeneral,
+                         []>;
+def CMPWI : DForm_5_ext<11, (ops CRRC:$crD, GPRC:$rA, s16imm:$imm),
+                        "cmpwi $crD, $rA, $imm", IntCompare>;
+def CMPLWI : DForm_6_ext<10, (ops CRRC:$dst, GPRC:$src1, u16imm:$src2),
+                         "cmplwi $dst, $src1, $src2", IntCompare>;
+}
+
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+def NAND : XForm_6<31, 476, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB),
+                   "nand $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (not (and GPRC:$rS, GPRC:$rB)))]>;
+def AND  : XForm_6<31,  28, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB),
+                   "and $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (and GPRC:$rS, GPRC:$rB))]>;
+def ANDC : XForm_6<31,  60, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB),
+                   "andc $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (and GPRC:$rS, (not GPRC:$rB)))]>;
+def OR   : XForm_6<31, 444, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB),
+                   "or $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (or GPRC:$rS, GPRC:$rB))]>;
+def NOR  : XForm_6<31, 124, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB),
+                   "nor $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (not (or GPRC:$rS, GPRC:$rB)))]>;
+def ORC  : XForm_6<31, 412, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB),
+                   "orc $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (or GPRC:$rS, (not GPRC:$rB)))]>;
+def EQV  : XForm_6<31, 284, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB),
+                   "eqv $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (not (xor GPRC:$rS, GPRC:$rB)))]>;
+def XOR  : XForm_6<31, 316, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB),
+                   "xor $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (xor GPRC:$rS, GPRC:$rB))]>;
+def SLW  : XForm_6<31,  24, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB),
+                   "slw $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (PPCshl GPRC:$rS, GPRC:$rB))]>;
+def SRW  : XForm_6<31, 536, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB),
+                   "srw $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (PPCsrl GPRC:$rS, GPRC:$rB))]>;
+def SRAW : XForm_6<31, 792, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB),
+                   "sraw $rA, $rS, $rB", IntShift,
+                   [(set GPRC:$rA, (PPCsra GPRC:$rS, GPRC:$rB))]>;
+}
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+def SRAWI : XForm_10<31, 824, (ops GPRC:$rA, GPRC:$rS, u5imm:$SH), 
+                     "srawi $rA, $rS, $SH", IntShift,
+                     [(set GPRC:$rA, (sra GPRC:$rS, (i32 imm:$SH)))]>;
+def CNTLZW : XForm_11<31,  26, (ops GPRC:$rA, GPRC:$rS),
+                      "cntlzw $rA, $rS", IntGeneral,
+                      [(set GPRC:$rA, (ctlz GPRC:$rS))]>;
+def EXTSB  : XForm_11<31, 954, (ops GPRC:$rA, GPRC:$rS),
+                      "extsb $rA, $rS", IntGeneral,
+                      [(set GPRC:$rA, (sext_inreg GPRC:$rS, i8))]>;
+def EXTSH  : XForm_11<31, 922, (ops GPRC:$rA, GPRC:$rS),
+                      "extsh $rA, $rS", IntGeneral,
+                      [(set GPRC:$rA, (sext_inreg GPRC:$rS, i16))]>;
+
+def CMPW   : XForm_16_ext<31, 0, (ops CRRC:$crD, GPRC:$rA, GPRC:$rB),
+                          "cmpw $crD, $rA, $rB", IntCompare>;
+def CMPLW  : XForm_16_ext<31, 32, (ops CRRC:$crD, GPRC:$rA, GPRC:$rB),
+                          "cmplw $crD, $rA, $rB", IntCompare>;
+}
+let PPC970_Unit = 3 in {  // FPU Operations.
+//def FCMPO  : XForm_17<63, 32, (ops CRRC:$crD, FPRC:$fA, FPRC:$fB),
+//                      "fcmpo $crD, $fA, $fB", FPCompare>;
+def FCMPUS : XForm_17<63, 0, (ops CRRC:$crD, F4RC:$fA, F4RC:$fB),
+                      "fcmpu $crD, $fA, $fB", FPCompare>;
+def FCMPUD : XForm_17<63, 0, (ops CRRC:$crD, F8RC:$fA, F8RC:$fB),
+                      "fcmpu $crD, $fA, $fB", FPCompare>;
+
+def FCTIWZ : XForm_26<63, 15, (ops F8RC:$frD, F8RC:$frB),
+                      "fctiwz $frD, $frB", FPGeneral,
+                      [(set F8RC:$frD, (PPCfctiwz F8RC:$frB))]>;
+def FRSP   : XForm_26<63, 12, (ops F4RC:$frD, F8RC:$frB),
+                      "frsp $frD, $frB", FPGeneral,
+                      [(set F4RC:$frD, (fround F8RC:$frB))]>;
+def FSQRT  : XForm_26<63, 22, (ops F8RC:$frD, F8RC:$frB),
+                      "fsqrt $frD, $frB", FPSqrt,
+                      [(set F8RC:$frD, (fsqrt F8RC:$frB))]>;
+def FSQRTS : XForm_26<59, 22, (ops F4RC:$frD, F4RC:$frB),
+                      "fsqrts $frD, $frB", FPSqrt,
+                      [(set F4RC:$frD, (fsqrt F4RC:$frB))]>;
+}
+
+/// FMR is split into 3 versions: 4-byte FP, 8-byte FP, and extending.
+///
+/// Note that these are defined as pseudo-ops on the PPC970 because they are
+/// often coalesced away and we don't want the dispatch group builder to think
+/// that they will fill slots (which could cause the load of a LSU reject to
+/// sneak into a d-group with a store).
+def FMRS   : XForm_26<63, 72, (ops F4RC:$frD, F4RC:$frB),
+                      "fmr $frD, $frB", FPGeneral,
+                      []>,  // (set F4RC:$frD, F4RC:$frB)
+                      PPC970_Unit_Pseudo;
+def FMRD   : XForm_26<63, 72, (ops F8RC:$frD, F8RC:$frB),
+                      "fmr $frD, $frB", FPGeneral,
+                      []>,  // (set F8RC:$frD, F8RC:$frB)
+                      PPC970_Unit_Pseudo;
+def FMRSD  : XForm_26<63, 72, (ops F8RC:$frD, F4RC:$frB),
+                      "fmr $frD, $frB", FPGeneral,
+                      [(set F8RC:$frD, (fextend F4RC:$frB))]>,
+                      PPC970_Unit_Pseudo;
+
+let PPC970_Unit = 3 in {  // FPU Operations.
+// These are artificially split into two different forms, for 4/8 byte FP.
+def FABSS  : XForm_26<63, 264, (ops F4RC:$frD, F4RC:$frB),
+                      "fabs $frD, $frB", FPGeneral,
+                      [(set F4RC:$frD, (fabs F4RC:$frB))]>;
+def FABSD  : XForm_26<63, 264, (ops F8RC:$frD, F8RC:$frB),
+                      "fabs $frD, $frB", FPGeneral,
+                      [(set F8RC:$frD, (fabs F8RC:$frB))]>;
+def FNABSS : XForm_26<63, 136, (ops F4RC:$frD, F4RC:$frB),
+                      "fnabs $frD, $frB", FPGeneral,
+                      [(set F4RC:$frD, (fneg (fabs F4RC:$frB)))]>;
+def FNABSD : XForm_26<63, 136, (ops F8RC:$frD, F8RC:$frB),
+                      "fnabs $frD, $frB", FPGeneral,
+                      [(set F8RC:$frD, (fneg (fabs F8RC:$frB)))]>;
+def FNEGS  : XForm_26<63, 40, (ops F4RC:$frD, F4RC:$frB),
+                      "fneg $frD, $frB", FPGeneral,
+                      [(set F4RC:$frD, (fneg F4RC:$frB))]>;
+def FNEGD  : XForm_26<63, 40, (ops F8RC:$frD, F8RC:$frB),
+                      "fneg $frD, $frB", FPGeneral,
+                      [(set F8RC:$frD, (fneg F8RC:$frB))]>;
+}
+                      
+
+// XL-Form instructions.  Condition register logical ops.
+//
+def MCRF   : XLForm_3<19, 0, (ops CRRC:$BF, CRRC:$BFA),
+                      "mcrf $BF, $BFA", BrMCR>,
+             PPC970_DGroup_First, PPC970_Unit_CRU;
+
+def CREQV  : XLForm_1<19, 289, (ops CRRC:$CRD, CRRC:$CRA, CRRC:$CRB),
+                      "creqv $CRD, $CRA, $CRB", BrCR,
+                      []>;
+
+def SETCR  : XLForm_1_ext<19, 289, (ops CRRC:$dst),
+              "creqv $dst, $dst, $dst", BrCR,
+              []>;
+
+// XFX-Form instructions.  Instructions that deal with SPRs.
+//
+def MFCTR : XFXForm_1_ext<31, 339, 9, (ops GPRC:$rT), "mfctr $rT", SprMFSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+let Pattern = [(PPCmtctr GPRC:$rS)] in {
+def MTCTR : XFXForm_7_ext<31, 467, 9, (ops GPRC:$rS), "mtctr $rS", SprMTSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+def MTLR  : XFXForm_7_ext<31, 467, 8, (ops GPRC:$rS), "mtlr $rS", SprMTSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+def MFLR  : XFXForm_1_ext<31, 339, 8, (ops GPRC:$rT), "mflr $rT", SprMFSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+
+// Move to/from VRSAVE: despite being a SPR, the VRSAVE register is renamed like
+// a GPR on the PPC970.  As such, copies in and out have the same performance
+// characteristics as an OR instruction.
+def MTVRSAVE : XFXForm_7_ext<31, 467, 256, (ops GPRC:$rS),
+                             "mtspr 256, $rS", IntGeneral>,
+               PPC970_DGroup_Single, PPC970_Unit_FXU;
+def MFVRSAVE : XFXForm_1_ext<31, 339, 256, (ops GPRC:$rT),
+                             "mfspr $rT, 256", IntGeneral>,
+               PPC970_DGroup_First, PPC970_Unit_FXU;
+
+def MTCRF : XFXForm_5<31, 144, (ops crbitm:$FXM, GPRC:$rS),
+                      "mtcrf $FXM, $rS", BrMCRX>,
+            PPC970_MicroCode, PPC970_Unit_CRU;
+def MFCR  : XFXForm_3<31, 19, (ops GPRC:$rT), "mfcr $rT", SprMFCR>,
+            PPC970_MicroCode, PPC970_Unit_CRU;
+def MFOCRF: XFXForm_5a<31, 19, (ops GPRC:$rT, crbitm:$FXM),
+                       "mfcr $rT, $FXM", SprMFCR>,
+            PPC970_DGroup_First, PPC970_Unit_CRU;
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+
+// XO-Form instructions.  Arithmetic instructions that can set the overflow bit.
+//
+def ADD4  : XOForm_1<31, 266, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB),
+                     "add $rT, $rA, $rB", IntGeneral,
+                     [(set GPRC:$rT, (add GPRC:$rA, GPRC:$rB))]>;
+def ADDC  : XOForm_1<31, 10, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB),
+                     "addc $rT, $rA, $rB", IntGeneral,
+                     [(set GPRC:$rT, (addc GPRC:$rA, GPRC:$rB))]>,
+                     PPC970_DGroup_Cracked;
+def ADDE  : XOForm_1<31, 138, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB),
+                     "adde $rT, $rA, $rB", IntGeneral,
+                     [(set GPRC:$rT, (adde GPRC:$rA, GPRC:$rB))]>;
+def DIVW  : XOForm_1<31, 491, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB),
+                     "divw $rT, $rA, $rB", IntDivW,
+                     [(set GPRC:$rT, (sdiv GPRC:$rA, GPRC:$rB))]>,
+                     PPC970_DGroup_First, PPC970_DGroup_Cracked;
+def DIVWU : XOForm_1<31, 459, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB),
+                     "divwu $rT, $rA, $rB", IntDivW,
+                     [(set GPRC:$rT, (udiv GPRC:$rA, GPRC:$rB))]>,
+                     PPC970_DGroup_First, PPC970_DGroup_Cracked;
+def MULHW : XOForm_1<31, 75, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB),
+                     "mulhw $rT, $rA, $rB", IntMulHW,
+                     [(set GPRC:$rT, (mulhs GPRC:$rA, GPRC:$rB))]>;
+def MULHWU : XOForm_1<31, 11, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB),
+                     "mulhwu $rT, $rA, $rB", IntMulHWU,
+                     [(set GPRC:$rT, (mulhu GPRC:$rA, GPRC:$rB))]>;
+def MULLW : XOForm_1<31, 235, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB),
+                     "mullw $rT, $rA, $rB", IntMulHW,
+                     [(set GPRC:$rT, (mul GPRC:$rA, GPRC:$rB))]>;
+def SUBF  : XOForm_1<31, 40, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB),
+                     "subf $rT, $rA, $rB", IntGeneral,
+                     [(set GPRC:$rT, (sub GPRC:$rB, GPRC:$rA))]>;
+def SUBFC : XOForm_1<31, 8, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB),
+                     "subfc $rT, $rA, $rB", IntGeneral,
+                     [(set GPRC:$rT, (subc GPRC:$rB, GPRC:$rA))]>,
+                     PPC970_DGroup_Cracked;
+def SUBFE : XOForm_1<31, 136, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB),
+                     "subfe $rT, $rA, $rB", IntGeneral,
+                     [(set GPRC:$rT, (sube GPRC:$rB, GPRC:$rA))]>;
+def ADDME  : XOForm_3<31, 234, 0, (ops GPRC:$rT, GPRC:$rA),
+                      "addme $rT, $rA", IntGeneral,
+                      [(set GPRC:$rT, (adde GPRC:$rA, immAllOnes))]>;
+def ADDZE  : XOForm_3<31, 202, 0, (ops GPRC:$rT, GPRC:$rA),
+                      "addze $rT, $rA", IntGeneral,
+                      [(set GPRC:$rT, (adde GPRC:$rA, 0))]>;
+def NEG    : XOForm_3<31, 104, 0, (ops GPRC:$rT, GPRC:$rA),
+                      "neg $rT, $rA", IntGeneral,
+                      [(set GPRC:$rT, (ineg GPRC:$rA))]>;
+def SUBFME : XOForm_3<31, 232, 0, (ops GPRC:$rT, GPRC:$rA),
+                      "subfme $rT, $rA", IntGeneral,
+                      [(set GPRC:$rT, (sube immAllOnes, GPRC:$rA))]>;
+def SUBFZE : XOForm_3<31, 200, 0, (ops GPRC:$rT, GPRC:$rA),
+                      "subfze $rT, $rA", IntGeneral,
+                      [(set GPRC:$rT, (sube 0, GPRC:$rA))]>;
+}
+
+// A-Form instructions.  Most of the instructions executed in the FPU are of
+// this type.
+//
+let PPC970_Unit = 3 in {  // FPU Operations.
+def FMADD : AForm_1<63, 29, 
+                    (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+                    "fmadd $FRT, $FRA, $FRC, $FRB", FPFused,
+                    [(set F8RC:$FRT, (fadd (fmul F8RC:$FRA, F8RC:$FRC),
+                                           F8RC:$FRB))]>,
+                    Requires<[FPContractions]>;
+def FMADDS : AForm_1<59, 29,
+                    (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+                    "fmadds $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                    [(set F4RC:$FRT, (fadd (fmul F4RC:$FRA, F4RC:$FRC),
+                                           F4RC:$FRB))]>,
+                    Requires<[FPContractions]>;
+def FMSUB : AForm_1<63, 28,
+                    (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+                    "fmsub $FRT, $FRA, $FRC, $FRB", FPFused,
+                    [(set F8RC:$FRT, (fsub (fmul F8RC:$FRA, F8RC:$FRC),
+                                           F8RC:$FRB))]>,
+                    Requires<[FPContractions]>;
+def FMSUBS : AForm_1<59, 28,
+                    (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+                    "fmsubs $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                    [(set F4RC:$FRT, (fsub (fmul F4RC:$FRA, F4RC:$FRC),
+                                           F4RC:$FRB))]>,
+                    Requires<[FPContractions]>;
+def FNMADD : AForm_1<63, 31,
+                    (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+                    "fnmadd $FRT, $FRA, $FRC, $FRB", FPFused,
+                    [(set F8RC:$FRT, (fneg (fadd (fmul F8RC:$FRA, F8RC:$FRC),
+                                                 F8RC:$FRB)))]>,
+                    Requires<[FPContractions]>;
+def FNMADDS : AForm_1<59, 31,
+                    (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+                    "fnmadds $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                    [(set F4RC:$FRT, (fneg (fadd (fmul F4RC:$FRA, F4RC:$FRC),
+                                                 F4RC:$FRB)))]>,
+                    Requires<[FPContractions]>;
+def FNMSUB : AForm_1<63, 30,
+                    (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+                    "fnmsub $FRT, $FRA, $FRC, $FRB", FPFused,
+                    [(set F8RC:$FRT, (fneg (fsub (fmul F8RC:$FRA, F8RC:$FRC),
+                                                 F8RC:$FRB)))]>,
+                    Requires<[FPContractions]>;
+def FNMSUBS : AForm_1<59, 30,
+                    (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+                    "fnmsubs $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                    [(set F4RC:$FRT, (fneg (fsub (fmul F4RC:$FRA, F4RC:$FRC),
+                                                 F4RC:$FRB)))]>,
+                    Requires<[FPContractions]>;
+// FSEL is artificially split into 4 and 8-byte forms for the result and
+// operand types.  To avoid having 4 of these, force the comparison to always
+// be an 8-byte double (code should use an FMRSD if the input comparison value
+// really wants to be a float).
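+// (fsel computes FRT = FRC if FRA >= 0.0, otherwise FRB; a NaN in FRA also
+// selects FRB.  This is what the PPCfsel node models.)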
+def FSELD : AForm_1<63, 23,
+                    (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+                    "fsel $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                    [(set F8RC:$FRT, (PPCfsel F8RC:$FRA,F8RC:$FRC,F8RC:$FRB))]>;
+def FSELS : AForm_1<63, 23,
+                     (ops F4RC:$FRT, F8RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+                     "fsel $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                    [(set F4RC:$FRT, (PPCfsel F8RC:$FRA,F4RC:$FRC,F4RC:$FRB))]>;
+def FADD  : AForm_2<63, 21,
+                    (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRB),
+                    "fadd $FRT, $FRA, $FRB", FPGeneral,
+                    [(set F8RC:$FRT, (fadd F8RC:$FRA, F8RC:$FRB))]>;
+def FADDS : AForm_2<59, 21,
+                    (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRB),
+                    "fadds $FRT, $FRA, $FRB", FPGeneral,
+                    [(set F4RC:$FRT, (fadd F4RC:$FRA, F4RC:$FRB))]>;
+def FDIV  : AForm_2<63, 18,
+                    (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRB),
+                    "fdiv $FRT, $FRA, $FRB", FPDivD,
+                    [(set F8RC:$FRT, (fdiv F8RC:$FRA, F8RC:$FRB))]>;
+def FDIVS : AForm_2<59, 18,
+                    (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRB),
+                    "fdivs $FRT, $FRA, $FRB", FPDivS,
+                    [(set F4RC:$FRT, (fdiv F4RC:$FRA, F4RC:$FRB))]>;
+def FMUL  : AForm_3<63, 25,
+                    (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRB),
+                    "fmul $FRT, $FRA, $FRB", FPFused,
+                    [(set F8RC:$FRT, (fmul F8RC:$FRA, F8RC:$FRB))]>;
+def FMULS : AForm_3<59, 25,
+                    (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRB),
+                    "fmuls $FRT, $FRA, $FRB", FPGeneral,
+                    [(set F4RC:$FRT, (fmul F4RC:$FRA, F4RC:$FRB))]>;
+def FSUB  : AForm_2<63, 20,
+                    (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRB),
+                    "fsub $FRT, $FRA, $FRB", FPGeneral,
+                    [(set F8RC:$FRT, (fsub F8RC:$FRA, F8RC:$FRB))]>;
+def FSUBS : AForm_2<59, 20,
+                    (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRB),
+                    "fsubs $FRT, $FRA, $FRB", FPGeneral,
+                    [(set F4RC:$FRT, (fsub F4RC:$FRA, F4RC:$FRB))]>;
+}
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+// M-Form instructions.  Rotate and mask instructions.
+//
+let isCommutable = 1 in {
+// RLWIMI can be commuted if the rotate amount is zero.
+def RLWIMI : MForm_2<20,
+                     (ops GPRC:$rA, GPRC:$rSi, GPRC:$rS, u5imm:$SH, u5imm:$MB, 
+                      u5imm:$ME), "rlwimi $rA, $rS, $SH, $MB, $ME", IntRotate,
+                      []>, PPC970_DGroup_Cracked, RegConstraint<"$rSi = $rA">,
+                      NoEncode<"$rSi">;
+}
+def RLWINM : MForm_2<21,
+                     (ops GPRC:$rA, GPRC:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+                     "rlwinm $rA, $rS, $SH, $MB, $ME", IntGeneral,
+                     []>;
+def RLWINMo : MForm_2<21,
+                     (ops GPRC:$rA, GPRC:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+                     "rlwinm. $rA, $rS, $SH, $MB, $ME", IntGeneral,
+                     []>, isDOT, PPC970_DGroup_Cracked;
+def RLWNM  : MForm_2<23,
+                     (ops GPRC:$rA, GPRC:$rS, GPRC:$rB, u5imm:$MB, u5imm:$ME),
+                     "rlwnm $rA, $rS, $rB, $MB, $ME", IntGeneral,
+                     []>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// DWARF Pseudo Instructions
+//
+
+def DWARF_LOC        : Pseudo<(ops i32imm:$line, i32imm:$col, i32imm:$file),
+                              "${:comment} .loc $file, $line, $col",
+                      [(dwarf_loc (i32 imm:$line), (i32 imm:$col),
+                                  (i32 imm:$file))]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Patterns
+//
+
+// Arbitrary immediate support.  Implement in terms of LIS/ORI.
+def : Pat<(i32 imm:$imm),
+          (ORI (LIS (HI16 imm:$imm)), (LO16 imm:$imm))>;
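+// For example, i32 0x12345678 becomes:
+//   lis r3, 0x1234        ; r3 = 0x12340000
+//   ori r3, r3, 0x5678    ; r3 = 0x12345678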
+
+// Implement the 'not' operation with the NOR instruction.
+def NOT : Pat<(not GPRC:$in),
+              (NOR GPRC:$in, GPRC:$in)>;
+
+// ADD an arbitrary immediate.
+def : Pat<(add GPRC:$in, imm:$imm),
+          (ADDIS (ADDI GPRC:$in, (LO16 imm:$imm)), (HA16 imm:$imm))>;
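+// HA16 is the "high-adjusted" half: since addi sign-extends its immediate,
+// the high half is incremented whenever LO16 has bit 15 set.  E.g. for
+// imm = 0x00018000, addi adds -0x8000 and addis then adds 0x2 << 16,
+// yielding $in + 0x00018000 as intended.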
+// OR an arbitrary immediate.
+def : Pat<(or GPRC:$in, imm:$imm),
+          (ORIS (ORI GPRC:$in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
+// XOR an arbitrary immediate.
+def : Pat<(xor GPRC:$in, imm:$imm),
+          (XORIS (XORI GPRC:$in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
+// SUBFIC
+def : Pat<(sub  immSExt16:$imm, GPRC:$in),
+          (SUBFIC GPRC:$in, imm:$imm)>;
+
+// SHL/SRL
+def : Pat<(shl GPRC:$in, (i32 imm:$imm)),
+          (RLWINM GPRC:$in, imm:$imm, 0, (SHL32 imm:$imm))>;
+def : Pat<(srl GPRC:$in, (i32 imm:$imm)),
+          (RLWINM GPRC:$in, (SRL32 imm:$imm), imm:$imm, 31)>;
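+// SHL32/SRL32 compute the mask bound from the shift amount.  E.g. (shl x, 4)
+// becomes "rlwinm rA, rS, 4, 0, 27": rotate left by 4 and keep bits 0-27
+// (bit 0 being the MSB in PowerPC numbering), clearing the 4 bits rotated
+// into the low end.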
+
+// ROTL
+def : Pat<(rotl GPRC:$in, GPRC:$sh),
+          (RLWNM GPRC:$in, GPRC:$sh, 0, 31)>;
+def : Pat<(rotl GPRC:$in, (i32 imm:$imm)),
+          (RLWINM GPRC:$in, imm:$imm, 0, 31)>;
+
+// RLWNM
+def : Pat<(and (rotl GPRC:$in, GPRC:$sh), maskimm32:$imm),
+          (RLWNM GPRC:$in, GPRC:$sh, (MB maskimm32:$imm), (ME maskimm32:$imm))>;
+
+// Calls
+def : Pat<(PPCcall_Macho (i32 tglobaladdr:$dst)),
+          (BL_Macho tglobaladdr:$dst)>;
+def : Pat<(PPCcall_Macho (i32 texternalsym:$dst)),
+          (BL_Macho texternalsym:$dst)>;
+def : Pat<(PPCcall_ELF (i32 tglobaladdr:$dst)),
+          (BL_ELF tglobaladdr:$dst)>;
+def : Pat<(PPCcall_ELF (i32 texternalsym:$dst)),
+          (BL_ELF texternalsym:$dst)>;
+
+// Hi and Lo for Darwin Global Addresses.
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS tglobaladdr:$in)>;
+def : Pat<(PPClo tglobaladdr:$in, 0), (LI tglobaladdr:$in)>;
+def : Pat<(PPChi tconstpool:$in, 0), (LIS tconstpool:$in)>;
+def : Pat<(PPClo tconstpool:$in, 0), (LI tconstpool:$in)>;
+def : Pat<(PPChi tjumptable:$in, 0), (LIS tjumptable:$in)>;
+def : Pat<(PPClo tjumptable:$in, 0), (LI tjumptable:$in)>;
+def : Pat<(add GPRC:$in, (PPChi tglobaladdr:$g, 0)),
+          (ADDIS GPRC:$in, tglobaladdr:$g)>;
+def : Pat<(add GPRC:$in, (PPChi tconstpool:$g, 0)),
+          (ADDIS GPRC:$in, tconstpool:$g)>;
+def : Pat<(add GPRC:$in, (PPChi tjumptable:$g, 0)),
+          (ADDIS GPRC:$in, tjumptable:$g)>;
+
+// Fused negative multiply subtract, alternate pattern
+def : Pat<(fsub F8RC:$B, (fmul F8RC:$A, F8RC:$C)),
+          (FNMSUB F8RC:$A, F8RC:$C, F8RC:$B)>,
+          Requires<[FPContractions]>;
+def : Pat<(fsub F4RC:$B, (fmul F4RC:$A, F4RC:$C)),
+          (FNMSUBS F4RC:$A, F4RC:$C, F4RC:$B)>,
+          Requires<[FPContractions]>;
+
+// Standard shifts.  These are represented separately from the real shifts above
+// so that we can distinguish between shifts that allow 5-bit and 6-bit shift
+// amounts.
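+// (Presumably the PPC-specific nodes above come from lowerings, such as the
+// 32-bit expansion of 64-bit shifts, that can present a 6-bit amount; slw,
+// srw and sraw give defined results for amounts of 32-63, while the generic
+// nodes never see such amounts.)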
+def : Pat<(sra GPRC:$rS, GPRC:$rB),
+          (SRAW GPRC:$rS, GPRC:$rB)>;
+def : Pat<(srl GPRC:$rS, GPRC:$rB),
+          (SRW GPRC:$rS, GPRC:$rB)>;
+def : Pat<(shl GPRC:$rS, GPRC:$rB),
+          (SLW GPRC:$rS, GPRC:$rB)>;
+
+def : Pat<(zextloadi1 iaddr:$src),
+          (LBZ iaddr:$src)>;
+def : Pat<(zextloadi1 xaddr:$src),
+          (LBZX xaddr:$src)>;
+def : Pat<(extloadi1 iaddr:$src),
+          (LBZ iaddr:$src)>;
+def : Pat<(extloadi1 xaddr:$src),
+          (LBZX xaddr:$src)>;
+def : Pat<(extloadi8 iaddr:$src),
+          (LBZ iaddr:$src)>;
+def : Pat<(extloadi8 xaddr:$src),
+          (LBZX xaddr:$src)>;
+def : Pat<(extloadi16 iaddr:$src),
+          (LHZ iaddr:$src)>;
+def : Pat<(extloadi16 xaddr:$src),
+          (LHZX xaddr:$src)>;
+def : Pat<(extloadf32 iaddr:$src),
+          (FMRSD (LFS iaddr:$src))>;
+def : Pat<(extloadf32 xaddr:$src),
+          (FMRSD (LFSX xaddr:$src))>;
+
+include "PPCInstrAltivec.td"
+include "PPCInstr64Bit.td"
diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp
new file mode 100644
index 0000000..acaed0b
--- /dev/null
+++ b/lib/Target/PowerPC/PPCJITInfo.cpp
@@ -0,0 +1,429 @@
+//===-- PPCJITInfo.cpp - Implement the JIT interfaces for the PowerPC -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the 32-bit PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "PPCJITInfo.h"
+#include "PPCRelocations.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/Config/alloca.h"
+#include "llvm/Support/Debug.h"
+#include <set>
+using namespace llvm;
+
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+#define BUILD_ADDIS(RD,RS,IMM16) \
+  ((15 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535))
+#define BUILD_ORI(RD,RS,UIMM16) \
+  ((24 << 26) | ((RS) << 21) | ((RD) << 16) | ((UIMM16) & 65535))
+#define BUILD_ORIS(RD,RS,UIMM16) \
+  ((25 << 26) | ((RS) << 21) | ((RD) << 16) | ((UIMM16) & 65535))
+#define BUILD_RLDICR(RD,RS,SH,ME) \
+  ((30 << 26) | ((RS) << 21) | ((RD) << 16) | (((SH) & 31) << 11) | \
+   (((ME) & 63) << 6) | (1 << 2) | ((((SH) >> 5) & 1) << 1))
+#define BUILD_MTSPR(RS,SPR)      \
+  ((31 << 26) | ((RS) << 21) | ((SPR) << 16) | (467 << 1))
+#define BUILD_BCCTRx(BO,BI,LINK) \
+  ((19 << 26) | ((BO) << 21) | ((BI) << 16) | (528 << 1) | ((LINK) & 1))
+#define BUILD_B(TARGET, LINK) \
+  ((18 << 26) | (((TARGET) & 0x00FFFFFF) << 2) | ((LINK) & 1))
+
+// Pseudo-ops
+#define BUILD_LIS(RD,IMM16)    BUILD_ADDIS(RD,0,IMM16)
+#define BUILD_SLDI(RD,RS,IMM6) BUILD_RLDICR(RD,RS,IMM6,63-IMM6)
+#define BUILD_MTCTR(RS)        BUILD_MTSPR(RS,9)
+#define BUILD_BCTR(LINK)       BUILD_BCCTRx(20,0,LINK)
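+// These macros assemble raw 32-bit PowerPC instruction words.  For example,
+// BUILD_LIS(12, 0x1234) encodes "lis r12, 0x1234", and BUILD_B(Offset, 1)
+// encodes a "bl" to the given signed 24-bit word offset.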
+
+static void EmitBranchToAt(uint64_t At, uint64_t To, bool isCall, bool is64Bit){
+  intptr_t Offset = ((intptr_t)To - (intptr_t)At) >> 2;
+  unsigned *AtI = (unsigned*)(intptr_t)At;
+
+  if (Offset >= -(1 << 23) && Offset < (1 << 23)) {   // In range?
+    AtI[0] = BUILD_B(Offset, isCall);     // b/bl target
+  } else if (!is64Bit) {
+    AtI[0] = BUILD_LIS(12, To >> 16);     // lis r12, hi16(address)
+    AtI[1] = BUILD_ORI(12, 12, To);       // ori r12, r12, lo16(address)
+    AtI[2] = BUILD_MTCTR(12);             // mtctr r12
+    AtI[3] = BUILD_BCTR(isCall);          // bctr/bctrl
+  } else {
+    AtI[0] = BUILD_LIS(12, To >> 48);      // lis r12, hi16(address)
+    AtI[1] = BUILD_ORI(12, 12, To >> 32);  // ori r12, r12, lo16(address)
+    AtI[2] = BUILD_SLDI(12, 12, 32);       // sldi r12, r12, 32
+    AtI[3] = BUILD_ORIS(12, 12, To >> 16); // oris r12, r12, hi16(address)
+    AtI[4] = BUILD_ORI(12, 12, To);        // ori r12, r12, lo16(address)
+    AtI[5] = BUILD_MTCTR(12);              // mtctr r12
+    AtI[6] = BUILD_BCTR(isCall);           // bctr/bctrl
+  }
+}
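+// A target within +/-32MB of 'At' takes a single b/bl; otherwise the address
+// is materialized in r12 and reached via mtctr/bctr (4 instructions for the
+// 32-bit case, 7 for 64-bit), which is presumably why the stubs emitted below
+// reserve 7 instruction words.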
+
+extern "C" void PPC32CompilationCallback();
+extern "C" void PPC64CompilationCallback();
+
+#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \
+    !defined(__ppc64__)
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us.  Instead,
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring.
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl _PPC32CompilationCallback\n"
+"_PPC32CompilationCallback:\n"
+    // Make space for 8 ints r[3-10] and 13 doubles f[1-13].
+    // FIXME: need to save v[0-19] for altivec?
+    // FIXME: could shrink frame
+    // Set up a proper stack frame
+    // FIXME Layout
+    //   PowerPC64 ABI linkage    -  24 bytes
+    //                 parameters -  32 bytes
+    //   13 double registers      - 104 bytes
+    //   8 int registers          -  32 bytes
+    "mflr r0\n"
+    "stw r0,  8(r1)\n"
+    "stwu r1, -208(r1)\n"
+    // Save all int arg registers
+    "stw r10, 204(r1)\n"    "stw r9,  200(r1)\n"
+    "stw r8,  196(r1)\n"    "stw r7,  192(r1)\n"
+    "stw r6,  188(r1)\n"    "stw r5,  184(r1)\n"
+    "stw r4,  180(r1)\n"    "stw r3,  176(r1)\n"
+    // Save all call-clobbered FP regs.
+    "stfd f13, 168(r1)\n"   "stfd f12, 160(r1)\n"
+    "stfd f11, 152(r1)\n"   "stfd f10, 144(r1)\n"
+    "stfd f9,  136(r1)\n"   "stfd f8,  128(r1)\n"
+    "stfd f7,  120(r1)\n"   "stfd f6,  112(r1)\n"
+    "stfd f5,  104(r1)\n"   "stfd f4,   96(r1)\n"
+    "stfd f3,   88(r1)\n"   "stfd f2,   80(r1)\n"
+    "stfd f1,   72(r1)\n"
+    // Arguments to Compilation Callback:
+    // r3 - our lr (address of the call instruction in stub plus 4)
+    // r4 - stub's lr (address of instruction that called the stub plus 4)
+    // r5 - is64Bit - always 0.
+    "mr   r3, r0\n"
+    "lwz  r2, 208(r1)\n" // stub's frame
+    "lwz  r4, 8(r2)\n" // stub's lr
+    "li   r5, 0\n"       // 0 == 32 bit
+    "bl _PPCCompilationCallbackC\n"
+    "mtctr r3\n"
+    // Restore all int arg registers
+    "lwz r10, 204(r1)\n"    "lwz r9,  200(r1)\n"
+    "lwz r8,  196(r1)\n"    "lwz r7,  192(r1)\n"
+    "lwz r6,  188(r1)\n"    "lwz r5,  184(r1)\n"
+    "lwz r4,  180(r1)\n"    "lwz r3,  176(r1)\n"
+    // Restore all FP arg registers
+    "lfd f13, 168(r1)\n"    "lfd f12, 160(r1)\n"
+    "lfd f11, 152(r1)\n"    "lfd f10, 144(r1)\n"
+    "lfd f9,  136(r1)\n"    "lfd f8,  128(r1)\n"
+    "lfd f7,  120(r1)\n"    "lfd f6,  112(r1)\n"
+    "lfd f5,  104(r1)\n"    "lfd f4,   96(r1)\n"
+    "lfd f3,   88(r1)\n"    "lfd f2,   80(r1)\n"
+    "lfd f1,   72(r1)\n"
+    // Pop 3 frames off the stack and branch to target
+    "lwz  r1, 208(r1)\n"
+    "lwz  r2, 8(r1)\n"
+    "mtlr r2\n"
+    "bctr\n"
+    );
+
+#elif defined(__PPC__) && !defined(__ppc64__)
+// Linux/PPC support
+
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us.  Instead,
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring.
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl PPC32CompilationCallback\n"
+"PPC32CompilationCallback:\n"
+    // Make space for 8 ints r[3-10] and 8 doubles f[1-8].
+    // FIXME: need to save v[0-19] for altivec?
+    // FIXME: could shrink frame
+    // Set up a proper stack frame
+    // FIXME Layout
+    //   8 double registers       -  64 bytes
+    //   8 int registers          -  32 bytes
+    "mflr 0\n"
+    "stw 0,  4(1)\n"
+    "stwu 1, -104(1)\n"
+    // Save all int arg registers
+    "stw 10, 100(1)\n"   "stw 9,  96(1)\n"
+    "stw 8,  92(1)\n"    "stw 7,  88(1)\n"
+    "stw 6,  84(1)\n"    "stw 5,  80(1)\n"
+    "stw 4,  76(1)\n"    "stw 3,  72(1)\n"
+    // Save all call-clobbered FP regs.
+    "stfd 8,  64(1)\n"
+    "stfd 7,  56(1)\n"   "stfd 6,  48(1)\n"
+    "stfd 5,  40(1)\n"   "stfd 4,  32(1)\n"
+    "stfd 3,  24(1)\n"   "stfd 2,  16(1)\n"
+    "stfd 1,  8(1)\n"
+    // Arguments to Compilation Callback:
+    // r3 - our lr (address of the call instruction in stub plus 4)
+    // r4 - stub's lr (address of instruction that called the stub plus 4)
+    // r5 - is64Bit - always 0.
+    "mr   3, 0\n"
+    "lwz  5, 104(1)\n" // stub's frame
+    "lwz  4, 4(5)\n" // stub's lr
+    "li   5, 0\n"       // 0 == 32 bit
+    "bl PPCCompilationCallbackC\n"
+    "mtctr 3\n"
+    // Restore all int arg registers
+    "lwz 10, 100(1)\n"   "lwz 9,  96(1)\n"
+    "lwz 8,  92(1)\n"    "lwz 7,  88(1)\n"
+    "lwz 6,  84(1)\n"    "lwz 5,  80(1)\n"
+    "lwz 4,  76(1)\n"    "lwz 3,  72(1)\n"
+    // Restore all FP arg registers
+    "lfd 8,  64(1)\n"
+    "lfd 7,  56(1)\n"    "lfd 6,  48(1)\n"
+    "lfd 5,  40(1)\n"    "lfd 4,  32(1)\n"
+    "lfd 3,  24(1)\n"    "lfd 2,  16(1)\n"
+    "lfd 1,  8(1)\n"
+    // Pop 3 frames off the stack and branch to target
+    "lwz  1, 104(1)\n"
+    "lwz  0, 4(1)\n"
+    "mtlr 0\n"
+    "bctr\n"
+    );
+#else
+void PPC32CompilationCallback() {
+  assert(0 && "This is not a power pc, you can't execute this!");
+  abort();
+}
+#endif
+
+#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \
+    defined(__ppc64__)
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl _PPC64CompilationCallback\n"
+"_PPC64CompilationCallback:\n"
+    // Make space for 8 ints r[3-10] and 13 doubles f[1-13].
+    // FIXME: need to save v[0-19] for altivec?
+    // Set up a proper stack frame
+    // Layout
+    //   PowerPC64 ABI linkage    -  48 bytes
+    //                 parameters -  64 bytes
+    //   13 double registers      - 104 bytes
+    //   8 int registers          -  64 bytes
+    "mflr r0\n"
+    "std r0,  16(r1)\n"
+    "stdu r1, -280(r1)\n"
+    // Save all int arg registers
+    "std r10, 272(r1)\n"    "std r9,  264(r1)\n"
+    "std r8,  256(r1)\n"    "std r7,  248(r1)\n"
+    "std r6,  240(r1)\n"    "std r5,  232(r1)\n"
+    "std r4,  224(r1)\n"    "std r3,  216(r1)\n"
+    // Save all call-clobbered FP regs.
+    "stfd f13, 208(r1)\n"    "stfd f12, 200(r1)\n"
+    "stfd f11, 192(r1)\n"    "stfd f10, 184(r1)\n"
+    "stfd f9,  176(r1)\n"    "stfd f8,  168(r1)\n"
+    "stfd f7,  160(r1)\n"    "stfd f6,  152(r1)\n"
+    "stfd f5,  144(r1)\n"    "stfd f4,  136(r1)\n"
+    "stfd f3,  128(r1)\n"    "stfd f2,  120(r1)\n"
+    "stfd f1,  112(r1)\n"
+    // Arguments to Compilation Callback:
+    // r3 - our lr (address of the call instruction in stub plus 4)
+    // r4 - stub's lr (address of instruction that called the stub plus 4)
+    // r5 - is64Bit - always 1.
+    "mr   r3, r0\n"
+    "ld   r2, 280(r1)\n" // stub's frame
+    "ld   r4, 16(r2)\n"  // stub's lr
+    "li   r5, 1\n"       // 1 == 64 bit
+    "bl _PPCCompilationCallbackC\n"
+    "mtctr r3\n"
+    // Restore all int arg registers
+    "ld r10, 272(r1)\n"    "ld r9,  264(r1)\n"
+    "ld r8,  256(r1)\n"    "ld r7,  248(r1)\n"
+    "ld r6,  240(r1)\n"    "ld r5,  232(r1)\n"
+    "ld r4,  224(r1)\n"    "ld r3,  216(r1)\n"
+    // Restore all FP arg registers
+    "lfd f13, 208(r1)\n"    "lfd f12, 200(r1)\n"
+    "lfd f11, 192(r1)\n"    "lfd f10, 184(r1)\n"
+    "lfd f9,  176(r1)\n"    "lfd f8,  168(r1)\n"
+    "lfd f7,  160(r1)\n"    "lfd f6,  152(r1)\n"
+    "lfd f5,  144(r1)\n"    "lfd f4,  136(r1)\n"
+    "lfd f3,  128(r1)\n"    "lfd f2,  120(r1)\n"
+    "lfd f1,  112(r1)\n"
+    // Pop 3 frames off the stack and branch to target
+    "ld  r1, 280(r1)\n"
+    "ld  r2, 16(r1)\n"
+    "mtlr r2\n"
+    "bctr\n"
+    );
+#else
+void PPC64CompilationCallback() {
+  assert(0 && "This is not a power pc, you can't execute this!");
+  abort();
+}
+#endif
+
+extern "C" void *PPCCompilationCallbackC(unsigned *StubCallAddrPlus4,
+                                         unsigned *OrigCallAddrPlus4,
+                                         bool is64Bit) {
+  // Adjust the pointers to the addresses of the call instructions themselves
+  // (in the stub and at the original call site), not the instructions after.
+  unsigned *StubCallAddr = StubCallAddrPlus4 - 1;
+  unsigned *OrigCallAddr = OrigCallAddrPlus4 - 1;
+
+  void *Target = JITCompilerFunction(StubCallAddr);
+
+  // Check to see if *OrigCallAddr is a 'bl' instruction, and if we can rewrite
+  // it to branch directly to the destination.  If so, rewrite it so it does not
+  // need to go through the stub anymore.
+  unsigned OrigCallInst = *OrigCallAddr;
+  if ((OrigCallInst >> 26) == 18) {     // Direct call.
+    intptr_t Offset = ((intptr_t)Target - (intptr_t)OrigCallAddr) >> 2;
+    
+    if (Offset >= -(1 << 23) && Offset < (1 << 23)) {   // In range?
+      // Clear the original target out.
+      OrigCallInst &= (63 << 26) | 3;
+      // Fill in the new target.
+      OrigCallInst |= (Offset & ((1 << 24)-1)) << 2;
+      // Replace the call.
+      *OrigCallAddr = OrigCallInst;
+    }
+  }
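+  // That is, the 24-bit LI displacement field of the "bl" is replaced with
+  // the new word offset; the (63 << 26) | 3 mask preserves the opcode and
+  // the AA/LK bits.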
+
+  // Assert that we are coming from a stub that was created with our
+  // emitFunctionStub.
+  if ((*StubCallAddr >> 26) == 18)
+    StubCallAddr -= 3;
+  else {
+    assert((*StubCallAddr >> 26) == 19 && "Call in stub is not indirect!");
+    StubCallAddr -= is64Bit ? 9 : 6;
+  }
+
+  // Rewrite the stub with an unconditional branch to the target, for any users
+  // who took the address of the stub.
+  EmitBranchToAt((intptr_t)StubCallAddr, (intptr_t)Target, false, is64Bit);
+
+  // Return the address of the target function; the assembly callback stub
+  // that called us loads it into CTR and branches to it after restoring all
+  // of the saved registers.
+  return Target;
+}
+
+
+
+TargetJITInfo::LazyResolverFn
+PPCJITInfo::getLazyResolverFunction(JITCompilerFn Fn) {
+  JITCompilerFunction = Fn;
+  return is64Bit ? PPC64CompilationCallback : PPC32CompilationCallback;
+}
+
+void *PPCJITInfo::emitFunctionStub(void *Fn, MachineCodeEmitter &MCE) {
+  // If this is just a call to an external function, emit a branch instead of a
+  // call.  The code is the same except for one bit of the last instruction.
+  if (Fn != (void*)(intptr_t)PPC32CompilationCallback && 
+      Fn != (void*)(intptr_t)PPC64CompilationCallback) {
+    MCE.startFunctionStub(7*4);
+    intptr_t Addr = (intptr_t)MCE.getCurrentPCValue();
+    MCE.emitWordBE(0);
+    MCE.emitWordBE(0);
+    MCE.emitWordBE(0);
+    MCE.emitWordBE(0);
+    MCE.emitWordBE(0);
+    MCE.emitWordBE(0);
+    MCE.emitWordBE(0);
+    EmitBranchToAt(Addr, (intptr_t)Fn, false, is64Bit);
+    return MCE.finishFunctionStub(0);
+  }
+
+  MCE.startFunctionStub(10*4);
+  if (is64Bit) {
+    MCE.emitWordBE(0xf821ffb1);     // stdu r1,-80(r1)
+    MCE.emitWordBE(0x7d6802a6);     // mflr r11
+    MCE.emitWordBE(0xf9610060);     // std r11, 96(r1)
+  } else if (TM.getSubtargetImpl()->isMachoABI()){
+    MCE.emitWordBE(0x9421ffe0);     // stwu r1,-32(r1)
+    MCE.emitWordBE(0x7d6802a6);     // mflr r11
+    MCE.emitWordBE(0x91610028);     // stw r11, 40(r1)
+  } else {
+    MCE.emitWordBE(0x9421ffe0);     // stwu r1,-32(r1)
+    MCE.emitWordBE(0x7d6802a6);     // mflr r11
+    MCE.emitWordBE(0x91610024);     // stw r11, 36(r1)
+  }
+  intptr_t Addr = (intptr_t)MCE.getCurrentPCValue();
+  MCE.emitWordBE(0);
+  MCE.emitWordBE(0);
+  MCE.emitWordBE(0);
+  MCE.emitWordBE(0);
+  MCE.emitWordBE(0);
+  MCE.emitWordBE(0);
+  MCE.emitWordBE(0);
+  EmitBranchToAt(Addr, (intptr_t)Fn, true, is64Bit);
+  return MCE.finishFunctionStub(0);
+}
+
+
+void PPCJITInfo::relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    unsigned *RelocPos = (unsigned*)Function + MR->getMachineCodeOffset()/4;
+    intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
+    switch ((PPC::RelocationType)MR->getRelocationType()) {
+    default: assert(0 && "Unknown relocation type!");
+    case PPC::reloc_pcrel_bx:
+      // PC-relative relocation for b and bl instructions.
+      ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2;
+      assert(ResultPtr >= -(1 << 23) && ResultPtr < (1 << 23) &&
+             "Relocation out of range!");
+      *RelocPos |= (ResultPtr & ((1 << 24)-1))  << 2;
+      break;
+    case PPC::reloc_pcrel_bcx:
+      // PC-relative relocation for BLT,BLE,BEQ,BGE,BGT,BNE, or other
+      // bcx instructions.
+      ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2;
+      assert(ResultPtr >= -(1 << 13) && ResultPtr < (1 << 13) &&
+             "Relocation out of range!");
+      *RelocPos |= (ResultPtr & ((1 << 14)-1))  << 2;
+      break;
+    case PPC::reloc_absolute_high:     // high bits of ref -> low 16 of instr
+    case PPC::reloc_absolute_low: {    // low bits of ref  -> low 16 of instr
+      ResultPtr += MR->getConstantVal();
+
+      // If this is a high-part access, get the high-part.
+      if (MR->getRelocationType() == PPC::reloc_absolute_high) {
+        // If the low part will have a carry (really a borrow) from the low
+        // 16-bits into the high 16, add a bit to borrow from.
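+        // E.g. for a target of 0x12348001 the low half 0x8001 is negative
+        // when sign-extended, so the high half becomes 0x1235 rather than
+        // 0x1234: (0x1235 << 16) + (int16_t)0x8001 == 0x12348001.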
+        if (((int)ResultPtr << 16) < 0)
+          ResultPtr += 1 << 16;
+        ResultPtr >>= 16;
+      }
+
+      // Do the addition then mask, so the addition does not overflow the 16-bit
+      // immediate section of the instruction.
+      unsigned LowBits  = (*RelocPos + ResultPtr) & 65535;
+      unsigned HighBits = *RelocPos & ~65535;
+      *RelocPos = LowBits | HighBits;  // Slam into low 16-bits
+      break;
+    }
+    case PPC::reloc_absolute_low_ix: {  // low bits of ref  -> low 14 of instr
+      ResultPtr += MR->getConstantVal();
+      // Do the addition then mask, so the addition does not overflow the 16-bit
+      // immediate section of the instruction.
+      unsigned LowBits  = (*RelocPos + ResultPtr) & 0xFFFC;
+      unsigned HighBits = *RelocPos & 0xFFFF0003;
+      *RelocPos = LowBits | HighBits;  // Slam into low 14-bits.
+      break;
+    }
+    }
+  }
+}
+
+void PPCJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  EmitBranchToAt((intptr_t)Old, (intptr_t)New, false, is64Bit);
+}
diff --git a/lib/Target/PowerPC/PPCJITInfo.h b/lib/Target/PowerPC/PPCJITInfo.h
new file mode 100644
index 0000000..66ee0ee
--- /dev/null
+++ b/lib/Target/PowerPC/PPCJITInfo.h
@@ -0,0 +1,46 @@
+//===- PPCJITInfo.h - PowerPC impl. of the JIT interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_JITINFO_H
+#define POWERPC_JITINFO_H
+
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+  class PPCTargetMachine;
+
+  class PPCJITInfo : public TargetJITInfo {
+  protected:
+    PPCTargetMachine &TM;
+    bool is64Bit;
+  public:
+    PPCJITInfo(PPCTargetMachine &tm, bool tmIs64Bit) : TM(tm) {
+      useGOT = 0;
+      is64Bit = tmIs64Bit;
+    }
+
+    virtual void *emitFunctionStub(void *Fn, MachineCodeEmitter &MCE);
+    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+    virtual void relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase);
+    
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+    /// code.
+    ///
+    virtual void replaceMachineCodeForFunction(void *Old, void *New);
+  };
+}
+
+#endif
diff --git a/lib/Target/PowerPC/PPCMachOWriterInfo.cpp b/lib/Target/PowerPC/PPCMachOWriterInfo.cpp
new file mode 100644
index 0000000..5e2dc9e
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMachOWriterInfo.cpp
@@ -0,0 +1,150 @@
+//===-- PPCMachOWriterInfo.cpp - Mach-O Writer Info for the PowerPC -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bill Wendling and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Mach-O writer information for the PowerPC backend.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMachOWriterInfo.h"
+#include "PPCRelocations.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachORelocation.h"
+#include "llvm/Support/OutputBuffer.h"
+using namespace llvm;
+
+PPCMachOWriterInfo::PPCMachOWriterInfo(const PPCTargetMachine &TM)
+  : TargetMachOWriterInfo(TM.getTargetData()->getPointerSizeInBits() == 64 ?
+                          HDR_CPU_TYPE_POWERPC64 :
+                          HDR_CPU_TYPE_POWERPC,
+                          HDR_CPU_SUBTYPE_POWERPC_ALL) {}
+PPCMachOWriterInfo::~PPCMachOWriterInfo() {}
+
+/// GetTargetRelocation - For the MachineRelocation MR, convert it to one or
+/// more PowerPC MachORelocation(s), add the new relocations to the
+/// MachOSection, and rewrite the instruction at the section offset if required
+/// by that relocation type.
+unsigned PPCMachOWriterInfo::GetTargetRelocation(MachineRelocation &MR,
+                                                 unsigned FromIdx,
+                                                 unsigned ToAddr,
+                                                 unsigned ToIdx,
+                                                 OutputBuffer &RelocOut,
+                                                 OutputBuffer &SecOut,
+                                                 bool Scattered,
+                                                 bool isExtern) const {
+  unsigned NumRelocs = 0;
+  uint64_t Addr = 0;
+
+  // Get the address of whatever it is we're relocating, if possible.
+  if (!isExtern)
+    Addr = (uintptr_t)MR.getResultPointer() + ToAddr;
+
+  switch ((PPC::RelocationType)MR.getRelocationType()) {
+  default: assert(0 && "Unknown PPC relocation type!");
+  case PPC::reloc_absolute_low_ix:
+    assert(0 && "Unhandled PPC relocation type!");
+    break;
+  case PPC::reloc_vanilla:
+    {
+      // FIXME: need to handle 64 bit vanilla relocs
+      MachORelocation VANILLA(MR.getMachineCodeOffset(), ToIdx,
+                              false, 2, isExtern,
+                              PPC_RELOC_VANILLA,
+                              Scattered, (intptr_t)MR.getResultPointer());
+      ++NumRelocs;
+
+      if (Scattered) {
+        RelocOut.outword(VANILLA.getPackedFields());
+        RelocOut.outword(VANILLA.getAddress());
+      } else {
+        RelocOut.outword(VANILLA.getAddress());
+        RelocOut.outword(VANILLA.getPackedFields());
+      }
+      
+      intptr_t SymbolOffset;
+
+      if (Scattered)
+        SymbolOffset = Addr + MR.getConstantVal();
+      else
+        SymbolOffset = Addr;
+
+      printf("vanilla fixup: sec_%x[%x] = %x\n", FromIdx,
+             unsigned(MR.getMachineCodeOffset()),
+             unsigned(SymbolOffset));
+      SecOut.fixword(SymbolOffset, MR.getMachineCodeOffset());
+    }
+    break;
+  case PPC::reloc_pcrel_bx:
+    {
+      // FIXME: Presumably someday we will need to branch to other, non-extern
+      // functions too.  Need to figure out some way to distinguish between a
+      // target that is a basic block and a target that is a function.
+      if (isExtern) {
+        MachORelocation BR24(MR.getMachineCodeOffset(), ToIdx, true, 2, 
+                             isExtern, PPC_RELOC_BR24, Scattered, 
+                             (intptr_t)MR.getMachineCodeOffset());
+        RelocOut.outword(BR24.getAddress());
+        RelocOut.outword(BR24.getPackedFields());
+        ++NumRelocs;
+      }
+
+      Addr -= MR.getMachineCodeOffset();
+      Addr >>= 2;
+      Addr &= 0xFFFFFF;
+      Addr <<= 2;
+      Addr |= (SecOut[MR.getMachineCodeOffset()] << 24);
+      Addr |= (SecOut[MR.getMachineCodeOffset()+3] & 0x3);
+      SecOut.fixword(Addr, MR.getMachineCodeOffset());
+      break;
+    }
+  case PPC::reloc_pcrel_bcx:
+    {
+      Addr -= MR.getMachineCodeOffset();
+      Addr &= 0xFFFC;
+
+      SecOut.fixhalf(Addr, MR.getMachineCodeOffset() + 2);
+      break;
+    }
+  case PPC::reloc_absolute_high:
+    {
+      MachORelocation HA16(MR.getMachineCodeOffset(), ToIdx, false, 2,
+                           isExtern, PPC_RELOC_HA16);
+      MachORelocation PAIR(Addr & 0xFFFF, 0xFFFFFF, false, 2, isExtern,
+                           PPC_RELOC_PAIR);
+      NumRelocs = 2;
+
+      RelocOut.outword(HA16.getRawAddress());
+      RelocOut.outword(HA16.getPackedFields());
+      RelocOut.outword(PAIR.getRawAddress());
+      RelocOut.outword(PAIR.getPackedFields());
+
+      Addr += 0x8000;
+
+      SecOut.fixhalf(Addr >> 16, MR.getMachineCodeOffset() + 2);
+      break;
+    }
+  case PPC::reloc_absolute_low:
+    {
+      MachORelocation LO16(MR.getMachineCodeOffset(), ToIdx, false, 2,
+                           isExtern, PPC_RELOC_LO16);
+      MachORelocation PAIR(Addr >> 16, 0xFFFFFF, false, 2, isExtern,
+                           PPC_RELOC_PAIR);
+      NumRelocs = 2;
+
+      RelocOut.outword(LO16.getRawAddress());
+      RelocOut.outword(LO16.getPackedFields());
+      RelocOut.outword(PAIR.getRawAddress());
+      RelocOut.outword(PAIR.getPackedFields());
+
+      SecOut.fixhalf(Addr, MR.getMachineCodeOffset() + 2);
+      break;
+    }
+  }
+
+  return NumRelocs;
+}
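A note on the reloc_absolute_high case above: the extra `Addr += 0x8000` exists because the matching low half written for reloc_absolute_low is consumed as a signed 16-bit immediate when the address is reassembled (e.g. by an lis/addi pair), so the high half has to be rounded up whenever bit 15 of the address is set. A minimal standalone sketch of that identity follows; the helper names are illustrative and not part of this patch.

#include <cassert>
#include <cstdint>

// ha16/lo16 split used for absolute 32-bit addresses built with an lis/addi
// (or lis/lwz) pair: the low half is sign-extended when the two pieces are
// recombined, so the high half must be rounded up when bit 15 is set.
static uint16_t ha16(uint32_t Addr) { return uint16_t((Addr + 0x8000) >> 16); }
static uint16_t lo16(uint32_t Addr) { return uint16_t(Addr & 0xFFFF); }

int main() {
  const uint32_t Tests[] = { 0x12340000U, 0x12348000U, 0xDEADBEEFU };
  for (unsigned i = 0; i != sizeof(Tests)/sizeof(Tests[0]); ++i) {
    uint32_t Addr = Tests[i];
    uint32_t Rebuilt = ((uint32_t)ha16(Addr) << 16) + (uint32_t)(int16_t)lo16(Addr);
    assert(Rebuilt == Addr);  // high half + sign-extended low half round-trips
  }
  return 0;
}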
diff --git a/lib/Target/PowerPC/PPCMachOWriterInfo.h b/lib/Target/PowerPC/PPCMachOWriterInfo.h
new file mode 100644
index 0000000..69ed9f7
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMachOWriterInfo.h
@@ -0,0 +1,55 @@
+//===-- PPCMachOWriterInfo.h - Mach-O Writer Info for PowerPC ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bill Wendling and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Mach-O writer information for the PowerPC backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPC_MACHO_WRITER_INFO_H
+#define PPC_MACHO_WRITER_INFO_H
+
+#include "llvm/Target/TargetMachOWriterInfo.h"
+
+namespace llvm {
+
+  // Forward declarations
+  class MachineRelocation;
+  class OutputBuffer;
+  class PPCTargetMachine;
+
+  class PPCMachOWriterInfo : public TargetMachOWriterInfo {
+  public:
+    PPCMachOWriterInfo(const PPCTargetMachine &TM);
+    virtual ~PPCMachOWriterInfo();
+
+    virtual unsigned GetTargetRelocation(MachineRelocation &MR,
+                                         unsigned FromIdx,
+                                         unsigned ToAddr,
+                                         unsigned ToIdx,
+                                         OutputBuffer &RelocOut,
+                                         OutputBuffer &SecOut,
+                                         bool Scattered, bool Extern) const;
+
+    // Constants for the relocation r_type field.
+    // See <mach-o/ppc/reloc.h>
+    enum {
+      PPC_RELOC_VANILLA, // generic relocation
+      PPC_RELOC_PAIR,    // the second relocation entry of a pair
+      PPC_RELOC_BR14,    // 14 bit branch displacement to word address
+      PPC_RELOC_BR24,    // 24 bit branch displacement to word address
+      PPC_RELOC_HI16,    // a PAIR follows with the low 16 bits
+      PPC_RELOC_LO16,    // a PAIR follows with the high 16 bits
+      PPC_RELOC_HA16,    // a PAIR follows, which is sign extended to 32b
+      PPC_RELOC_LO14     // LO16 with low 2 bits implicitly zero
+    };
+  };
+
+} // end llvm namespace
+
+#endif // PPC_MACHO_WRITER_INFO_H
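The PPC_RELOC_* values above end up in the four-bit r_type field of a Mach-O relocation entry. For reference, the non-scattered record layout from <mach-o/reloc.h> is sketched below; it also suggests why the length argument passed to MachORelocation in the writer code is 2 (log2 of a 4-byte field), assuming that constructor argument maps to r_length.

#include <cstdint>

// Non-scattered Mach-O relocation entry, as documented in <mach-o/reloc.h>.
// The PPC_RELOC_* enumeration above supplies the r_type values; r_length is
// log2 of the relocated field's size, so 2 denotes a 4-byte field.
struct relocation_info {
  int32_t  r_address;         // offset in the section to what is being relocated
  uint32_t r_symbolnum : 24,  // symbol table index if r_extern, else section ordinal
           r_pcrel     :  1,  // item already relocated pc-relative
           r_length    :  2,  // 0 = byte, 1 = word, 2 = long
           r_extern    :  1,  // relocation refers to an external symbol
           r_type      :  4;  // one of the PPC_RELOC_* values
};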
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
new file mode 100644
index 0000000..e227456
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -0,0 +1,50 @@
+//===-- PPCMachineFunctionInfo.h - Private data used for PowerPC --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPC_MACHINE_FUNCTION_INFO_H
+#define PPC_MACHINE_FUNCTION_INFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// PPCFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private, PowerPC target-specific information for each
+/// MachineFunction.
+class PPCFunctionInfo : public MachineFunctionInfo {
+private:
+  /// FramePointerSaveIndex - Frame index of where the old frame pointer is
+  /// stored.  Also used as an anchor for instructions that need to be altered
+  /// when using frame pointers (dyna_add, dyna_sub).
+  int FramePointerSaveIndex;
+  
+  /// UsesLR - Indicates whether LR is used in the current function.
+  ///
+  bool UsesLR;
+
+public:
+  PPCFunctionInfo(MachineFunction& MF)
+  : FramePointerSaveIndex(0),
+    UsesLR(false)
+  {}
+
+  int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
+  void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
+  
+  void setUsesLR(bool U) { UsesLR = U; }
+  bool usesLR()          { return UsesLR; }
+
+};
+
+} // end of namespace llvm
+
+
+#endif
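Target code normally reaches PPCFunctionInfo through MachineFunction::getInfo<>(), which constructs the object lazily for each function. A small hypothetical usage sketch follows; the helper below is illustrative and not part of this patch.

#include "PPCMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

// Hypothetical helper: record that the current function clobbers LR so that
// prologue/epilogue emission knows the link register must be saved.
static void noteLRIsUsed(MachineFunction &MF) {
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  FI->setUsesLR(true);
  // Later queries (e.g. while laying out the frame) read the flag back:
  bool MustSaveLR = FI->usesLR();
  (void)MustSaveLR;
}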
diff --git a/lib/Target/PowerPC/PPCPerfectShuffle.h b/lib/Target/PowerPC/PPCPerfectShuffle.h
new file mode 100644
index 0000000..d0f833e
--- /dev/null
+++ b/lib/Target/PowerPC/PPCPerfectShuffle.h
@@ -0,0 +1,6586 @@
+//===-- PPCPerfectShuffle.h - Altivec Perfect Shuffle Table ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle without using vperm.
+//
+//===----------------------------------------------------------------------===//
+
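Each 32-bit entry packs a cost class, an opcode number, and two 13-bit operand IDs that index back into this same table, and the table itself is indexed by treating the four result elements as base-9 digits (0 through 7 select a source element, 8 means "don't care"). The decoding sketch below is an assumption based on how llvm-PerfectShuffle tables are consumed elsewhere in the tree; the exact field layout is not spelled out in this header.

// Assumed layout of one PerfectShuffleTable entry (not guaranteed by this
// header): a two-bit cost class, a four-bit opcode number, and two 13-bit
// operand IDs that refer back into the table.
struct PFDecodedEntry {
  unsigned CostClass, OpNum, LHSID, RHSID;
};

static PFDecodedEntry decodePFEntry(unsigned PFEntry) {
  PFDecodedEntry D;
  D.CostClass =  PFEntry >> 30;
  D.OpNum     = (PFEntry >> 26) & 15;
  D.LHSID     = (PFEntry >> 13) & ((1 << 13) - 1);
  D.RHSID     =  PFEntry        & ((1 << 13) - 1);
  return D;
}

// The table index treats the four result elements as base-9 digits.
static unsigned pfTableIndex(const unsigned PFIndexes[4]) {
  return PFIndexes[0]*9*9*9 + PFIndexes[1]*9*9 + PFIndexes[2]*9 + PFIndexes[3];
}

With a layout like this, an entry whose operand ID points at another table slot is expanded recursively until a plain copy of one input is reached (the "Cost 0 copy LHS" entries below).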
+// 31 entries have cost 0
+// 292 entries have cost 1
+// 1384 entries have cost 2
+// 3061 entries have cost 3
+// 1733 entries have cost 4
+// 60 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
+static const unsigned PerfectShuffleTable[6561+1] = {
+  202162278U,	// <0,0,0,0>: Cost 1 vspltisw0 LHS
+  1140850790U,	// <0,0,0,1>: Cost 2 vmrghw <0,0,0,0>, LHS
+  2617247181U,	// <0,0,0,2>: Cost 3 vsldoi4 <0,0,0,0>, <2,0,3,0>
+  2635163787U,	// <0,0,0,3>: Cost 3 vsldoi4 <3,0,0,0>, <3,0,0,0>
+  1543507254U,	// <0,0,0,4>: Cost 2 vsldoi4 <0,0,0,0>, RHS
+  2281701705U,	// <0,0,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,0,5>
+  2617250133U,	// <0,0,0,6>: Cost 3 vsldoi4 <0,0,0,0>, <6,0,7,0>
+  2659054575U,	// <0,0,0,7>: Cost 3 vsldoi4 <7,0,0,0>, <7,0,0,0>
+  202162278U,	// <0,0,0,u>: Cost 1 vspltisw0 LHS
+  1141686282U,	// <0,0,1,0>: Cost 2 vmrghw LHS, <0,0,1,1>
+  67944550U,	// <0,0,1,1>: Cost 1 vmrghw LHS, LHS
+  1685241958U,	// <0,0,1,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS
+  2215870716U,	// <0,0,1,3>: Cost 3 vmrghw LHS, <0,3,1,0>
+  1141727570U,	// <0,0,1,4>: Cost 2 vmrghw LHS, <0,4,1,5>
+  2215428562U,	// <0,0,1,5>: Cost 3 vmrghw LHS, <0,5,6,7>
+  2215428589U,	// <0,0,1,6>: Cost 3 vmrghw LHS, <0,6,0,7>
+  2659062768U,	// <0,0,1,7>: Cost 3 vsldoi4 <7,0,0,1>, <7,0,0,1>
+  67945117U,	// <0,0,1,u>: Cost 1 vmrghw LHS, LHS
+  2684356045U,	// <0,0,2,0>: Cost 3 vsldoi8 <0,0,0,0>, <2,0,3,0>
+  2216009830U,	// <0,0,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS
+  2216009901U,	// <0,0,2,2>: Cost 3 vmrghw <0,2,1,2>, <0,2,1,2>
+  2698290853U,	// <0,0,2,3>: Cost 3 vsldoi8 <2,3,0,0>, <2,3,0,0>
+  3289751890U,	// <0,0,2,4>: Cost 4 vmrghw <0,2,1,2>, <0,4,1,5>
+  3758098275U,	// <0,0,2,5>: Cost 4 vsldoi8 <0,0,0,0>, <2,5,3,1>
+  2684356538U,	// <0,0,2,6>: Cost 3 vsldoi8 <0,0,0,0>, <2,6,3,7>
+  3758098410U,	// <0,0,2,7>: Cost 4 vsldoi8 <0,0,0,0>, <2,7,0,1>
+  2216010397U,	// <0,0,2,u>: Cost 3 vmrghw <0,2,1,2>, LHS
+  2702272651U,	// <0,0,3,0>: Cost 3 vsldoi8 <3,0,0,0>, <3,0,0,0>
+  2216656998U,	// <0,0,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS
+  3844669704U,	// <0,0,3,2>: Cost 4 vsldoi12 <3,2,3,0>, <0,3,2,3>
+  2216657148U,	// <0,0,3,3>: Cost 3 vmrghw <0,3,1,0>, <0,3,1,0>
+  2684357122U,	// <0,0,3,4>: Cost 3 vsldoi8 <0,0,0,0>, <3,4,5,6>
+  3732820066U,	// <0,0,3,5>: Cost 4 vsldoi4 <7,0,0,3>, <5,6,7,0>
+  3778005624U,	// <0,0,3,6>: Cost 4 vsldoi8 <3,3,0,0>, <3,6,0,7>
+  3374713464U,	// <0,0,3,7>: Cost 4 vmrglw <3,2,0,3>, <3,6,0,7>
+  2216657565U,	// <0,0,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS
+  2217361408U,	// <0,0,4,0>: Cost 3 vmrghw <0,4,1,5>, <0,0,0,0>
+  1143619686U,	// <0,0,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS
+  3291103405U,	// <0,0,4,2>: Cost 4 vmrghw <0,4,1,5>, <0,2,1,2>
+  3827269988U,	// <0,0,4,3>: Cost 4 vsldoi12 <0,3,1,0>, <0,4,3,5>
+  1143619922U,	// <0,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+  1610616118U,	// <0,0,4,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS
+  3758099833U,	// <0,0,4,6>: Cost 4 vsldoi8 <0,0,0,0>, <4,6,5,2>
+  3854107016U,	// <0,0,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <0,4,7,5>
+  1143620253U,	// <0,0,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS
+  2284396544U,	// <0,0,5,0>: Cost 3 vmrglw <0,4,0,5>, <0,0,0,0>
+  2218025062U,	// <0,0,5,1>: Cost 3 vmrghw <0,5,1,5>, LHS
+  3758100203U,	// <0,0,5,2>: Cost 4 vsldoi8 <0,0,0,0>, <5,2,1,3>
+  3395966100U,	// <0,0,5,3>: Cost 4 vmrglw <6,7,0,5>, <7,2,0,3>
+  3804549052U,	// <0,0,5,4>: Cost 4 vsldoi8 <7,7,0,0>, <5,4,6,5>
+  2302314964U,	// <0,0,5,5>: Cost 3 vmrglw <3,4,0,5>, <3,4,0,5>
+  2785821138U,	// <0,0,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7>
+  3395966428U,	// <0,0,5,7>: Cost 4 vmrglw <6,7,0,5>, <7,6,0,7>
+  2787148260U,	// <0,0,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <0,5,u,7>
+  2684358997U,	// <0,0,6,0>: Cost 3 vsldoi8 <0,0,0,0>, <6,0,7,0>
+  2218631270U,	// <0,0,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS
+  2684359162U,	// <0,0,6,2>: Cost 3 vsldoi8 <0,0,0,0>, <6,2,7,3>
+  3758101042U,	// <0,0,6,3>: Cost 4 vsldoi8 <0,0,0,0>, <6,3,4,5>
+  3732843830U,	// <0,0,6,4>: Cost 4 vsldoi4 <7,0,0,6>, RHS
+  3758101227U,	// <0,0,6,5>: Cost 4 vsldoi8 <0,0,0,0>, <6,5,7,1>
+  2684359480U,	// <0,0,6,6>: Cost 3 vsldoi8 <0,0,0,0>, <6,6,6,6>
+  2724836173U,	// <0,0,6,7>: Cost 3 vsldoi8 <6,7,0,0>, <6,7,0,0>
+  2725499806U,	// <0,0,6,u>: Cost 3 vsldoi8 <6,u,0,0>, <6,u,0,0>
+  2726163439U,	// <0,0,7,0>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0>
+  2219311206U,	// <0,0,7,1>: Cost 3 vmrghw <0,7,1,0>, LHS
+  3868557900U,	// <0,0,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <0,7,2,3>
+  3377400112U,	// <0,0,7,3>: Cost 4 vmrglw <3,6,0,7>, <3,2,0,3>
+  2684360038U,	// <0,0,7,4>: Cost 3 vsldoi8 <0,0,0,0>, <7,4,5,6>
+  3732852834U,	// <0,0,7,5>: Cost 4 vsldoi4 <7,0,0,7>, <5,6,7,0>
+  3871507060U,	// <0,0,7,6>: Cost 4 vsldoi12 <7,6,7,0>, <0,7,6,7>
+  2303658616U,	// <0,0,7,7>: Cost 3 vmrglw <3,6,0,7>, <3,6,0,7>
+  2726163439U,	// <0,0,7,u>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0>
+  202162278U,	// <0,0,u,0>: Cost 1 vspltisw0 LHS
+  72589414U,	// <0,0,u,1>: Cost 1 vmrghw LHS, LHS
+  1685242525U,	// <0,0,u,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS
+  2220073212U,	// <0,0,u,3>: Cost 3 vmrghw LHS, <0,3,1,0>
+  1146331474U,	// <0,0,u,4>: Cost 2 vmrghw LHS, <0,4,1,5>
+  1610619034U,	// <0,0,u,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS
+  2785821138U,	// <0,0,u,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7>
+  2659120119U,	// <0,0,u,7>: Cost 3 vsldoi4 <7,0,0,u>, <7,0,0,u>
+  72589981U,	// <0,0,u,u>: Cost 1 vmrghw LHS, LHS
+  2698297344U,	// <0,1,0,0>: Cost 3 vsldoi8 <2,3,0,1>, <0,0,0,0>
+  1624555622U,	// <0,1,0,1>: Cost 2 vsldoi8 <2,3,0,1>, LHS
+  2758984428U,	// <0,1,0,2>: Cost 3 vsldoi12 <1,2,3,0>, <1,0,2,1>
+  2635237524U,	// <0,1,0,3>: Cost 3 vsldoi4 <3,0,1,0>, <3,0,1,0>
+  2693652818U,	// <0,1,0,4>: Cost 3 vsldoi8 <1,5,0,1>, <0,4,1,5>
+  2281701714U,	// <0,1,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,1,5>
+  2698297846U,	// <0,1,0,6>: Cost 3 vsldoi8 <2,3,0,1>, <0,6,1,7>
+  2659128312U,	// <0,1,0,7>: Cost 3 vsldoi4 <7,0,1,0>, <7,0,1,0>
+  1624556189U,	// <0,1,0,u>: Cost 2 vsldoi8 <2,3,0,1>, LHS
+  1543585802U,	// <0,1,1,0>: Cost 2 vsldoi4 <0,0,1,1>, <0,0,1,1>
+  1141728052U,	// <0,1,1,1>: Cost 2 vmrghw LHS, <1,1,1,1>
+  1141728150U,	// <0,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+  2295644334U,	// <0,1,1,3>: Cost 3 vmrglw <2,3,0,1>, <0,2,1,3>
+  1543589174U,	// <0,1,1,4>: Cost 2 vsldoi4 <0,0,1,1>, RHS
+  2290999634U,	// <0,1,1,5>: Cost 3 vmrglw <1,5,0,1>, <0,4,1,5>
+  2617332135U,	// <0,1,1,6>: Cost 3 vsldoi4 <0,0,1,1>, <6,1,7,1>
+  2617332720U,	// <0,1,1,7>: Cost 3 vsldoi4 <0,0,1,1>, <7,0,0,1>
+  1142171004U,	// <0,1,1,u>: Cost 2 vmrghw LHS, <1,u,3,0>
+  1561509990U,	// <0,1,2,0>: Cost 2 vsldoi4 <3,0,1,2>, LHS
+  2623308516U,	// <0,1,2,1>: Cost 3 vsldoi4 <1,0,1,2>, <1,0,1,2>
+  2698298984U,	// <0,1,2,2>: Cost 3 vsldoi8 <2,3,0,1>, <2,2,2,2>
+  835584U,	// <0,1,2,3>: Cost 0 copy LHS
+  1561513270U,	// <0,1,2,4>: Cost 2 vsldoi4 <3,0,1,2>, RHS
+  2647199304U,	// <0,1,2,5>: Cost 3 vsldoi4 <5,0,1,2>, <5,0,1,2>
+  2698299322U,	// <0,1,2,6>: Cost 3 vsldoi8 <2,3,0,1>, <2,6,3,7>
+  1585402874U,	// <0,1,2,7>: Cost 2 vsldoi4 <7,0,1,2>, <7,0,1,2>
+  835584U,	// <0,1,2,u>: Cost 0 copy LHS
+  2698299540U,	// <0,1,3,0>: Cost 3 vsldoi8 <2,3,0,1>, <3,0,1,0>
+  3290399540U,	// <0,1,3,1>: Cost 4 vmrghw <0,3,1,0>, <1,1,1,1>
+  2698299720U,	// <0,1,3,2>: Cost 3 vsldoi8 <2,3,0,1>, <3,2,3,0>
+  2698299804U,	// <0,1,3,3>: Cost 3 vsldoi8 <2,3,0,1>, <3,3,3,3>
+  2698299906U,	// <0,1,3,4>: Cost 3 vsldoi8 <2,3,0,1>, <3,4,5,6>
+  3832726521U,	// <0,1,3,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,3,5,0>
+  2724842160U,	// <0,1,3,6>: Cost 3 vsldoi8 <6,7,0,1>, <3,6,7,0>
+  2706926275U,	// <0,1,3,7>: Cost 3 vsldoi8 <3,7,0,1>, <3,7,0,1>
+  2698300190U,	// <0,1,3,u>: Cost 3 vsldoi8 <2,3,0,1>, <3,u,1,2>
+  2635268198U,	// <0,1,4,0>: Cost 3 vsldoi4 <3,0,1,4>, LHS
+  2217362228U,	// <0,1,4,1>: Cost 3 vmrghw <0,4,1,5>, <1,1,1,1>
+  2217362326U,	// <0,1,4,2>: Cost 3 vmrghw <0,4,1,5>, <1,2,3,0>
+  2635270296U,	// <0,1,4,3>: Cost 3 vsldoi4 <3,0,1,4>, <3,0,1,4>
+  2635271478U,	// <0,1,4,4>: Cost 3 vsldoi4 <3,0,1,4>, RHS
+  1624558902U,	// <0,1,4,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+  2659160910U,	// <0,1,4,6>: Cost 3 vsldoi4 <7,0,1,4>, <6,7,0,1>
+  2659161084U,	// <0,1,4,7>: Cost 3 vsldoi4 <7,0,1,4>, <7,0,1,4>
+  1624559145U,	// <0,1,4,u>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+  3832726639U,	// <0,1,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,0,1>
+  2714889871U,	// <0,1,5,1>: Cost 3 vsldoi8 <5,1,0,1>, <5,1,0,1>
+  2302314646U,	// <0,1,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2>
+  3834717321U,	// <0,1,5,3>: Cost 4 vsldoi12 <1,5,3,0>, <1,5,3,0>
+  3832726679U,	// <0,1,5,4>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,4,5>
+  2717544403U,	// <0,1,5,5>: Cost 3 vsldoi8 <5,5,0,1>, <5,5,0,1>
+  2718208036U,	// <0,1,5,6>: Cost 3 vsldoi8 <5,6,0,1>, <5,6,0,1>
+  3792613493U,	// <0,1,5,7>: Cost 4 vsldoi8 <5,7,0,1>, <5,7,0,1>
+  2719535302U,	// <0,1,5,u>: Cost 3 vsldoi8 <5,u,0,1>, <5,u,0,1>
+  2659172454U,	// <0,1,6,0>: Cost 3 vsldoi4 <7,0,1,6>, LHS
+  3832726735U,	// <0,1,6,1>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,1,7>
+  2724844026U,	// <0,1,6,2>: Cost 3 vsldoi8 <6,7,0,1>, <6,2,7,3>
+  3775361608U,	// <0,1,6,3>: Cost 4 vsldoi8 <2,u,0,1>, <6,3,7,0>
+  2659175734U,	// <0,1,6,4>: Cost 3 vsldoi4 <7,0,1,6>, RHS
+  3832726771U,	// <0,1,6,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,5,7>
+  2724844344U,	// <0,1,6,6>: Cost 3 vsldoi8 <6,7,0,1>, <6,6,6,6>
+  1651102542U,	// <0,1,6,7>: Cost 2 vsldoi8 <6,7,0,1>, <6,7,0,1>
+  1651766175U,	// <0,1,6,u>: Cost 2 vsldoi8 <6,u,0,1>, <6,u,0,1>
+  2724844536U,	// <0,1,7,0>: Cost 3 vsldoi8 <6,7,0,1>, <7,0,1,0>
+  3377397770U,	// <0,1,7,1>: Cost 4 vmrglw <3,6,0,7>, <0,0,1,1>
+  2698302636U,	// <0,1,7,2>: Cost 3 vsldoi8 <2,3,0,1>, <7,2,3,0>
+  2728162531U,	// <0,1,7,3>: Cost 3 vsldoi8 <7,3,0,1>, <7,3,0,1>
+  2724844902U,	// <0,1,7,4>: Cost 3 vsldoi8 <6,7,0,1>, <7,4,5,6>
+  3377398098U,	// <0,1,7,5>: Cost 4 vmrglw <3,6,0,7>, <0,4,1,5>
+  2724845076U,	// <0,1,7,6>: Cost 3 vsldoi8 <6,7,0,1>, <7,6,7,0>
+  2724845164U,	// <0,1,7,7>: Cost 3 vsldoi8 <6,7,0,1>, <7,7,7,7>
+  2724845186U,	// <0,1,7,u>: Cost 3 vsldoi8 <6,7,0,1>, <7,u,1,2>
+  1561559142U,	// <0,1,u,0>: Cost 2 vsldoi4 <3,0,1,u>, LHS
+  1146331956U,	// <0,1,u,1>: Cost 2 vmrghw LHS, <1,1,1,1>
+  1146332054U,	// <0,1,u,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+  835584U,	// <0,1,u,3>: Cost 0 copy LHS
+  1561562422U,	// <0,1,u,4>: Cost 2 vsldoi4 <3,0,1,u>, RHS
+  1624561818U,	// <0,1,u,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+  2220074191U,	// <0,1,u,6>: Cost 3 vmrghw LHS, <1,6,1,7>
+  1585452032U,	// <0,1,u,7>: Cost 2 vsldoi4 <7,0,1,u>, <7,0,1,u>
+  835584U,	// <0,1,u,u>: Cost 0 copy LHS
+  2214593997U,	// <0,2,0,0>: Cost 3 vmrghw <0,0,0,0>, <2,0,3,0>
+  2214675999U,	// <0,2,0,1>: Cost 3 vmrghw <0,0,1,1>, <2,1,3,1>
+  2214594152U,	// <0,2,0,2>: Cost 3 vmrghw <0,0,0,0>, <2,2,2,2>
+  1207959654U,	// <0,2,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS
+  3709054262U,	// <0,2,0,4>: Cost 4 vsldoi4 <3,0,2,0>, RHS
+  3375350836U,	// <0,2,0,5>: Cost 4 vmrglw <3,3,0,0>, <1,4,2,5>
+  2214594490U,	// <0,2,0,6>: Cost 3 vmrghw <0,0,0,0>, <2,6,3,7>
+  3288336362U,	// <0,2,0,7>: Cost 4 vmrghw <0,0,0,0>, <2,7,0,1>
+  1207959659U,	// <0,2,0,u>: Cost 2 vmrglw <0,0,0,0>, LHS
+  2215871994U,	// <0,2,1,0>: Cost 3 vmrghw LHS, <2,0,u,0>
+  2215470623U,	// <0,2,1,1>: Cost 3 vmrghw LHS, <2,1,3,1>
+  1141728872U,	// <0,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+  1141728934U,	// <0,2,1,3>: Cost 2 vmrghw LHS, <2,3,0,1>
+  2215872323U,	// <0,2,1,4>: Cost 3 vmrghw LHS, <2,4,u,5>
+  2215872405U,	// <0,2,1,5>: Cost 3 vmrghw LHS, <2,5,u,6>
+  1141729210U,	// <0,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  2215430122U,	// <0,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+  1141729368U,	// <0,2,1,u>: Cost 2 vmrghw LHS, <2,u,3,3>
+  3289736698U,	// <0,2,2,0>: Cost 4 vmrghw <0,2,1,0>, <2,0,u,0>
+  3289744927U,	// <0,2,2,1>: Cost 4 vmrghw <0,2,1,1>, <2,1,3,1>
+  2216011368U,	// <0,2,2,2>: Cost 3 vmrghw <0,2,1,2>, <2,2,2,2>
+  2216019622U,	// <0,2,2,3>: Cost 3 vmrghw <0,2,1,3>, <2,3,0,1>
+  3289769795U,	// <0,2,2,4>: Cost 4 vmrghw <0,2,1,4>, <2,4,u,5>
+  3289778069U,	// <0,2,2,5>: Cost 4 vmrghw <0,2,1,5>, <2,5,u,6>
+  2216044474U,	// <0,2,2,6>: Cost 3 vmrghw <0,2,1,6>, <2,6,3,7>
+  3732960259U,	// <0,2,2,7>: Cost 4 vsldoi4 <7,0,2,2>, <7,0,2,2>
+  2216061016U,	// <0,2,2,u>: Cost 3 vmrghw <0,2,1,u>, <2,u,3,3>
+  2758985382U,	// <0,2,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,0,1>
+  2758985392U,	// <0,2,3,1>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,1,2>
+  3290400360U,	// <0,2,3,2>: Cost 4 vmrghw <0,3,1,0>, <2,2,2,2>
+  2758985408U,	// <0,2,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,3,0>
+  2758985422U,	// <0,2,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,4,5>
+  2785822424U,	// <0,2,3,5>: Cost 3 vsldoi12 <5,6,7,0>, <2,3,5,6>
+  3290400698U,	// <0,2,3,6>: Cost 4 vmrghw <0,3,1,0>, <2,6,3,7>
+  2765915876U,	// <0,2,3,7>: Cost 3 vsldoi12 <2,3,7,0>, <2,3,7,0>
+  2758985453U,	// <0,2,3,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,u,0>
+  3291104762U,	// <0,2,4,0>: Cost 4 vmrghw <0,4,1,5>, <2,0,u,0>
+  2217362979U,	// <0,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5>
+  2217363048U,	// <0,2,4,2>: Cost 3 vmrghw <0,4,1,5>, <2,2,2,2>
+  2217363110U,	// <0,2,4,3>: Cost 3 vmrghw <0,4,1,5>, <2,3,0,1>
+  3291105087U,	// <0,2,4,4>: Cost 4 vmrghw <0,4,1,5>, <2,4,u,1>
+  3291105173U,	// <0,2,4,5>: Cost 4 vmrghw <0,4,1,5>, <2,5,u,6>
+  2217363386U,	// <0,2,4,6>: Cost 3 vmrghw <0,4,1,5>, <2,6,3,7>
+  3788639688U,	// <0,2,4,7>: Cost 4 vsldoi8 <5,1,0,2>, <4,7,5,0>
+  2217363515U,	// <0,2,4,u>: Cost 3 vmrghw <0,4,1,5>, <2,u,0,1>
+  3376054371U,	// <0,2,5,0>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,0>
+  3788639888U,	// <0,2,5,1>: Cost 4 vsldoi8 <5,1,0,2>, <5,1,0,2>
+  3376055912U,	// <0,2,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,2,2,2>
+  2302312550U,	// <0,2,5,3>: Cost 3 vmrglw <3,4,0,5>, LHS
+  3376054375U,	// <0,2,5,4>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,4>
+  3374728244U,	// <0,2,5,5>: Cost 4 vmrglw <3,2,0,5>, <1,4,2,5>
+  3805229154U,	// <0,2,5,6>: Cost 4 vsldoi8 <7,u,0,2>, <5,6,7,0>
+  3376055512U,	// <0,2,5,7>: Cost 4 vmrglw <3,4,0,5>, <1,6,2,7>
+  2302312555U,	// <0,2,5,u>: Cost 3 vmrglw <3,4,0,5>, LHS
+  3709100134U,	// <0,2,6,0>: Cost 4 vsldoi4 <3,0,2,6>, LHS
+  3709100950U,	// <0,2,6,1>: Cost 4 vsldoi4 <3,0,2,6>, <1,2,3,0>
+  3709102010U,	// <0,2,6,2>: Cost 4 vsldoi4 <3,0,2,6>, <2,6,3,7>
+  2758985658U,	// <0,2,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,3,7>
+  3709103414U,	// <0,2,6,4>: Cost 4 vsldoi4 <3,0,2,6>, RHS
+  3732992098U,	// <0,2,6,5>: Cost 4 vsldoi4 <7,0,2,6>, <5,6,7,0>
+  3292374970U,	// <0,2,6,6>: Cost 4 vmrghw <0,6,0,7>, <2,6,3,7>
+  3798594383U,	// <0,2,6,7>: Cost 4 vsldoi8 <6,7,0,2>, <6,7,0,2>
+  2758985703U,	// <0,2,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,u,7>
+  3788641274U,	// <0,2,7,0>: Cost 4 vsldoi8 <5,1,0,2>, <7,0,1,2>
+  3377398508U,	// <0,2,7,1>: Cost 4 vmrglw <3,6,0,7>, <1,0,2,1>
+  3377398590U,	// <0,2,7,2>: Cost 4 vmrglw <3,6,0,7>, <1,1,2,2>
+  2303656038U,	// <0,2,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS
+  3709111606U,	// <0,2,7,4>: Cost 4 vsldoi4 <3,0,2,7>, RHS
+  3377398836U,	// <0,2,7,5>: Cost 4 vmrglw <3,6,0,7>, <1,4,2,5>
+  3803903447U,	// <0,2,7,6>: Cost 4 vsldoi8 <7,6,0,2>, <7,6,0,2>
+  3293054954U,	// <0,2,7,7>: Cost 4 vmrghw <0,7,1,0>, <2,7,0,1>
+  2303656043U,	// <0,2,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS
+  2220074490U,	// <0,2,u,0>: Cost 3 vmrghw LHS, <2,0,u,0>
+  2220074527U,	// <0,2,u,1>: Cost 3 vmrghw LHS, <2,1,3,1>
+  1146332776U,	// <0,2,u,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+  1146332838U,	// <0,2,u,3>: Cost 2 vmrghw LHS, <2,3,0,1>
+  2220074819U,	// <0,2,u,4>: Cost 3 vmrghw LHS, <2,4,u,5>
+  2220074901U,	// <0,2,u,5>: Cost 3 vmrghw LHS, <2,5,u,6>
+  1146333114U,	// <0,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  2220074986U,	// <0,2,u,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+  1146333243U,	// <0,2,u,u>: Cost 2 vmrghw LHS, <2,u,0,1>
+  2629410816U,	// <0,3,0,0>: Cost 3 vsldoi4 <2,0,3,0>, <0,0,0,0>
+  2753530006U,	// <0,3,0,1>: Cost 3 vsldoi12 <0,3,1,0>, <3,0,1,2>
+  2629412301U,	// <0,3,0,2>: Cost 3 vsldoi4 <2,0,3,0>, <2,0,3,0>
+  2214594972U,	// <0,3,0,3>: Cost 3 vmrghw <0,0,0,0>, <3,3,3,3>
+  2758985908U,	// <0,3,0,4>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,4,5>
+  3733016674U,	// <0,3,0,5>: Cost 4 vsldoi4 <7,0,3,0>, <5,6,7,0>
+  3777364488U,	// <0,3,0,6>: Cost 4 vsldoi8 <3,2,0,3>, <0,6,3,7>
+  2281703354U,	// <0,3,0,7>: Cost 3 vmrglw <0,0,0,0>, <2,6,3,7>
+  2758985941U,	// <0,3,0,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,u,2>
+  1141729430U,	// <0,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+  2215471334U,	// <0,3,1,1>: Cost 3 vmrghw LHS, <3,1,1,1>
+  2215471425U,	// <0,3,1,2>: Cost 3 vmrghw LHS, <3,2,2,2>
+  1141729692U,	// <0,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+  1141729794U,	// <0,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+  2215430738U,	// <0,3,1,5>: Cost 3 vmrghw LHS, <3,5,5,5>
+  2215430776U,	// <0,3,1,6>: Cost 3 vmrghw LHS, <3,6,0,7>
+  2295646138U,	// <0,3,1,7>: Cost 3 vmrglw <2,3,0,1>, <2,6,3,7>
+  1141730078U,	// <0,3,1,u>: Cost 2 vmrghw LHS, <3,u,1,2>
+  2758986032U,	// <0,3,2,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,2,0,3>
+  3709141910U,	// <0,3,2,1>: Cost 4 vsldoi4 <3,0,3,2>, <1,2,3,0>
+  3289753921U,	// <0,3,2,2>: Cost 4 vmrghw <0,2,1,2>, <3,2,2,2>
+  2770929992U,	// <0,3,2,3>: Cost 3 vsldoi12 <3,2,3,0>, <3,2,3,0>
+  3289754114U,	// <0,3,2,4>: Cost 4 vmrghw <0,2,1,2>, <3,4,5,6>
+  3362095460U,	// <0,3,2,5>: Cost 5 vmrglw <1,1,0,2>, <0,4,3,5>
+  3832727910U,	// <0,3,2,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,2,6,3>
+  3365414842U,	// <0,3,2,7>: Cost 4 vmrglw <1,6,0,2>, <2,6,3,7>
+  2771298677U,	// <0,3,2,u>: Cost 3 vsldoi12 <3,2,u,0>, <3,2,u,0>
+  2216659094U,	// <0,3,3,0>: Cost 3 vmrghw <0,3,1,0>, <3,0,1,2>
+  3290409190U,	// <0,3,3,1>: Cost 4 vmrghw <0,3,1,1>, <3,1,1,1>
+  2703624496U,	// <0,3,3,2>: Cost 3 vsldoi8 <3,2,0,3>, <3,2,0,3>
+  2216683932U,	// <0,3,3,3>: Cost 3 vmrghw <0,3,1,3>, <3,3,3,3>
+  2216692226U,	// <0,3,3,4>: Cost 3 vmrghw <0,3,1,4>, <3,4,5,6>
+  3733041250U,	// <0,3,3,5>: Cost 4 vsldoi4 <7,0,3,3>, <5,6,7,0>
+  3832727988U,	// <0,3,3,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,3,6,0>
+  3374712762U,	// <0,3,3,7>: Cost 4 vmrglw <3,2,0,3>, <2,6,3,7>
+  2216725278U,	// <0,3,3,u>: Cost 3 vmrghw <0,3,1,u>, <3,u,1,2>
+  2217363606U,	// <0,3,4,0>: Cost 3 vmrghw <0,4,1,5>, <3,0,1,2>
+  3291105510U,	// <0,3,4,1>: Cost 4 vmrghw <0,4,1,5>, <3,1,1,1>
+  3291105601U,	// <0,3,4,2>: Cost 4 vmrghw <0,4,1,5>, <3,2,2,2>
+  2217363868U,	// <0,3,4,3>: Cost 3 vmrghw <0,4,1,5>, <3,3,3,3>
+  2217363970U,	// <0,3,4,4>: Cost 3 vmrghw <0,4,1,5>, <3,4,5,6>
+  2758986242U,	// <0,3,4,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,4,5,6>
+  3727077685U,	// <0,3,4,6>: Cost 4 vsldoi4 <6,0,3,4>, <6,0,3,4>
+  3364767674U,	// <0,3,4,7>: Cost 4 vmrglw <1,5,0,4>, <2,6,3,7>
+  2217364254U,	// <0,3,4,u>: Cost 3 vmrghw <0,4,1,5>, <3,u,1,2>
+  3832728102U,	// <0,3,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <3,5,0,6>
+  3405916003U,	// <0,3,5,1>: Cost 4 vmrglw <u,4,0,5>, <2,5,3,1>
+  3376055840U,	// <0,3,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,1,3,2>
+  3376055679U,	// <0,3,5,3>: Cost 4 vmrglw <3,4,0,5>, <1,u,3,3>
+  3376055194U,	// <0,3,5,4>: Cost 4 vmrglw <3,4,0,5>, <1,2,3,4>
+  3859565138U,	// <0,3,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <3,5,5,5>
+  2727514210U,	// <0,3,5,6>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0>
+  3376056250U,	// <0,3,5,7>: Cost 4 vmrglw <3,4,0,5>, <2,6,3,7>
+  2727514210U,	// <0,3,5,u>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0>
+  2758986360U,	// <0,3,6,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7>
+  3709174678U,	// <0,3,6,1>: Cost 4 vsldoi4 <3,0,3,6>, <1,2,3,0>
+  3795284411U,	// <0,3,6,2>: Cost 4 vsldoi8 <6,2,0,3>, <6,2,0,3>
+  3709175980U,	// <0,3,6,3>: Cost 4 vsldoi4 <3,0,3,6>, <3,0,3,6>
+  3833096860U,	// <0,3,6,4>: Cost 4 vsldoi12 <1,2,u,0>, <3,6,4,7>
+  3376728235U,	// <0,3,6,5>: Cost 5 vmrglw <3,5,0,6>, <3,0,3,5>
+  3859565229U,	// <0,3,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <3,6,6,6>
+  2773879472U,	// <0,3,6,7>: Cost 3 vsldoi12 <3,6,7,0>, <3,6,7,0>
+  2758986360U,	// <0,3,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7>
+  2303656854U,	// <0,3,7,0>: Cost 3 vmrglw <3,6,0,7>, <1,2,3,0>
+  3807229018U,	// <0,3,7,1>: Cost 4 vsldoi8 <u,2,0,3>, <7,1,2,u>
+  2727515284U,	// <0,3,7,2>: Cost 3 vsldoi8 <7,2,0,3>, <7,2,0,3>
+  3377399410U,	// <0,3,7,3>: Cost 4 vmrglw <3,6,0,7>, <2,2,3,3>
+  3377398682U,	// <0,3,7,4>: Cost 4 vmrglw <3,6,0,7>, <1,2,3,4>
+  3801257409U,	// <0,3,7,5>: Cost 4 vsldoi8 <7,2,0,3>, <7,5,6,7>
+  3377399980U,	// <0,3,7,6>: Cost 4 vmrglw <3,6,0,7>, <3,0,3,6>
+  3375409082U,	// <0,3,7,7>: Cost 4 vmrglw <3,3,0,7>, <2,6,3,7>
+  2731497082U,	// <0,3,7,u>: Cost 3 vsldoi8 <7,u,0,3>, <7,u,0,3>
+  1146333334U,	// <0,3,u,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+  2220075238U,	// <0,3,u,1>: Cost 3 vmrghw LHS, <3,1,1,1>
+  2220075329U,	// <0,3,u,2>: Cost 3 vmrghw LHS, <3,2,2,2>
+  1146333596U,	// <0,3,u,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+  1146333698U,	// <0,3,u,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+  2758986566U,	// <0,3,u,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,u,5,6>
+  2803739472U,	// <0,3,u,6>: Cost 3 vsldoi12 <u,6,7,0>, <3,u,6,7>
+  2295703482U,	// <0,3,u,7>: Cost 3 vmrglw <2,3,0,u>, <2,6,3,7>
+  1146333982U,	// <0,3,u,u>: Cost 2 vmrghw LHS, <3,u,1,2>
+  2214595473U,	// <0,4,0,0>: Cost 3 vmrghw <0,0,0,0>, <4,0,5,0>
+  2693677158U,	// <0,4,0,1>: Cost 3 vsldoi8 <1,5,0,4>, LHS
+  3839437689U,	// <0,4,0,2>: Cost 4 vsldoi12 <2,3,4,0>, <4,0,2,3>
+  3709200559U,	// <0,4,0,3>: Cost 4 vsldoi4 <3,0,4,0>, <3,0,4,0>
+  2693677394U,	// <0,4,0,4>: Cost 3 vsldoi8 <1,5,0,4>, <0,4,1,5>
+  1140854070U,	// <0,4,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS
+  3767419409U,	// <0,4,0,6>: Cost 4 vsldoi8 <1,5,0,4>, <0,6,4,7>
+  3854109604U,	// <0,4,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,0,7,1>
+  1140854313U,	// <0,4,0,u>: Cost 2 vmrghw <0,0,0,0>, RHS
+  1141689234U,	// <0,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+  2215431114U,	// <0,4,1,1>: Cost 3 vmrghw LHS, <4,1,2,3>
+  2215431221U,	// <0,4,1,2>: Cost 3 vmrghw LHS, <4,2,5,2>
+  2635466928U,	// <0,4,1,3>: Cost 3 vsldoi4 <3,0,4,1>, <3,0,4,1>
+  1141689552U,	// <0,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+  67947830U,	// <0,4,1,5>: Cost 1 vmrghw LHS, RHS
+  2215431545U,	// <0,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2>
+  2659357716U,	// <0,4,1,7>: Cost 3 vsldoi4 <7,0,4,1>, <7,0,4,1>
+  67948073U,	// <0,4,1,u>: Cost 1 vmrghw LHS, RHS
+  3767420369U,	// <0,4,2,0>: Cost 4 vsldoi8 <1,5,0,4>, <2,0,3,4>
+  3767420451U,	// <0,4,2,1>: Cost 4 vsldoi8 <1,5,0,4>, <2,1,3,5>
+  3767420520U,	// <0,4,2,2>: Cost 4 vsldoi8 <1,5,0,4>, <2,2,2,2>
+  2698323625U,	// <0,4,2,3>: Cost 3 vsldoi8 <2,3,0,4>, <2,3,0,4>
+  3709218102U,	// <0,4,2,4>: Cost 4 vsldoi4 <3,0,4,2>, RHS
+  2216013110U,	// <0,4,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS
+  3767420858U,	// <0,4,2,6>: Cost 4 vsldoi8 <1,5,0,4>, <2,6,3,7>
+  3774719981U,	// <0,4,2,7>: Cost 4 vsldoi8 <2,7,0,4>, <2,7,0,4>
+  2216013353U,	// <0,4,2,u>: Cost 3 vmrghw <0,2,1,2>, RHS
+  3767421078U,	// <0,4,3,0>: Cost 4 vsldoi8 <1,5,0,4>, <3,0,1,2>
+  3776710880U,	// <0,4,3,1>: Cost 4 vsldoi8 <3,1,0,4>, <3,1,0,4>
+  3833097325U,	// <0,4,3,2>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,2,4>
+  3767421340U,	// <0,4,3,3>: Cost 4 vsldoi8 <1,5,0,4>, <3,3,3,3>
+  3767421442U,	// <0,4,3,4>: Cost 4 vsldoi8 <1,5,0,4>, <3,4,5,6>
+  2216660278U,	// <0,4,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS
+  3833097361U,	// <0,4,3,6>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,6,4>
+  3780692678U,	// <0,4,3,7>: Cost 4 vsldoi8 <3,7,0,4>, <3,7,0,4>
+  2216660521U,	// <0,4,3,u>: Cost 3 vmrghw <0,3,1,0>, RHS
+  2617573416U,	// <0,4,4,0>: Cost 3 vsldoi4 <0,0,4,4>, <0,0,4,4>
+  2217364450U,	// <0,4,4,1>: Cost 3 vmrghw <0,4,1,5>, <4,1,5,0>
+  3691316771U,	// <0,4,4,2>: Cost 4 vsldoi4 <0,0,4,4>, <2,1,3,5>
+  3709233331U,	// <0,4,4,3>: Cost 4 vsldoi4 <3,0,4,4>, <3,0,4,4>
+  2785823952U,	// <0,4,4,4>: Cost 3 vsldoi12 <5,6,7,0>, <4,4,4,4>
+  1143622966U,	// <0,4,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS
+  3691319723U,	// <0,4,4,6>: Cost 4 vsldoi4 <0,0,4,4>, <6,1,7,5>
+  3854109932U,	// <0,4,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,4,7,5>
+  1143623209U,	// <0,4,4,u>: Cost 2 vmrghw <0,4,1,5>, RHS
+  2635497574U,	// <0,4,5,0>: Cost 3 vsldoi4 <3,0,4,5>, LHS
+  2635498390U,	// <0,4,5,1>: Cost 3 vsldoi4 <3,0,4,5>, <1,2,3,0>
+  3709240936U,	// <0,4,5,2>: Cost 4 vsldoi4 <3,0,4,5>, <2,2,2,2>
+  2635499700U,	// <0,4,5,3>: Cost 3 vsldoi4 <3,0,4,5>, <3,0,4,5>
+  2635500854U,	// <0,4,5,4>: Cost 3 vsldoi4 <3,0,4,5>, RHS
+  2785824044U,	// <0,4,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <4,5,5,6>
+  1685245238U,	// <0,4,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2659390488U,	// <0,4,5,7>: Cost 3 vsldoi4 <7,0,4,5>, <7,0,4,5>
+  1685245256U,	// <0,4,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  3839438161U,	// <0,4,6,0>: Cost 4 vsldoi12 <2,3,4,0>, <4,6,0,7>
+  3798610347U,	// <0,4,6,1>: Cost 4 vsldoi8 <6,7,0,4>, <6,1,7,5>
+  3798610426U,	// <0,4,6,2>: Cost 4 vsldoi8 <6,7,0,4>, <6,2,7,3>
+  3795956237U,	// <0,4,6,3>: Cost 4 vsldoi8 <6,3,0,4>, <6,3,0,4>
+  3733138742U,	// <0,4,6,4>: Cost 4 vsldoi4 <7,0,4,6>, RHS
+  2218634550U,	// <0,4,6,5>: Cost 3 vmrghw <0,6,0,7>, RHS
+  3798610744U,	// <0,4,6,6>: Cost 4 vsldoi8 <6,7,0,4>, <6,6,6,6>
+  2724868945U,	// <0,4,6,7>: Cost 3 vsldoi8 <6,7,0,4>, <6,7,0,4>
+  2725532578U,	// <0,4,6,u>: Cost 3 vsldoi8 <6,u,0,4>, <6,u,0,4>
+  3383371465U,	// <0,4,7,0>: Cost 4 vmrglw <4,6,0,7>, <2,3,4,0>
+  3800601668U,	// <0,4,7,1>: Cost 4 vsldoi8 <7,1,0,4>, <7,1,0,4>
+  3775386826U,	// <0,4,7,2>: Cost 5 vsldoi8 <2,u,0,4>, <7,2,6,3>
+  3801928934U,	// <0,4,7,3>: Cost 4 vsldoi8 <7,3,0,4>, <7,3,0,4>
+  3721202998U,	// <0,4,7,4>: Cost 4 vsldoi4 <5,0,4,7>, RHS
+  2780368328U,	// <0,4,7,5>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0>
+  3383372686U,	// <0,4,7,6>: Cost 5 vmrglw <4,6,0,7>, <4,0,4,6>
+  3854110170U,	// <0,4,7,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,7,7,0>
+  2780368328U,	// <0,4,7,u>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0>
+  1146334098U,	// <0,4,u,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+  2220076002U,	// <0,4,u,1>: Cost 3 vmrghw LHS, <4,1,5,0>
+  2220076085U,	// <0,4,u,2>: Cost 3 vmrghw LHS, <4,2,5,2>
+  2635524279U,	// <0,4,u,3>: Cost 3 vsldoi4 <3,0,4,u>, <3,0,4,u>
+  1146334416U,	// <0,4,u,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+  72592694U,	// <0,4,u,5>: Cost 1 vmrghw LHS, RHS
+  1685245481U,	// <0,4,u,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2659415067U,	// <0,4,u,7>: Cost 3 vsldoi4 <7,0,4,u>, <7,0,4,u>
+  72592937U,	// <0,4,u,u>: Cost 1 vmrghw LHS, RHS
+  2281704337U,	// <0,5,0,0>: Cost 3 vmrglw <0,0,0,0>, <4,0,5,0>
+  2704965734U,	// <0,5,0,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+  3778707666U,	// <0,5,0,2>: Cost 4 vsldoi8 <3,4,0,5>, <0,2,5,3>
+  3778707708U,	// <0,5,0,3>: Cost 4 vsldoi8 <3,4,0,5>, <0,3,1,0>
+  2687050057U,	// <0,5,0,4>: Cost 3 vsldoi8 <0,4,0,5>, <0,4,0,5>
+  2214596612U,	// <0,5,0,5>: Cost 3 vmrghw <0,0,0,0>, <5,5,5,5>
+  2785824372U,	// <0,5,0,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,0,6,1>
+  3854110332U,	// <0,5,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <5,0,7,0>
+  2704966301U,	// <0,5,0,u>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+  1567768678U,	// <0,5,1,0>: Cost 2 vsldoi4 <4,0,5,1>, LHS
+  2312236570U,	// <0,5,1,1>: Cost 3 vmrglw <5,1,0,1>, <4,u,5,1>
+  2215431915U,	// <0,5,1,2>: Cost 3 vmrghw LHS, <5,2,1,3>
+  2641512598U,	// <0,5,1,3>: Cost 3 vsldoi4 <4,0,5,1>, <3,0,1,2>
+  1567771538U,	// <0,5,1,4>: Cost 2 vsldoi4 <4,0,5,1>, <4,0,5,1>
+  1141690372U,	// <0,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+  1141690466U,	// <0,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+  2641515514U,	// <0,5,1,7>: Cost 3 vsldoi4 <4,0,5,1>, <7,0,1,2>
+  1141690615U,	// <0,5,1,u>: Cost 2 vmrghw LHS, <5,u,5,5>
+  3772736973U,	// <0,5,2,0>: Cost 4 vsldoi8 <2,4,0,5>, <2,0,3,0>
+  3778709024U,	// <0,5,2,1>: Cost 4 vsldoi8 <3,4,0,5>, <2,1,3,2>
+  3778709096U,	// <0,5,2,2>: Cost 4 vsldoi8 <3,4,0,5>, <2,2,2,2>
+  3778709158U,	// <0,5,2,3>: Cost 4 vsldoi8 <3,4,0,5>, <2,3,0,1>
+  3772737275U,	// <0,5,2,4>: Cost 4 vsldoi8 <2,4,0,5>, <2,4,0,5>
+  3859566351U,	// <0,5,2,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,2,5,3>
+  3778709434U,	// <0,5,2,6>: Cost 4 vsldoi8 <3,4,0,5>, <2,6,3,7>
+  3805251562U,	// <0,5,2,7>: Cost 4 vsldoi8 <7,u,0,5>, <2,7,0,1>
+  3775391807U,	// <0,5,2,u>: Cost 4 vsldoi8 <2,u,0,5>, <2,u,0,5>
+  2704967830U,	// <0,5,3,0>: Cost 3 vsldoi8 <3,4,0,5>, <3,0,1,2>
+  3776719073U,	// <0,5,3,1>: Cost 4 vsldoi8 <3,1,0,5>, <3,1,0,5>
+  3777382706U,	// <0,5,3,2>: Cost 4 vsldoi8 <3,2,0,5>, <3,2,0,5>
+  3778709887U,	// <0,5,3,3>: Cost 4 vsldoi8 <3,4,0,5>, <3,3,0,1>
+  2704968148U,	// <0,5,3,4>: Cost 3 vsldoi8 <3,4,0,5>, <3,4,0,5>
+  3857428317U,	// <0,5,3,5>: Cost 4 vsldoi12 <5,3,5,0>, <5,3,5,0>
+  3364096514U,	// <0,5,3,6>: Cost 4 vmrglw <1,4,0,3>, <3,4,5,6>
+  3780700871U,	// <0,5,3,7>: Cost 4 vsldoi8 <3,7,0,5>, <3,7,0,5>
+  2707622680U,	// <0,5,3,u>: Cost 3 vsldoi8 <3,u,0,5>, <3,u,0,5>
+  2728856466U,	// <0,5,4,0>: Cost 3 vsldoi8 <7,4,0,5>, <4,0,5,1>
+  3697361674U,	// <0,5,4,1>: Cost 4 vsldoi4 <1,0,5,4>, <1,0,5,4>
+  3697362601U,	// <0,5,4,2>: Cost 4 vsldoi4 <1,0,5,4>, <2,3,0,4>
+  3364766635U,	// <0,5,4,3>: Cost 4 vmrglw <1,5,0,4>, <1,2,5,3>
+  2217365428U,	// <0,5,4,4>: Cost 3 vmrghw <0,4,1,5>, <5,4,5,6>
+  2704969014U,	// <0,5,4,5>: Cost 3 vsldoi8 <3,4,0,5>, RHS
+  2785824700U,	// <0,5,4,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,4,6,5>
+  3364766963U,	// <0,5,4,7>: Cost 4 vmrglw <1,5,0,4>, <1,6,5,7>
+  2704969257U,	// <0,5,4,u>: Cost 3 vsldoi8 <3,4,0,5>, RHS
+  3846148050U,	// <0,5,5,0>: Cost 4 vsldoi12 <3,4,5,0>, <5,5,0,0>
+  2326203282U,	// <0,5,5,1>: Cost 3 vmrglw <7,4,0,5>, <4,0,5,1>
+  3291746027U,	// <0,5,5,2>: Cost 4 vmrghw <0,5,1,2>, <5,2,1,3>
+  3376054482U,	// <0,5,5,3>: Cost 4 vmrglw <3,4,0,5>, <0,2,5,3>
+  3790655366U,	// <0,5,5,4>: Cost 4 vsldoi8 <5,4,0,5>, <5,4,0,5>
+  2785824772U,	// <0,5,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <5,5,5,5>
+  2724876386U,	// <0,5,5,6>: Cost 3 vsldoi8 <6,7,0,5>, <5,6,7,0>
+  3858903057U,	// <0,5,5,7>: Cost 4 vsldoi12 <5,5,7,0>, <5,5,7,0>
+  2736820484U,	// <0,5,5,u>: Cost 3 vsldoi8 <u,7,0,5>, <5,u,7,0>
+  2659467366U,	// <0,5,6,0>: Cost 3 vsldoi4 <7,0,5,6>, LHS
+  3859566643U,	// <0,5,6,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,6,1,7>
+  3798618618U,	// <0,5,6,2>: Cost 4 vsldoi8 <6,7,0,5>, <6,2,7,3>
+  3852857410U,	// <0,5,6,3>: Cost 4 vsldoi12 <4,5,6,0>, <5,6,3,4>
+  2659470646U,	// <0,5,6,4>: Cost 3 vsldoi4 <7,0,5,6>, RHS
+  2659471458U,	// <0,5,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0>
+  3832729696U,	// <0,5,6,6>: Cost 4 vsldoi12 <1,2,3,0>, <5,6,6,7>
+  1712083042U,	// <0,5,6,7>: Cost 2 vsldoi12 <5,6,7,0>, <5,6,7,0>
+  1712156779U,	// <0,5,6,u>: Cost 2 vsldoi12 <5,6,u,0>, <5,6,u,0>
+  2731512826U,	// <0,5,7,0>: Cost 3 vsldoi8 <7,u,0,5>, <7,0,1,2>
+  3859566717U,	// <0,5,7,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,1,0>
+  3798619284U,	// <0,5,7,2>: Cost 4 vsldoi8 <6,7,0,5>, <7,2,0,3>
+  3778712803U,	// <0,5,7,3>: Cost 4 vsldoi8 <3,4,0,5>, <7,3,0,1>
+  2728858936U,	// <0,5,7,4>: Cost 3 vsldoi8 <7,4,0,5>, <7,4,0,5>
+  3859566753U,	// <0,5,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,5,0>
+  3377398135U,	// <0,5,7,6>: Cost 4 vmrglw <3,6,0,7>, <0,4,5,6>
+  3798619686U,	// <0,5,7,7>: Cost 4 vsldoi8 <6,7,0,5>, <7,7,0,0>
+  2731513468U,	// <0,5,7,u>: Cost 3 vsldoi8 <7,u,0,5>, <7,u,0,5>
+  1567826022U,	// <0,5,u,0>: Cost 2 vsldoi4 <4,0,5,u>, LHS
+  2704971566U,	// <0,5,u,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+  2220076779U,	// <0,5,u,2>: Cost 3 vmrghw LHS, <5,2,1,3>
+  2641569942U,	// <0,5,u,3>: Cost 3 vsldoi4 <4,0,5,u>, <3,0,1,2>
+  1567828889U,	// <0,5,u,4>: Cost 2 vsldoi4 <4,0,5,u>, <4,0,5,u>
+  1146335236U,	// <0,5,u,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+  1146335330U,	// <0,5,u,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+  1713410308U,	// <0,5,u,7>: Cost 2 vsldoi12 <5,u,7,0>, <5,u,7,0>
+  1713484045U,	// <0,5,u,u>: Cost 2 vsldoi12 <5,u,u,0>, <5,u,u,0>
+  2214596949U,	// <0,6,0,0>: Cost 3 vmrghw <0,0,0,0>, <6,0,7,0>
+  2214678951U,	// <0,6,0,1>: Cost 3 vmrghw <0,0,1,1>, <6,1,7,1>
+  2214597114U,	// <0,6,0,2>: Cost 3 vmrghw <0,0,0,0>, <6,2,7,3>
+  3852857653U,	// <0,6,0,3>: Cost 4 vsldoi12 <4,5,6,0>, <6,0,3,4>
+  3832729919U,	// <0,6,0,4>: Cost 4 vsldoi12 <1,2,3,0>, <6,0,4,5>
+  3721293427U,	// <0,6,0,5>: Cost 4 vsldoi4 <5,0,6,0>, <5,0,6,0>
+  2214597432U,	// <0,6,0,6>: Cost 3 vmrghw <0,0,0,0>, <6,6,6,6>
+  1207962934U,	// <0,6,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS
+  1207962935U,	// <0,6,0,u>: Cost 2 vmrglw <0,0,0,0>, RHS
+  2215432481U,	// <0,6,1,0>: Cost 3 vmrghw LHS, <6,0,1,2>
+  2215432615U,	// <0,6,1,1>: Cost 3 vmrghw LHS, <6,1,7,1>
+  1141690874U,	// <0,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+  2215432754U,	// <0,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5>
+  2215432817U,	// <0,6,1,4>: Cost 3 vmrghw LHS, <6,4,2,5>
+  2215432939U,	// <0,6,1,5>: Cost 3 vmrghw LHS, <6,5,7,1>
+  1141691192U,	// <0,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+  1221905718U,	// <0,6,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS
+  1221905719U,	// <0,6,1,u>: Cost 2 vmrglw <2,3,0,1>, RHS
+  3852857787U,	// <0,6,2,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,2,0,3>
+  3289764265U,	// <0,6,2,1>: Cost 4 vmrghw <0,2,1,3>, <6,1,7,3>
+  3289690618U,	// <0,6,2,2>: Cost 4 vmrghw <0,2,0,3>, <6,2,7,3>
+  3862589907U,	// <0,6,2,3>: Cost 4 vsldoi12 <6,2,3,0>, <6,2,3,0>
+  3733253430U,	// <0,6,2,4>: Cost 4 vsldoi4 <7,0,6,2>, RHS
+  3733254242U,	// <0,6,2,5>: Cost 4 vsldoi4 <7,0,6,2>, <5,6,7,0>
+  3777390522U,	// <0,6,2,6>: Cost 4 vsldoi8 <3,2,0,6>, <2,6,3,7>
+  2785825274U,	// <0,6,2,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,7,3>
+  2785825283U,	// <0,6,2,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,u,3>
+  3777390742U,	// <0,6,3,0>: Cost 4 vsldoi8 <3,2,0,6>, <3,0,1,2>
+  3863106066U,	// <0,6,3,1>: Cost 4 vsldoi12 <6,3,1,0>, <6,3,1,0>
+  3777390899U,	// <0,6,3,2>: Cost 4 vsldoi8 <3,2,0,6>, <3,2,0,6>
+  3290436146U,	// <0,6,3,3>: Cost 4 vmrghw <0,3,1,4>, <6,3,4,5>
+  3779381762U,	// <0,6,3,4>: Cost 4 vsldoi8 <3,5,0,6>, <3,4,5,6>
+  3779381798U,	// <0,6,3,5>: Cost 4 vsldoi8 <3,5,0,6>, <3,5,0,6>
+  3733262920U,	// <0,6,3,6>: Cost 4 vsldoi4 <7,0,6,3>, <6,3,7,0>
+  2300972342U,	// <0,6,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS
+  2300972343U,	// <0,6,3,u>: Cost 3 vmrglw <3,2,0,3>, RHS
+  3802606482U,	// <0,6,4,0>: Cost 4 vsldoi8 <7,4,0,6>, <4,0,5,1>
+  2217365931U,	// <0,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5>
+  2217366010U,	// <0,6,4,2>: Cost 3 vmrghw <0,4,1,5>, <6,2,7,3>
+  3291107890U,	// <0,6,4,3>: Cost 4 vmrghw <0,4,1,5>, <6,3,4,5>
+  3291099805U,	// <0,6,4,4>: Cost 4 vmrghw <0,4,1,4>, <6,4,7,4>
+  3777391926U,	// <0,6,4,5>: Cost 4 vsldoi8 <3,2,0,6>, RHS
+  2217366328U,	// <0,6,4,6>: Cost 3 vmrghw <0,4,1,5>, <6,6,6,6>
+  2291027254U,	// <0,6,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS
+  2291027255U,	// <0,6,4,u>: Cost 3 vmrglw <1,5,0,4>, RHS
+  3852858033U,	// <0,6,5,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,5,0,6>
+  3395964532U,	// <0,6,5,1>: Cost 4 vmrglw <6,7,0,5>, <5,0,6,1>
+  3864507069U,	// <0,6,5,2>: Cost 4 vsldoi12 <6,5,2,0>, <6,5,2,0>
+  3376056678U,	// <0,6,5,3>: Cost 5 vmrglw <3,4,0,5>, <3,2,6,3>
+  3721334070U,	// <0,6,5,4>: Cost 4 vsldoi4 <5,0,6,5>, RHS
+  3395964860U,	// <0,6,5,5>: Cost 4 vmrglw <6,7,0,5>, <5,4,6,5>
+  3864802017U,	// <0,6,5,6>: Cost 4 vsldoi12 <6,5,6,0>, <6,5,6,0>
+  2302315830U,	// <0,6,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS
+  2302315831U,	// <0,6,5,u>: Cost 3 vmrglw <3,4,0,5>, RHS
+  3852858108U,	// <0,6,6,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,6,0,0>
+  3398624745U,	// <0,6,6,1>: Cost 4 vmrglw <7,2,0,6>, <2,0,6,1>
+  2218668538U,	// <0,6,6,2>: Cost 3 vmrghw <0,6,1,2>, <6,2,7,3>
+  3292418610U,	// <0,6,6,3>: Cost 4 vmrghw <0,6,1,3>, <6,3,4,5>
+  3733286198U,	// <0,6,6,4>: Cost 4 vsldoi4 <7,0,6,6>, RHS
+  3797299889U,	// <0,6,6,5>: Cost 4 vsldoi8 <6,5,0,6>, <6,5,0,6>
+  2785825592U,	// <0,6,6,6>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,6,6>
+  2785825602U,	// <0,6,6,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,7,7>
+  2785825611U,	// <0,6,6,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,u,7>
+  2785825614U,	// <0,6,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,0,1>
+  2758988632U,	// <0,6,7,1>: Cost 3 vsldoi12 <1,2,3,0>, <6,7,1,2>
+  3377400084U,	// <0,6,7,2>: Cost 4 vmrglw <3,6,0,7>, <3,1,6,2>
+  2792166248U,	// <0,6,7,3>: Cost 3 vsldoi12 <6,7,3,0>, <6,7,3,0>
+  2785825654U,	// <0,6,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,4,5>
+  2785825664U,	// <0,6,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6>
+  3859567493U,	// <0,6,7,6>: Cost 4 vsldoi12 <5,6,7,0>, <6,7,6,2>
+  2303659318U,	// <0,6,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS
+  2303659319U,	// <0,6,7,u>: Cost 3 vmrglw <3,6,0,7>, RHS
+  2785825695U,	// <0,6,u,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,0,1>
+  2220077479U,	// <0,6,u,1>: Cost 3 vmrghw LHS, <6,1,7,1>
+  1146335738U,	// <0,6,u,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+  2792829881U,	// <0,6,u,3>: Cost 3 vsldoi12 <6,u,3,0>, <6,u,3,0>
+  2785825735U,	// <0,6,u,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,4,5>
+  2785825664U,	// <0,6,u,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6>
+  1146336056U,	// <0,6,u,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+  1221963062U,	// <0,6,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS
+  1221963063U,	// <0,6,u,u>: Cost 2 vmrglw <2,3,0,u>, RHS
+  2653593600U,	// <0,7,0,0>: Cost 3 vsldoi4 <6,0,7,0>, <0,0,0,0>
+  2706309222U,	// <0,7,0,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+  3709421498U,	// <0,7,0,2>: Cost 4 vsldoi4 <3,0,7,0>, <2,6,3,7>
+  2281705978U,	// <0,7,0,3>: Cost 3 vmrglw <0,0,0,0>, <6,2,7,3>
+  2785825816U,	// <0,7,0,4>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,4,5>
+  2785825826U,	// <0,7,0,5>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,5,6>
+  2653598037U,	// <0,7,0,6>: Cost 3 vsldoi4 <6,0,7,0>, <6,0,7,0>
+  2214598252U,	// <0,7,0,7>: Cost 3 vmrghw <0,0,0,0>, <7,7,7,7>
+  2706309789U,	// <0,7,0,u>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+  1141691386U,	// <0,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+  2215433290U,	// <0,7,1,1>: Cost 3 vmrghw LHS, <7,1,1,1>
+  2706310038U,	// <0,7,1,2>: Cost 3 vsldoi8 <3,6,0,7>, <1,2,3,0>
+  2322190842U,	// <0,7,1,3>: Cost 3 vmrglw <6,7,0,1>, <6,2,7,3>
+  1141691750U,	// <0,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+  2215433654U,	// <0,7,1,5>: Cost 3 vmrghw LHS, <7,5,5,5>
+  2653606230U,	// <0,7,1,6>: Cost 3 vsldoi4 <6,0,7,1>, <6,0,7,1>
+  1141692012U,	// <0,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+  1141692034U,	// <0,7,1,u>: Cost 2 vmrghw LHS, <7,u,1,2>
+  2785825940U,	// <0,7,2,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,2,0,3>
+  3768108576U,	// <0,7,2,1>: Cost 5 vsldoi8 <1,6,0,7>, <2,1,3,2>
+  3780052584U,	// <0,7,2,2>: Cost 4 vsldoi8 <3,6,0,7>, <2,2,2,2>
+  2794820780U,	// <0,7,2,3>: Cost 3 vsldoi12 <7,2,3,0>, <7,2,3,0>
+  3859641528U,	// <0,7,2,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,2,4,3>
+  3733327970U,	// <0,7,2,5>: Cost 4 vsldoi4 <7,0,7,2>, <5,6,7,0>
+  3778062266U,	// <0,7,2,6>: Cost 4 vsldoi8 <3,3,0,7>, <2,6,3,7>
+  3733328944U,	// <0,7,2,7>: Cost 4 vsldoi4 <7,0,7,2>, <7,0,7,2>
+  2795189465U,	// <0,7,2,u>: Cost 3 vsldoi12 <7,2,u,0>, <7,2,u,0>
+  2324861026U,	// <0,7,3,0>: Cost 3 vmrglw <7,2,0,3>, <5,6,7,0>
+  3780053233U,	// <0,7,3,1>: Cost 4 vsldoi8 <3,6,0,7>, <3,1,2,3>
+  3780053296U,	// <0,7,3,2>: Cost 4 vsldoi8 <3,6,0,7>, <3,2,0,3>
+  3778062725U,	// <0,7,3,3>: Cost 4 vsldoi8 <3,3,0,7>, <3,3,0,7>
+  3780053506U,	// <0,7,3,4>: Cost 4 vsldoi8 <3,6,0,7>, <3,4,5,6>
+  3803941469U,	// <0,7,3,5>: Cost 4 vsldoi8 <7,6,0,7>, <3,5,6,7>
+  2706311800U,	// <0,7,3,6>: Cost 3 vsldoi8 <3,6,0,7>, <3,6,0,7>
+  3398603586U,	// <0,7,3,7>: Cost 4 vmrglw <7,2,0,3>, <6,6,7,7>
+  2707639066U,	// <0,7,3,u>: Cost 3 vsldoi8 <3,u,0,7>, <3,u,0,7>
+  2217366522U,	// <0,7,4,0>: Cost 3 vmrghw <0,4,1,5>, <7,0,1,2>
+  3727369110U,	// <0,7,4,1>: Cost 4 vsldoi4 <6,0,7,4>, <1,2,3,0>
+  3291108500U,	// <0,7,4,2>: Cost 4 vmrghw <0,4,1,5>, <7,2,0,3>
+  3727370872U,	// <0,7,4,3>: Cost 4 vsldoi4 <6,0,7,4>, <3,6,0,7>
+  2217366886U,	// <0,7,4,4>: Cost 3 vmrghw <0,4,1,5>, <7,4,5,6>
+  2706312502U,	// <0,7,4,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+  3786026321U,	// <0,7,4,6>: Cost 4 vsldoi8 <4,6,0,7>, <4,6,0,7>
+  2217367148U,	// <0,7,4,7>: Cost 3 vmrghw <0,4,1,5>, <7,7,7,7>
+  2706312745U,	// <0,7,4,u>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+  2322223202U,	// <0,7,5,0>: Cost 3 vmrglw <6,7,0,5>, <5,6,7,0>
+  3399946987U,	// <0,7,5,1>: Cost 4 vmrglw <7,4,0,5>, <6,5,7,1>
+  3291780244U,	// <0,7,5,2>: Cost 4 vmrghw <0,5,1,6>, <7,2,0,3>
+  3727378582U,	// <0,7,5,3>: Cost 4 vsldoi4 <6,0,7,5>, <3,0,1,2>
+  3727379766U,	// <0,7,5,4>: Cost 4 vsldoi4 <6,0,7,5>, RHS
+  3859568054U,	// <0,7,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,5,5,5>
+  2785826241U,	// <0,7,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <7,5,6,7>
+  3395965762U,	// <0,7,5,7>: Cost 4 vmrglw <6,7,0,5>, <6,6,7,7>
+  2787153363U,	// <0,7,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <7,5,u,7>
+  2785826268U,	// <0,7,6,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,6,0,7>
+  3780055420U,	// <0,7,6,1>: Cost 5 vsldoi8 <3,6,0,7>, <6,1,2,3>
+  3859568110U,	// <0,7,6,2>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,2,7>
+  3874534903U,	// <0,7,6,3>: Cost 4 vsldoi12 <u,2,3,0>, <7,6,3,7>
+  3859641856U,	// <0,7,6,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,6,4,7>
+  3733360738U,	// <0,7,6,5>: Cost 4 vsldoi4 <7,0,7,6>, <5,6,7,0>
+  3859568145U,	// <0,7,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,6,6>
+  2797770260U,	// <0,7,6,7>: Cost 3 vsldoi12 <7,6,7,0>, <7,6,7,0>
+  2797843997U,	// <0,7,6,u>: Cost 3 vsldoi12 <7,6,u,0>, <7,6,u,0>
+  2785826342U,	// <0,7,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,0,0>
+  3727393686U,	// <0,7,7,1>: Cost 4 vsldoi4 <6,0,7,7>, <1,2,3,0>
+  3868563003U,	// <0,7,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <7,7,2,3>
+  3377397988U,	// <0,7,7,3>: Cost 4 vmrglw <3,6,0,7>, <0,2,7,3>
+  2219349350U,	// <0,7,7,4>: Cost 3 vmrghw <0,7,1,4>, <7,4,5,6>
+  3859568217U,	// <0,7,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,7,5,6>
+  2730202588U,	// <0,7,7,6>: Cost 3 vsldoi8 <7,6,0,7>, <7,6,0,7>
+  2785826412U,	// <0,7,7,7>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,7,7>
+  2731529854U,	// <0,7,7,u>: Cost 3 vsldoi8 <7,u,0,7>, <7,u,0,7>
+  1146336250U,	// <0,7,u,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+  2706315054U,	// <0,7,u,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+  2653660845U,	// <0,7,u,2>: Cost 3 vsldoi4 <6,0,7,u>, <2,3,0,u>
+  2322248186U,	// <0,7,u,3>: Cost 3 vmrglw <6,7,0,u>, <6,2,7,3>
+  1146336614U,	// <0,7,u,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+  2706315418U,	// <0,7,u,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+  2653663581U,	// <0,7,u,6>: Cost 3 vsldoi4 <6,0,7,u>, <6,0,7,u>
+  1146336876U,	// <0,7,u,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+  1146336898U,	// <0,7,u,u>: Cost 2 vmrghw LHS, <7,u,1,2>
+  202162278U,	// <0,u,0,0>: Cost 1 vspltisw0 LHS
+  1624612966U,	// <0,u,0,1>: Cost 2 vsldoi8 <2,3,0,u>, LHS
+  2629780986U,	// <0,u,0,2>: Cost 3 vsldoi4 <2,0,u,0>, <2,0,u,0>
+  1207959708U,	// <0,u,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS
+  1544097078U,	// <0,u,0,4>: Cost 2 vsldoi4 <0,0,u,0>, RHS
+  1140856986U,	// <0,u,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS
+  2698355253U,	// <0,u,0,6>: Cost 3 vsldoi8 <2,3,0,u>, <0,6,u,7>
+  1207962952U,	// <0,u,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS
+  202162278U,	// <0,u,0,u>: Cost 1 vspltisw0 LHS
+  1142134483U,	// <0,u,1,0>: Cost 2 vmrghw LHS, <u,0,1,2>
+  67950382U,	// <0,u,1,1>: Cost 1 vmrghw LHS, LHS
+  1142175624U,	// <0,u,1,2>: Cost 2 vmrghw LHS, <u,2,3,3>
+  1142175676U,	// <0,u,1,3>: Cost 2 vmrghw LHS, <u,3,0,1>
+  1142134847U,	// <0,u,1,4>: Cost 2 vmrghw LHS, <u,4,5,6>
+  67950746U,	// <0,u,1,5>: Cost 1 vmrghw LHS, RHS
+  1142175952U,	// <0,u,1,6>: Cost 2 vmrghw LHS, <u,6,3,7>
+  1221905736U,	// <0,u,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS
+  67950949U,	// <0,u,1,u>: Cost 1 vmrghw LHS, LHS
+  1562026086U,	// <0,u,2,0>: Cost 2 vsldoi4 <3,0,u,2>, LHS
+  2216015662U,	// <0,u,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS
+  2698356328U,	// <0,u,2,2>: Cost 3 vsldoi8 <2,3,0,u>, <2,2,2,2>
+  835584U,	// <0,u,2,3>: Cost 0 copy LHS
+  1562029366U,	// <0,u,2,4>: Cost 2 vsldoi4 <3,0,u,2>, RHS
+  2216016026U,	// <0,u,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS
+  2698356666U,	// <0,u,2,6>: Cost 3 vsldoi8 <2,3,0,u>, <2,6,3,7>
+  1585919033U,	// <0,u,2,7>: Cost 2 vsldoi4 <7,0,u,2>, <7,0,u,2>
+  835584U,	// <0,u,2,u>: Cost 0 copy LHS
+  2758989756U,	// <0,u,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,0,1>
+  2216662830U,	// <0,u,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS
+  2703665461U,	// <0,u,3,2>: Cost 3 vsldoi8 <3,2,0,u>, <3,2,0,u>
+  2758989782U,	// <0,u,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,3,0>
+  2758989796U,	// <0,u,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,4,5>
+  2216663194U,	// <0,u,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS
+  2706319993U,	// <0,u,3,6>: Cost 3 vsldoi8 <3,6,0,u>, <3,6,0,u>
+  2300972360U,	// <0,u,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS
+  2216663397U,	// <0,u,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS
+  2217367251U,	// <0,u,4,0>: Cost 3 vmrghw <0,4,1,5>, <u,0,1,2>
+  1143625518U,	// <0,u,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS
+  2217367432U,	// <0,u,4,2>: Cost 3 vmrghw <0,4,1,5>, <u,2,3,3>
+  2217367484U,	// <0,u,4,3>: Cost 3 vmrghw <0,4,1,5>, <u,3,0,1>
+  1143619922U,	// <0,u,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+  1143625882U,	// <0,u,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS
+  2217367760U,	// <0,u,4,6>: Cost 3 vmrghw <0,4,1,5>, <u,6,3,7>
+  2291027272U,	// <0,u,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS
+  1143626085U,	// <0,u,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS
+  2635792486U,	// <0,u,5,0>: Cost 3 vsldoi4 <3,0,u,5>, LHS
+  2635793302U,	// <0,u,5,1>: Cost 3 vsldoi4 <3,0,u,5>, <1,2,3,0>
+  2302314646U,	// <0,u,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2>
+  2635794648U,	// <0,u,5,3>: Cost 3 vsldoi4 <3,0,u,5>, <3,0,u,5>
+  2635795766U,	// <0,u,5,4>: Cost 3 vsldoi4 <3,0,u,5>, RHS
+  2717601754U,	// <0,u,5,5>: Cost 3 vsldoi8 <5,5,0,u>, <5,5,0,u>
+  1685248154U,	// <0,u,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2302315848U,	// <0,u,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS
+  1685248172U,	// <0,u,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2759358645U,	// <0,u,6,0>: Cost 3 vsldoi12 <1,2,u,0>, <u,6,0,7>
+  2218637102U,	// <0,u,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS
+  2724901370U,	// <0,u,6,2>: Cost 3 vsldoi8 <6,7,0,u>, <6,2,7,3>
+  2758990032U,	// <0,u,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,6,3,7>
+  2659691830U,	// <0,u,6,4>: Cost 3 vsldoi4 <7,0,u,6>, RHS
+  2659471458U,	// <0,u,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0>
+  2724901688U,	// <0,u,6,6>: Cost 3 vsldoi8 <6,7,0,u>, <6,6,6,6>
+  1651159893U,	// <0,u,6,7>: Cost 2 vsldoi8 <6,7,0,u>, <6,7,0,u>
+  1651823526U,	// <0,u,6,u>: Cost 2 vsldoi8 <6,u,0,u>, <6,u,0,u>
+  2785827072U,	// <0,u,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,0,1>
+  2803964168U,	// <0,u,7,1>: Cost 3 vsldoi12 <u,7,1,0>, <u,7,1,0>
+  2727556249U,	// <0,u,7,2>: Cost 3 vsldoi8 <7,2,0,u>, <7,2,0,u>
+  2303656092U,	// <0,u,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS
+  2785827112U,	// <0,u,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,4,5>
+  2785827122U,	// <0,u,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,5,6>
+  2730210781U,	// <0,u,7,6>: Cost 3 vsldoi8 <7,6,0,u>, <7,6,0,u>
+  2303659336U,	// <0,u,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS
+  2303656097U,	// <0,u,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS
+  202162278U,	// <0,u,u,0>: Cost 1 vspltisw0 LHS
+  72595246U,	// <0,u,u,1>: Cost 1 vmrghw LHS, LHS
+  1146337160U,	// <0,u,u,2>: Cost 2 vmrghw LHS, <u,2,3,3>
+  835584U,	// <0,u,u,3>: Cost 0 copy LHS
+  1146337343U,	// <0,u,u,4>: Cost 2 vmrghw LHS, <u,4,5,6>
+  72595610U,	// <0,u,u,5>: Cost 1 vmrghw LHS, RHS
+  1146337488U,	// <0,u,u,6>: Cost 2 vmrghw LHS, <u,6,3,7>
+  1221963080U,	// <0,u,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS
+  835584U,	// <0,u,u,u>: Cost 0 copy LHS
+  2756853760U,	// <1,0,0,0>: Cost 3 vsldoi12 <0,u,1,1>, <0,0,0,0>
+  1677803530U,	// <1,0,0,1>: Cost 2 vsldoi12 <0,0,1,1>, <0,0,1,1>
+  3759497387U,	// <1,0,0,2>: Cost 4 vsldoi8 <0,2,1,0>, <0,2,1,0>
+  2686419196U,	// <1,0,0,3>: Cost 3 vsldoi8 <0,3,1,0>, <0,3,1,0>
+  2751766565U,	// <1,0,0,4>: Cost 3 vsldoi12 <0,0,4,1>, <0,0,4,1>
+  2687746462U,	// <1,0,0,5>: Cost 3 vsldoi8 <0,5,1,0>, <0,5,1,0>
+  3776086518U,	// <1,0,0,6>: Cost 4 vsldoi8 <3,0,1,0>, <0,6,1,7>
+  2689073728U,	// <1,0,0,7>: Cost 3 vsldoi8 <0,7,1,0>, <0,7,1,0>
+  1678319689U,	// <1,0,0,u>: Cost 2 vsldoi12 <0,0,u,1>, <0,0,u,1>
+  2287091712U,	// <1,0,1,0>: Cost 3 vmrglw <0,u,1,1>, <0,0,0,0>
+  1147568230U,	// <1,0,1,1>: Cost 2 vmrghw <1,1,1,1>, LHS
+  1683112038U,	// <1,0,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  3294970108U,	// <1,0,1,3>: Cost 4 vmrghw <1,1,0,0>, <0,3,1,0>
+  2623892790U,	// <1,0,1,4>: Cost 3 vsldoi4 <1,1,0,1>, RHS
+  2647781007U,	// <1,0,1,5>: Cost 3 vsldoi4 <5,1,0,1>, <5,1,0,1>
+  2791948430U,	// <1,0,1,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7>
+  3721524218U,	// <1,0,1,7>: Cost 4 vsldoi4 <5,1,0,1>, <7,0,1,2>
+  1683112092U,	// <1,0,1,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  2222112768U,	// <1,0,2,0>: Cost 3 vmrghw <1,2,3,0>, <0,0,0,0>
+  1148371046U,	// <1,0,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS
+  3356862524U,	// <1,0,2,2>: Cost 4 vmrglw <0,2,1,2>, <2,u,0,2>
+  2702345894U,	// <1,0,2,3>: Cost 3 vsldoi8 <3,0,1,0>, <2,3,0,1>
+  2222113106U,	// <1,0,2,4>: Cost 3 vmrghw <1,2,3,0>, <0,4,1,5>
+  2299709908U,	// <1,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5>
+  3760162746U,	// <1,0,2,6>: Cost 4 vsldoi8 <0,3,1,0>, <2,6,3,7>
+  3369470584U,	// <1,0,2,7>: Cost 4 vmrglw <2,3,1,2>, <3,6,0,7>
+  1148371613U,	// <1,0,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS
+  2686421142U,	// <1,0,3,0>: Cost 3 vsldoi8 <0,3,1,0>, <3,0,1,2>
+  2283128486U,	// <1,0,3,1>: Cost 3 vmrglw <0,2,1,3>, <2,3,0,1>
+  3296305326U,	// <1,0,3,2>: Cost 4 vmrghw <1,3,0,1>, <0,2,1,3>
+  3760163199U,	// <1,0,3,3>: Cost 4 vsldoi8 <0,3,1,0>, <3,3,0,1>
+  3760163330U,	// <1,0,3,4>: Cost 4 vsldoi8 <0,3,1,0>, <3,4,5,6>
+  3779406377U,	// <1,0,3,5>: Cost 4 vsldoi8 <3,5,1,0>, <3,5,1,0>
+  3865690416U,	// <1,0,3,6>: Cost 4 vsldoi12 <6,7,0,1>, <0,3,6,7>
+  3366824568U,	// <1,0,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,0,7>
+  2707655452U,	// <1,0,3,u>: Cost 3 vsldoi8 <3,u,1,0>, <3,u,1,0>
+  2734861202U,	// <1,0,4,0>: Cost 3 vsldoi8 <u,4,1,0>, <4,0,5,1>
+  2756854098U,	// <1,0,4,1>: Cost 3 vsldoi12 <0,u,1,1>, <0,4,1,5>
+  3830595931U,	// <1,0,4,2>: Cost 5 vsldoi12 <0,u,1,1>, <0,4,2,5>
+  3296968960U,	// <1,0,4,3>: Cost 4 vmrghw <1,4,0,1>, <0,3,1,4>
+  3830595949U,	// <1,0,4,4>: Cost 4 vsldoi12 <0,u,1,1>, <0,4,4,5>
+  2686422326U,	// <1,0,4,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+  3297378806U,	// <1,0,4,6>: Cost 5 vmrghw <1,4,5,6>, <0,6,1,7>
+  3810594248U,	// <1,0,4,7>: Cost 4 vsldoi8 <u,7,1,0>, <4,7,5,0>
+  2686422569U,	// <1,0,4,u>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+  2284470272U,	// <1,0,5,0>: Cost 3 vmrglw <0,4,1,5>, <0,0,0,0>
+  2284471974U,	// <1,0,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,1>
+  3809267435U,	// <1,0,5,2>: Cost 4 vsldoi8 <u,5,1,0>, <5,2,1,3>
+  3297968384U,	// <1,0,5,3>: Cost 4 vmrghw <1,5,4,6>, <0,3,1,4>
+  2284471977U,	// <1,0,5,4>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,4>
+  3721555603U,	// <1,0,5,5>: Cost 4 vsldoi4 <5,1,0,5>, <5,1,0,5>
+  3792679010U,	// <1,0,5,6>: Cost 4 vsldoi8 <5,7,1,0>, <5,6,7,0>
+  3792679037U,	// <1,0,5,7>: Cost 4 vsldoi8 <5,7,1,0>, <5,7,1,0>
+  2284471981U,	// <1,0,5,u>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,u>
+  3356893184U,	// <1,0,6,0>: Cost 4 vmrglw <0,2,1,6>, <0,0,0,0>
+  2224676966U,	// <1,0,6,1>: Cost 3 vmrghw <1,6,1,7>, LHS
+  3298295985U,	// <1,0,6,2>: Cost 4 vmrghw <1,6,0,1>, <0,2,1,6>
+  3298345212U,	// <1,0,6,3>: Cost 4 vmrghw <1,6,0,7>, <0,3,1,0>
+  2224972114U,	// <1,0,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5>
+  3808604907U,	// <1,0,6,5>: Cost 4 vsldoi8 <u,4,1,0>, <6,5,7,1>
+  3799978808U,	// <1,0,6,6>: Cost 4 vsldoi8 <7,0,1,0>, <6,6,6,6>
+  2726237006U,	// <1,0,6,7>: Cost 3 vsldoi8 <7,0,1,0>, <6,7,0,1>
+  2224677522U,	// <1,0,6,u>: Cost 3 vmrghw <1,6,1,7>, <0,u,1,1>
+  2726237176U,	// <1,0,7,0>: Cost 3 vsldoi8 <7,0,1,0>, <7,0,1,0>
+  2285815462U,	// <1,0,7,1>: Cost 3 vmrglw <0,6,1,7>, <2,3,0,1>
+  3805951193U,	// <1,0,7,2>: Cost 4 vsldoi8 <u,0,1,0>, <7,2,u,0>
+  3807941859U,	// <1,0,7,3>: Cost 4 vsldoi8 <u,3,1,0>, <7,3,0,1>
+  3799979366U,	// <1,0,7,4>: Cost 4 vsldoi8 <7,0,1,0>, <7,4,5,6>
+  3803297165U,	// <1,0,7,5>: Cost 4 vsldoi8 <7,5,1,0>, <7,5,1,0>
+  3799979540U,	// <1,0,7,6>: Cost 4 vsldoi8 <7,0,1,0>, <7,6,7,0>
+  3799979628U,	// <1,0,7,7>: Cost 4 vsldoi8 <7,0,1,0>, <7,7,7,7>
+  2731546240U,	// <1,0,7,u>: Cost 3 vsldoi8 <7,u,1,0>, <7,u,1,0>
+  2284494848U,	// <1,0,u,0>: Cost 3 vmrglw <0,4,1,u>, <0,0,0,0>
+  1683112594U,	// <1,0,u,1>: Cost 2 vsldoi12 <0,u,1,1>, <0,u,1,1>
+  1683112605U,	// <1,0,u,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  2734200772U,	// <1,0,u,3>: Cost 3 vsldoi8 <u,3,1,0>, <u,3,1,0>
+  2757075629U,	// <1,0,u,4>: Cost 3 vsldoi12 <0,u,4,1>, <0,u,4,1>
+  2686425242U,	// <1,0,u,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+  2791948430U,	// <1,0,u,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7>
+  2736855304U,	// <1,0,u,7>: Cost 3 vsldoi8 <u,7,1,0>, <u,7,1,0>
+  1683112659U,	// <1,0,u,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  1610694666U,	// <1,1,0,0>: Cost 2 vsldoi8 <0,0,1,1>, <0,0,1,1>
+  1616003174U,	// <1,1,0,1>: Cost 2 vsldoi8 <0,u,1,1>, LHS
+  2283767958U,	// <1,1,0,2>: Cost 3 vmrglw <0,3,1,0>, <3,0,1,2>
+  3357507596U,	// <1,1,0,3>: Cost 4 vmrglw <0,3,1,0>, <0,0,1,3>
+  2689745234U,	// <1,1,0,4>: Cost 3 vsldoi8 <0,u,1,1>, <0,4,1,5>
+  3357507922U,	// <1,1,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,1,5>
+  3294397647U,	// <1,1,0,6>: Cost 4 vmrghw <1,0,1,2>, <1,6,1,7>
+  3373433334U,	// <1,1,0,7>: Cost 4 vmrglw <3,0,1,0>, <0,6,1,7>
+  1616003730U,	// <1,1,0,u>: Cost 2 vsldoi8 <0,u,1,1>, <0,u,1,1>
+  1550221414U,	// <1,1,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+  269271142U,	// <1,1,1,1>: Cost 1 vspltisw1 LHS
+  2287093910U,	// <1,1,1,2>: Cost 3 vmrglw <0,u,1,1>, <3,0,1,2>
+  2287092615U,	// <1,1,1,3>: Cost 3 vmrglw <0,u,1,1>, <1,2,1,3>
+  1550224694U,	// <1,1,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+  2287092050U,	// <1,1,1,5>: Cost 3 vmrglw <0,u,1,1>, <0,4,1,5>
+  2689746127U,	// <1,1,1,6>: Cost 3 vsldoi8 <0,u,1,1>, <1,6,1,7>
+  2659800138U,	// <1,1,1,7>: Cost 3 vsldoi4 <7,1,1,1>, <7,1,1,1>
+  269271142U,	// <1,1,1,u>: Cost 1 vspltisw1 LHS
+  2222113516U,	// <1,1,2,0>: Cost 3 vmrghw <1,2,3,0>, <1,0,2,1>
+  2756854663U,	// <1,1,2,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,2,1,3>
+  1148371862U,	// <1,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  2689746598U,	// <1,1,2,3>: Cost 3 vsldoi8 <0,u,1,1>, <2,3,0,1>
+  2618002742U,	// <1,1,2,4>: Cost 3 vsldoi4 <0,1,1,2>, RHS
+  2299707730U,	// <1,1,2,5>: Cost 3 vmrglw <3,0,1,2>, <0,4,1,5>
+  2689746874U,	// <1,1,2,6>: Cost 3 vsldoi8 <0,u,1,1>, <2,6,3,7>
+  3361506511U,	// <1,1,2,7>: Cost 4 vmrglw <1,0,1,2>, <1,6,1,7>
+  1148371862U,	// <1,1,2,u>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  2689747094U,	// <1,1,3,0>: Cost 3 vsldoi8 <0,u,1,1>, <3,0,1,2>
+  2691074278U,	// <1,1,3,1>: Cost 3 vsldoi8 <1,1,1,1>, <3,1,1,1>
+  3356870806U,	// <1,1,3,2>: Cost 4 vmrglw <0,2,1,3>, <3,0,1,2>
+  2283126958U,	// <1,1,3,3>: Cost 3 vmrglw <0,2,1,3>, <0,2,1,3>
+  2689747458U,	// <1,1,3,4>: Cost 3 vsldoi8 <0,u,1,1>, <3,4,5,6>
+  3356868946U,	// <1,1,3,5>: Cost 4 vmrglw <0,2,1,3>, <0,4,1,5>
+  3811265144U,	// <1,1,3,6>: Cost 4 vsldoi8 <u,u,1,1>, <3,6,0,7>
+  3362841807U,	// <1,1,3,7>: Cost 4 vmrglw <1,2,1,3>, <1,6,1,7>
+  2689747742U,	// <1,1,3,u>: Cost 3 vsldoi8 <0,u,1,1>, <3,u,1,2>
+  2623987814U,	// <1,1,4,0>: Cost 3 vsldoi4 <1,1,1,4>, LHS
+  2758181931U,	// <1,1,4,1>: Cost 3 vsldoi12 <1,1,1,1>, <1,4,1,5>
+  2223408022U,	// <1,1,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0>
+  3697731734U,	// <1,1,4,3>: Cost 4 vsldoi4 <1,1,1,4>, <3,0,1,2>
+  2283798784U,	// <1,1,4,4>: Cost 3 vmrglw <0,3,1,4>, <0,3,1,4>
+  1616006454U,	// <1,1,4,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+  3297379535U,	// <1,1,4,6>: Cost 4 vmrghw <1,4,5,6>, <1,6,1,7>
+  3373466102U,	// <1,1,4,7>: Cost 4 vmrglw <3,0,1,4>, <0,6,1,7>
+  1616006697U,	// <1,1,4,u>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+  2760762479U,	// <1,1,5,0>: Cost 3 vsldoi12 <1,5,0,1>, <1,5,0,1>
+  2284470282U,	// <1,1,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,1>
+  2284472470U,	// <1,1,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,1,2>
+  3358212270U,	// <1,1,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,1,3>
+  2284470285U,	// <1,1,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,4>
+  1210728786U,	// <1,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  2737524834U,	// <1,1,5,6>: Cost 3 vsldoi8 <u,u,1,1>, <5,6,7,0>
+  3360867535U,	// <1,1,5,7>: Cost 4 vmrglw <0,u,1,5>, <1,6,1,7>
+  1210728786U,	// <1,1,5,u>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  3697746022U,	// <1,1,6,0>: Cost 4 vsldoi4 <1,1,1,6>, LHS
+  2756854991U,	// <1,1,6,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,6,1,7>
+  2737525242U,	// <1,1,6,2>: Cost 3 vsldoi8 <u,u,1,1>, <6,2,7,3>
+  3839149281U,	// <1,1,6,3>: Cost 4 vsldoi12 <2,3,0,1>, <1,6,3,7>
+  3697749302U,	// <1,1,6,4>: Cost 4 vsldoi4 <1,1,1,6>, RHS
+  3356893522U,	// <1,1,6,5>: Cost 4 vmrglw <0,2,1,6>, <0,4,1,5>
+  2283151537U,	// <1,1,6,6>: Cost 3 vmrglw <0,2,1,6>, <0,2,1,6>
+  2791949566U,	// <1,1,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <1,6,7,0>
+  2792613127U,	// <1,1,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <1,6,u,0>
+  2737525754U,	// <1,1,7,0>: Cost 3 vsldoi8 <u,u,1,1>, <7,0,1,2>
+  2291786386U,	// <1,1,7,1>: Cost 3 vmrglw <1,6,1,7>, <0,u,1,1>
+  3365528292U,	// <1,1,7,2>: Cost 4 vmrglw <1,6,1,7>, <1,0,1,2>
+  3365528455U,	// <1,1,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,1,3>
+  2737526118U,	// <1,1,7,4>: Cost 3 vsldoi8 <u,u,1,1>, <7,4,5,6>
+  3365527890U,	// <1,1,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,1,5>
+  3365528377U,	// <1,1,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,1,1,6>
+  2291786959U,	// <1,1,7,7>: Cost 3 vmrglw <1,6,1,7>, <1,6,1,7>
+  2737526402U,	// <1,1,7,u>: Cost 3 vsldoi8 <u,u,1,1>, <7,u,1,2>
+  1550221414U,	// <1,1,u,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+  269271142U,	// <1,1,u,1>: Cost 1 vspltisw1 LHS
+  1148371862U,	// <1,1,u,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  2689750972U,	// <1,1,u,3>: Cost 3 vsldoi8 <0,u,1,1>, <u,3,0,1>
+  1550224694U,	// <1,1,u,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+  1616009370U,	// <1,1,u,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+  2689751248U,	// <1,1,u,6>: Cost 3 vsldoi8 <0,u,1,1>, <u,6,3,7>
+  2736863497U,	// <1,1,u,7>: Cost 3 vsldoi8 <u,7,1,1>, <u,7,1,1>
+  269271142U,	// <1,1,u,u>: Cost 1 vspltisw1 LHS
+  2702360576U,	// <1,2,0,0>: Cost 3 vsldoi8 <3,0,1,2>, <0,0,0,0>
+  1628618854U,	// <1,2,0,1>: Cost 2 vsldoi8 <3,0,1,2>, LHS
+  2685771949U,	// <1,2,0,2>: Cost 3 vsldoi8 <0,2,1,2>, <0,2,1,2>
+  2283765862U,	// <1,2,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS
+  2702360914U,	// <1,2,0,4>: Cost 3 vsldoi8 <3,0,1,2>, <0,4,1,5>
+  3788046813U,	// <1,2,0,5>: Cost 4 vsldoi8 <5,0,1,2>, <0,5,u,0>
+  2688426481U,	// <1,2,0,6>: Cost 3 vsldoi8 <0,6,1,2>, <0,6,1,2>
+  2726249024U,	// <1,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0>
+  1628619421U,	// <1,2,0,u>: Cost 2 vsldoi8 <3,0,1,2>, LHS
+  2690417380U,	// <1,2,1,0>: Cost 3 vsldoi8 <1,0,1,2>, <1,0,1,2>
+  2702361396U,	// <1,2,1,1>: Cost 3 vsldoi8 <3,0,1,2>, <1,1,1,1>
+  2287093352U,	// <1,2,1,2>: Cost 3 vmrglw <0,u,1,1>, <2,2,2,2>
+  1213349990U,	// <1,2,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS
+  3764159522U,	// <1,2,1,4>: Cost 4 vsldoi8 <1,0,1,2>, <1,4,0,5>
+  3295053672U,	// <1,2,1,5>: Cost 4 vmrghw <1,1,1,1>, <2,5,3,6>
+  2221311930U,	// <1,2,1,6>: Cost 3 vmrghw <1,1,1,1>, <2,6,3,7>
+  3799991593U,	// <1,2,1,7>: Cost 4 vsldoi8 <7,0,1,2>, <1,7,2,7>
+  1213349995U,	// <1,2,1,u>: Cost 2 vmrglw <0,u,1,1>, LHS
+  2624045158U,	// <1,2,2,0>: Cost 3 vsldoi4 <1,1,2,2>, LHS
+  2702362144U,	// <1,2,2,1>: Cost 3 vsldoi8 <3,0,1,2>, <2,1,3,2>
+  2283120232U,	// <1,2,2,2>: Cost 3 vmrglw <0,2,1,2>, <2,2,2,2>
+  1225965670U,	// <1,2,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS
+  2624048438U,	// <1,2,2,4>: Cost 3 vsldoi4 <1,1,2,2>, RHS
+  3356860763U,	// <1,2,2,5>: Cost 4 vmrglw <0,2,1,2>, <0,4,2,5>
+  2222114746U,	// <1,2,2,6>: Cost 3 vmrghw <1,2,3,0>, <2,6,3,7>
+  2299708632U,	// <1,2,2,7>: Cost 3 vmrglw <3,0,1,2>, <1,6,2,7>
+  1225965675U,	// <1,2,2,u>: Cost 2 vmrglw <3,0,1,2>, LHS
+  470597734U,	// <1,2,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1544340276U,	// <1,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1544341096U,	// <1,2,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544341916U,	// <1,2,3,3>: Cost 2 vsldoi4 LHS, <3,3,3,3>
+  470601014U,	// <1,2,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1592119300U,	// <1,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+  1592119802U,	// <1,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1592120314U,	// <1,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  470603566U,	// <1,2,3,u>: Cost 1 vsldoi4 LHS, LHS
+  2708335471U,	// <1,2,4,0>: Cost 3 vsldoi8 <4,0,1,2>, <4,0,1,2>
+  3838043908U,	// <1,2,4,1>: Cost 4 vsldoi12 <2,1,3,1>, <2,4,1,5>
+  3357541992U,	// <1,2,4,2>: Cost 4 vmrglw <0,3,1,4>, <2,2,2,2>
+  2283798630U,	// <1,2,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS
+  2726251728U,	// <1,2,4,4>: Cost 3 vsldoi8 <7,0,1,2>, <4,4,4,4>
+  1628622134U,	// <1,2,4,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+  3297077178U,	// <1,2,4,6>: Cost 4 vmrghw <1,4,1,5>, <2,6,3,7>
+  2726251976U,	// <1,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0>
+  1628622377U,	// <1,2,4,u>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+  2714308168U,	// <1,2,5,0>: Cost 3 vsldoi8 <5,0,1,2>, <5,0,1,2>
+  3297633827U,	// <1,2,5,1>: Cost 4 vmrghw <1,5,0,1>, <2,1,3,5>
+  2284471912U,	// <1,2,5,2>: Cost 3 vmrglw <0,4,1,5>, <2,2,2,2>
+  1210728550U,	// <1,2,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS
+  3776106420U,	// <1,2,5,4>: Cost 4 vsldoi8 <3,0,1,2>, <5,4,5,6>
+  2726252548U,	// <1,2,5,5>: Cost 3 vsldoi8 <7,0,1,2>, <5,5,5,5>
+  2726252642U,	// <1,2,5,6>: Cost 3 vsldoi8 <7,0,1,2>, <5,6,7,0>
+  3799994538U,	// <1,2,5,7>: Cost 4 vsldoi8 <7,0,1,2>, <5,7,6,0>
+  1210728555U,	// <1,2,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS
+  2720280865U,	// <1,2,6,0>: Cost 3 vsldoi8 <6,0,1,2>, <6,0,1,2>
+  2702365096U,	// <1,2,6,1>: Cost 3 vsldoi8 <3,0,1,2>, <6,1,7,2>
+  2726253050U,	// <1,2,6,2>: Cost 3 vsldoi8 <7,0,1,2>, <6,2,7,3>
+  2283151462U,	// <1,2,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS
+  3697823030U,	// <1,2,6,4>: Cost 4 vsldoi4 <1,1,2,6>, RHS
+  3298715497U,	// <1,2,6,5>: Cost 4 vmrghw <1,6,5,7>, <2,5,3,7>
+  2726253368U,	// <1,2,6,6>: Cost 3 vsldoi8 <7,0,1,2>, <6,6,6,6>
+  2724926296U,	// <1,2,6,7>: Cost 3 vsldoi8 <6,7,1,2>, <6,7,1,2>
+  2283151467U,	// <1,2,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS
+  1652511738U,	// <1,2,7,0>: Cost 2 vsldoi8 <7,0,1,2>, <7,0,1,2>
+  3371500916U,	// <1,2,7,1>: Cost 4 vmrglw <2,6,1,7>, <1,u,2,1>
+  3365529192U,	// <1,2,7,2>: Cost 4 vmrglw <1,6,1,7>, <2,2,2,2>
+  2291785830U,	// <1,2,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS
+  2726253926U,	// <1,2,7,4>: Cost 3 vsldoi8 <7,0,1,2>, <7,4,5,6>
+  3788051845U,	// <1,2,7,5>: Cost 4 vsldoi8 <5,0,1,2>, <7,5,0,1>
+  3794023894U,	// <1,2,7,6>: Cost 4 vsldoi8 <6,0,1,2>, <7,6,0,1>
+  2726254119U,	// <1,2,7,7>: Cost 3 vsldoi8 <7,0,1,2>, <7,7,0,1>
+  1657820802U,	// <1,2,7,u>: Cost 2 vsldoi8 <7,u,1,2>, <7,u,1,2>
+  470638699U,	// <1,2,u,0>: Cost 1 vsldoi4 LHS, LHS
+  1544381236U,	// <1,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1544382056U,	// <1,2,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544382614U,	// <1,2,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+  470641974U,	// <1,2,u,4>: Cost 1 vsldoi4 LHS, RHS
+  1628625050U,	// <1,2,u,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+  1592160762U,	// <1,2,u,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1592161274U,	// <1,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  470644526U,	// <1,2,u,u>: Cost 1 vsldoi4 LHS, LHS
+  2769389708U,	// <1,3,0,0>: Cost 3 vsldoi12 <3,0,0,1>, <3,0,0,1>
+  2685780070U,	// <1,3,0,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2685780142U,	// <1,3,0,2>: Cost 3 vsldoi8 <0,2,1,3>, <0,2,1,3>
+  2686443775U,	// <1,3,0,3>: Cost 3 vsldoi8 <0,3,1,3>, <0,3,1,3>
+  2769684656U,	// <1,3,0,4>: Cost 3 vsldoi12 <3,0,4,1>, <3,0,4,1>
+  3357507940U,	// <1,3,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,3,5>
+  3759522294U,	// <1,3,0,6>: Cost 4 vsldoi8 <0,2,1,3>, <0,6,1,7>
+  3357509562U,	// <1,3,0,7>: Cost 4 vmrglw <0,3,1,0>, <2,6,3,7>
+  2685780637U,	// <1,3,0,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2287092630U,	// <1,3,1,0>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,0>
+  2221312230U,	// <1,3,1,1>: Cost 3 vmrghw <1,1,1,1>, <3,1,1,1>
+  2691752839U,	// <1,3,1,2>: Cost 3 vsldoi8 <1,2,1,3>, <1,2,1,3>
+  2287093362U,	// <1,3,1,3>: Cost 3 vmrglw <0,u,1,1>, <2,2,3,3>
+  2287092634U,	// <1,3,1,4>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,4>
+  3360835107U,	// <1,3,1,5>: Cost 4 vmrglw <0,u,1,1>, <2,1,3,5>
+  3759523041U,	// <1,3,1,6>: Cost 4 vsldoi8 <0,2,1,3>, <1,6,3,7>
+  2287093690U,	// <1,3,1,7>: Cost 3 vmrglw <0,u,1,1>, <2,6,3,7>
+  2287092638U,	// <1,3,1,u>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,u>
+  2222114966U,	// <1,3,2,0>: Cost 3 vmrghw <1,2,3,0>, <3,0,1,2>
+  2222115057U,	// <1,3,2,1>: Cost 3 vmrghw <1,2,3,0>, <3,1,2,3>
+  2630092320U,	// <1,3,2,2>: Cost 3 vsldoi4 <2,1,3,2>, <2,1,3,2>
+  2685781670U,	// <1,3,2,3>: Cost 3 vsldoi8 <0,2,1,3>, <2,3,0,1>
+  2222115330U,	// <1,3,2,4>: Cost 3 vmrghw <1,2,3,0>, <3,4,5,6>
+  3373449572U,	// <1,3,2,5>: Cost 4 vmrglw <3,0,1,2>, <0,4,3,5>
+  2222115448U,	// <1,3,2,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7>
+  2299709370U,	// <1,3,2,7>: Cost 3 vmrglw <3,0,1,2>, <2,6,3,7>
+  2222115614U,	// <1,3,2,u>: Cost 3 vmrghw <1,2,3,0>, <3,u,1,2>
+  2771380607U,	// <1,3,3,0>: Cost 3 vsldoi12 <3,3,0,1>, <3,3,0,1>
+  3356874468U,	// <1,3,3,1>: Cost 4 vmrglw <0,2,1,3>, <u,0,3,1>
+  3759524168U,	// <1,3,3,2>: Cost 4 vsldoi8 <0,2,1,3>, <3,2,3,0>
+  2283792796U,	// <1,3,3,3>: Cost 3 vmrglw <0,3,1,3>, <3,3,3,3>
+  3356869530U,	// <1,3,3,4>: Cost 4 vmrglw <0,2,1,3>, <1,2,3,4>
+  3721760428U,	// <1,3,3,5>: Cost 4 vsldoi4 <5,1,3,3>, <5,1,3,3>
+  3296496248U,	// <1,3,3,6>: Cost 4 vmrghw <1,3,2,6>, <3,6,0,7>
+  3356870586U,	// <1,3,3,7>: Cost 4 vmrglw <0,2,1,3>, <2,6,3,7>
+  2771970503U,	// <1,3,3,u>: Cost 3 vsldoi12 <3,3,u,1>, <3,3,u,1>
+  2772044240U,	// <1,3,4,0>: Cost 3 vsldoi12 <3,4,0,1>, <3,4,0,1>
+  3362186135U,	// <1,3,4,1>: Cost 4 vmrglw <1,1,1,4>, <1,2,3,1>
+  3297151280U,	// <1,3,4,2>: Cost 4 vmrghw <1,4,2,5>, <3,2,0,3>
+  3357542002U,	// <1,3,4,3>: Cost 4 vmrglw <0,3,1,4>, <2,2,3,3>
+  3357540626U,	// <1,3,4,4>: Cost 4 vmrglw <0,3,1,4>, <0,3,3,4>
+  2685783350U,	// <1,3,4,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+  3357546622U,	// <1,3,4,6>: Cost 4 vmrglw <0,3,1,4>, <u,5,3,6>
+  3357542330U,	// <1,3,4,7>: Cost 4 vmrglw <0,3,1,4>, <2,6,3,7>
+  2685783593U,	// <1,3,4,u>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+  2284471190U,	// <1,3,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,0>
+  3358213015U,	// <1,3,5,1>: Cost 4 vmrglw <0,4,1,5>, <1,2,3,1>
+  2630116899U,	// <1,3,5,2>: Cost 3 vsldoi4 <2,1,3,5>, <2,1,3,5>
+  2284471922U,	// <1,3,5,3>: Cost 3 vmrglw <0,4,1,5>, <2,2,3,3>
+  2284471194U,	// <1,3,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,4>
+  2284471843U,	// <1,3,5,5>: Cost 3 vmrglw <0,4,1,5>, <2,1,3,5>
+  3358218366U,	// <1,3,5,6>: Cost 4 vmrglw <0,4,1,5>, <u,5,3,6>
+  2284472250U,	// <1,3,5,7>: Cost 3 vmrglw <0,4,1,5>, <2,6,3,7>
+  2284471198U,	// <1,3,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,u>
+  2224752790U,	// <1,3,6,0>: Cost 3 vmrghw <1,6,2,7>, <3,0,1,2>
+  3832736385U,	// <1,3,6,1>: Cost 4 vsldoi12 <1,2,3,1>, <3,6,1,7>
+  3703866916U,	// <1,3,6,2>: Cost 4 vsldoi4 <2,1,3,6>, <2,1,3,6>
+  3356894834U,	// <1,3,6,3>: Cost 4 vmrglw <0,2,1,6>, <2,2,3,3>
+  3356894106U,	// <1,3,6,4>: Cost 4 vmrglw <0,2,1,6>, <1,2,3,4>
+  3356894755U,	// <1,3,6,5>: Cost 5 vmrglw <0,2,1,6>, <2,1,3,5>
+  3356899130U,	// <1,3,6,6>: Cost 4 vmrglw <0,2,1,6>, <u,1,3,6>
+  2283153338U,	// <1,3,6,7>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7>
+  2283153338U,	// <1,3,6,u>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7>
+  2774035139U,	// <1,3,7,0>: Cost 3 vsldoi12 <3,7,0,1>, <3,7,0,1>
+  3703874767U,	// <1,3,7,1>: Cost 4 vsldoi4 <2,1,3,7>, <1,6,1,7>
+  3703875109U,	// <1,3,7,2>: Cost 4 vsldoi4 <2,1,3,7>, <2,1,3,7>
+  3365529202U,	// <1,3,7,3>: Cost 4 vmrglw <1,6,1,7>, <2,2,3,3>
+  3365528474U,	// <1,3,7,4>: Cost 4 vmrglw <1,6,1,7>, <1,2,3,4>
+  3789387159U,	// <1,3,7,5>: Cost 4 vsldoi8 <5,2,1,3>, <7,5,2,1>
+  3865692927U,	// <1,3,7,6>: Cost 4 vsldoi12 <6,7,0,1>, <3,7,6,7>
+  3363538874U,	// <1,3,7,7>: Cost 4 vmrglw <1,3,1,7>, <2,6,3,7>
+  2774625035U,	// <1,3,7,u>: Cost 3 vsldoi12 <3,7,u,1>, <3,7,u,1>
+  2284495766U,	// <1,3,u,0>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,0>
+  2685785902U,	// <1,3,u,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2630141478U,	// <1,3,u,2>: Cost 3 vsldoi4 <2,1,3,u>, <2,1,3,u>
+  2283169880U,	// <1,3,u,3>: Cost 3 vmrglw <0,2,1,u>, <2,u,3,3>
+  2284495770U,	// <1,3,u,4>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,4>
+  2685786266U,	// <1,3,u,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+  2222115448U,	// <1,3,u,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7>
+  2284496826U,	// <1,3,u,7>: Cost 3 vmrglw <0,4,1,u>, <2,6,3,7>
+  2685786469U,	// <1,3,u,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2684461069U,	// <1,4,0,0>: Cost 3 vsldoi8 <0,0,1,4>, <0,0,1,4>
+  2686451814U,	// <1,4,0,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS
+  3759530159U,	// <1,4,0,2>: Cost 4 vsldoi8 <0,2,1,4>, <0,2,1,4>
+  2686451968U,	// <1,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4>
+  2684461394U,	// <1,4,0,4>: Cost 3 vsldoi8 <0,0,1,4>, <0,4,1,5>
+  1701989266U,	// <1,4,0,5>: Cost 2 vsldoi12 <4,0,5,1>, <4,0,5,1>
+  3776119286U,	// <1,4,0,6>: Cost 4 vsldoi8 <3,0,1,4>, <0,6,1,7>
+  2689106500U,	// <1,4,0,7>: Cost 3 vsldoi8 <0,7,1,4>, <0,7,1,4>
+  1702210477U,	// <1,4,0,u>: Cost 2 vsldoi12 <4,0,u,1>, <4,0,u,1>
+  2221312914U,	// <1,4,1,0>: Cost 3 vmrghw <1,1,1,1>, <4,0,5,1>
+  2691097399U,	// <1,4,1,1>: Cost 3 vsldoi8 <1,1,1,4>, <1,1,1,4>
+  3760194454U,	// <1,4,1,2>: Cost 4 vsldoi8 <0,3,1,4>, <1,2,3,0>
+  3766166489U,	// <1,4,1,3>: Cost 4 vsldoi8 <1,3,1,4>, <1,3,1,4>
+  2334870736U,	// <1,4,1,4>: Cost 3 vmrglw <u,u,1,1>, <4,4,4,4>
+  1147571510U,	// <1,4,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS
+  3760194794U,	// <1,4,1,6>: Cost 4 vsldoi8 <0,3,1,4>, <1,6,4,7>
+  3867315188U,	// <1,4,1,7>: Cost 4 vsldoi12 <7,0,4,1>, <4,1,7,0>
+  1147571753U,	// <1,4,1,u>: Cost 2 vmrghw <1,1,1,1>, RHS
+  2222115730U,	// <1,4,2,0>: Cost 3 vmrghw <1,2,3,0>, <4,0,5,1>
+  2222115812U,	// <1,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2>
+  3760195176U,	// <1,4,2,2>: Cost 4 vsldoi8 <0,3,1,4>, <2,2,2,2>
+  2702378662U,	// <1,4,2,3>: Cost 3 vsldoi8 <3,0,1,4>, <2,3,0,1>
+  2323598544U,	// <1,4,2,4>: Cost 3 vmrglw <7,0,1,2>, <4,4,4,4>
+  1148374326U,	// <1,4,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS
+  3760195514U,	// <1,4,2,6>: Cost 4 vsldoi8 <0,3,1,4>, <2,6,3,7>
+  3373451932U,	// <1,4,2,7>: Cost 4 vmrglw <3,0,1,2>, <3,6,4,7>
+  1148374569U,	// <1,4,2,u>: Cost 2 vmrghw <1,2,3,0>, RHS
+  2702379160U,	// <1,4,3,0>: Cost 3 vsldoi8 <3,0,1,4>, <3,0,1,4>
+  3760195840U,	// <1,4,3,1>: Cost 4 vsldoi8 <0,3,1,4>, <3,1,4,0>
+  3776121160U,	// <1,4,3,2>: Cost 4 vsldoi8 <3,0,1,4>, <3,2,3,0>
+  3760195996U,	// <1,4,3,3>: Cost 4 vsldoi8 <0,3,1,4>, <3,3,3,3>
+  2686454274U,	// <1,4,3,4>: Cost 3 vsldoi8 <0,3,1,4>, <3,4,5,6>
+  3356870350U,	// <1,4,3,5>: Cost 4 vmrglw <0,2,1,3>, <2,3,4,5>
+  3800009392U,	// <1,4,3,6>: Cost 4 vsldoi8 <7,0,1,4>, <3,6,7,0>
+  3366824604U,	// <1,4,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,4,7>
+  2707688224U,	// <1,4,3,u>: Cost 3 vsldoi8 <3,u,1,4>, <3,u,1,4>
+  2775731368U,	// <1,4,4,0>: Cost 3 vsldoi12 <4,0,5,1>, <4,4,0,0>
+  3830820018U,	// <1,4,4,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,4,1,1>
+  3691980454U,	// <1,4,4,2>: Cost 4 vsldoi4 <0,1,4,4>, <2,3,0,1>
+  3357541282U,	// <1,4,4,3>: Cost 4 vmrglw <0,3,1,4>, <1,2,4,3>
+  2781039824U,	// <1,4,4,4>: Cost 3 vsldoi12 <4,u,5,1>, <4,4,4,4>
+  2686455094U,	// <1,4,4,5>: Cost 3 vsldoi8 <0,3,1,4>, RHS
+  3357541528U,	// <1,4,4,6>: Cost 4 vmrglw <0,3,1,4>, <1,5,4,6>
+  3810627020U,	// <1,4,4,7>: Cost 4 vsldoi8 <u,7,1,4>, <4,7,5,4>
+  2686455337U,	// <1,4,4,u>: Cost 3 vsldoi8 <0,3,1,4>, RHS
+  2624217190U,	// <1,4,5,0>: Cost 3 vsldoi4 <1,1,4,5>, LHS
+  2284470309U,	// <1,4,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,1>
+  2618246822U,	// <1,4,5,2>: Cost 3 vsldoi4 <0,1,4,5>, <2,3,0,1>
+  3358212297U,	// <1,4,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,4,3>
+  2284470312U,	// <1,4,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,4>
+  2284470637U,	// <1,4,5,5>: Cost 3 vmrglw <0,4,1,5>, <0,4,4,5>
+  1683115318U,	// <1,4,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  3721851898U,	// <1,4,5,7>: Cost 4 vsldoi4 <5,1,4,5>, <7,0,1,2>
+  1683115336U,	// <1,4,5,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  3794039075U,	// <1,4,6,0>: Cost 4 vsldoi8 <6,0,1,4>, <6,0,1,4>
+  3830820186U,	// <1,4,6,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,6,1,7>
+  3800011258U,	// <1,4,6,2>: Cost 4 vsldoi8 <7,0,1,4>, <6,2,7,3>
+  3807973938U,	// <1,4,6,3>: Cost 4 vsldoi8 <u,3,1,4>, <6,3,4,5>
+  3298716880U,	// <1,4,6,4>: Cost 4 vmrghw <1,6,5,7>, <4,4,4,4>
+  2224680246U,	// <1,4,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS
+  3800011576U,	// <1,4,6,6>: Cost 4 vsldoi8 <7,0,1,4>, <6,6,6,6>
+  2726269774U,	// <1,4,6,7>: Cost 3 vsldoi8 <7,0,1,4>, <6,7,0,1>
+  2224680489U,	// <1,4,6,u>: Cost 3 vmrghw <1,6,1,7>, RHS
+  2726269948U,	// <1,4,7,0>: Cost 3 vsldoi8 <7,0,1,4>, <7,0,1,4>
+  3383444141U,	// <1,4,7,1>: Cost 4 vmrglw <4,6,1,7>, <0,u,4,1>
+  3805983961U,	// <1,4,7,2>: Cost 4 vsldoi8 <u,0,1,4>, <7,2,u,0>
+  3807974667U,	// <1,4,7,3>: Cost 4 vsldoi8 <u,3,1,4>, <7,3,4,5>
+  2736887142U,	// <1,4,7,4>: Cost 3 vsldoi8 <u,7,1,4>, <7,4,5,6>
+  3365528403U,	// <1,4,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,1,4,5>
+  3800012308U,	// <1,4,7,6>: Cost 4 vsldoi8 <7,0,1,4>, <7,6,7,0>
+  3800012396U,	// <1,4,7,7>: Cost 4 vsldoi8 <7,0,1,4>, <7,7,7,7>
+  2731579012U,	// <1,4,7,u>: Cost 3 vsldoi8 <7,u,1,4>, <7,u,1,4>
+  2624241766U,	// <1,4,u,0>: Cost 3 vsldoi4 <1,1,4,u>, LHS
+  2686457646U,	// <1,4,u,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS
+  2618271398U,	// <1,4,u,2>: Cost 3 vsldoi4 <0,1,4,u>, <2,3,0,1>
+  2734233544U,	// <1,4,u,3>: Cost 3 vsldoi8 <u,3,1,4>, <u,3,1,4>
+  2689775679U,	// <1,4,u,4>: Cost 3 vsldoi8 <0,u,1,4>, <u,4,5,6>
+  1152355638U,	// <1,4,u,5>: Cost 2 vmrghw <1,u,3,0>, RHS
+  1683115561U,	// <1,4,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  2736888076U,	// <1,4,u,7>: Cost 3 vsldoi8 <u,7,1,4>, <u,7,1,4>
+  1683115579U,	// <1,4,u,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  2687123456U,	// <1,5,0,0>: Cost 3 vsldoi8 <0,4,1,5>, <0,0,0,0>
+  1613381734U,	// <1,5,0,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  3759538352U,	// <1,5,0,2>: Cost 4 vsldoi8 <0,2,1,5>, <0,2,1,5>
+  3760865532U,	// <1,5,0,3>: Cost 4 vsldoi8 <0,4,1,5>, <0,3,1,0>
+  1613381970U,	// <1,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5>
+  2687787427U,	// <1,5,0,5>: Cost 3 vsldoi8 <0,5,1,5>, <0,5,1,5>
+  2781777524U,	// <1,5,0,6>: Cost 3 vsldoi12 <5,0,6,1>, <5,0,6,1>
+  3733828717U,	// <1,5,0,7>: Cost 4 vsldoi4 <7,1,5,0>, <7,1,5,0>
+  1613382301U,	// <1,5,0,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  2781040271U,	// <1,5,1,0>: Cost 3 vsldoi12 <4,u,5,1>, <5,1,0,1>
+  2687124276U,	// <1,5,1,1>: Cost 3 vsldoi8 <0,4,1,5>, <1,1,1,1>
+  2687124374U,	// <1,5,1,2>: Cost 3 vsldoi8 <0,4,1,5>, <1,2,3,0>
+  3760866297U,	// <1,5,1,3>: Cost 4 vsldoi8 <0,4,1,5>, <1,3,5,0>
+  2693096491U,	// <1,5,1,4>: Cost 3 vsldoi8 <1,4,1,5>, <1,4,1,5>
+  2687124591U,	// <1,5,1,5>: Cost 3 vsldoi8 <0,4,1,5>, <1,5,0,1>
+  2687124723U,	// <1,5,1,6>: Cost 3 vsldoi8 <0,4,1,5>, <1,6,5,7>
+  3360834803U,	// <1,5,1,7>: Cost 4 vmrglw <0,u,1,1>, <1,6,5,7>
+  2687124860U,	// <1,5,1,u>: Cost 3 vsldoi8 <0,4,1,5>, <1,u,3,0>
+  2323598792U,	// <1,5,2,0>: Cost 3 vmrglw <7,0,1,2>, <4,7,5,0>
+  2687125027U,	// <1,5,2,1>: Cost 3 vsldoi8 <0,4,1,5>, <2,1,3,5>
+  2687125096U,	// <1,5,2,2>: Cost 3 vsldoi8 <0,4,1,5>, <2,2,2,2>
+  2687125158U,	// <1,5,2,3>: Cost 3 vsldoi8 <0,4,1,5>, <2,3,0,1>
+  2642185188U,	// <1,5,2,4>: Cost 3 vsldoi4 <4,1,5,2>, <4,1,5,2>
+  2323598554U,	// <1,5,2,5>: Cost 3 vmrglw <7,0,1,2>, <4,4,5,5>
+  2687125434U,	// <1,5,2,6>: Cost 3 vsldoi8 <0,4,1,5>, <2,6,3,7>
+  3373450483U,	// <1,5,2,7>: Cost 4 vmrglw <3,0,1,2>, <1,6,5,7>
+  2687125563U,	// <1,5,2,u>: Cost 3 vsldoi8 <0,4,1,5>, <2,u,0,1>
+  2687125654U,	// <1,5,3,0>: Cost 3 vsldoi8 <0,4,1,5>, <3,0,1,2>
+  2312990234U,	// <1,5,3,1>: Cost 3 vmrglw <5,2,1,3>, <4,u,5,1>
+  3760867649U,	// <1,5,3,2>: Cost 4 vsldoi8 <0,4,1,5>, <3,2,2,2>
+  2687125916U,	// <1,5,3,3>: Cost 3 vsldoi8 <0,4,1,5>, <3,3,3,3>
+  2687126018U,	// <1,5,3,4>: Cost 3 vsldoi8 <0,4,1,5>, <3,4,5,6>
+  3386731738U,	// <1,5,3,5>: Cost 4 vmrglw <5,2,1,3>, <4,4,5,5>
+  3356871170U,	// <1,5,3,6>: Cost 4 vmrglw <0,2,1,3>, <3,4,5,6>
+  3808643779U,	// <1,5,3,7>: Cost 4 vsldoi8 <u,4,1,5>, <3,7,0,1>
+  2687126302U,	// <1,5,3,u>: Cost 3 vsldoi8 <0,4,1,5>, <3,u,1,2>
+  2642198630U,	// <1,5,4,0>: Cost 3 vsldoi4 <4,1,5,4>, LHS
+  2687126498U,	// <1,5,4,1>: Cost 3 vsldoi8 <0,4,1,5>, <4,1,5,0>
+  3715941923U,	// <1,5,4,2>: Cost 4 vsldoi4 <4,1,5,4>, <2,1,3,5>
+  3709970701U,	// <1,5,4,3>: Cost 4 vsldoi4 <3,1,5,4>, <3,1,5,4>
+  2687126736U,	// <1,5,4,4>: Cost 3 vsldoi8 <0,4,1,5>, <4,4,4,4>
+  1613385014U,	// <1,5,4,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+  2283801090U,	// <1,5,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6>
+  3733861489U,	// <1,5,4,7>: Cost 4 vsldoi4 <7,1,5,4>, <7,1,5,4>
+  1613385257U,	// <1,5,4,u>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+  2624290918U,	// <1,5,5,0>: Cost 3 vsldoi4 <1,1,5,5>, LHS
+  2624291676U,	// <1,5,5,1>: Cost 3 vsldoi4 <1,1,5,5>, <1,1,5,5>
+  3698034211U,	// <1,5,5,2>: Cost 4 vsldoi4 <1,1,5,5>, <2,1,3,5>
+  2284471211U,	// <1,5,5,3>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,3>
+  2624294198U,	// <1,5,5,4>: Cost 3 vsldoi4 <1,1,5,5>, RHS
+  2284471132U,	// <1,5,5,5>: Cost 3 vmrglw <0,4,1,5>, <1,1,5,5>
+  2284472834U,	// <1,5,5,6>: Cost 3 vmrglw <0,4,1,5>, <3,4,5,6>
+  2284471539U,	// <1,5,5,7>: Cost 3 vmrglw <0,4,1,5>, <1,6,5,7>
+  2284471216U,	// <1,5,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,u>
+  2785316900U,	// <1,5,6,0>: Cost 3 vsldoi12 <5,6,0,1>, <5,6,0,1>
+  2781040691U,	// <1,5,6,1>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,1,7>
+  2734903802U,	// <1,5,6,2>: Cost 3 vsldoi8 <u,4,1,5>, <6,2,7,3>
+  3848736834U,	// <1,5,6,3>: Cost 4 vsldoi12 <3,u,4,1>, <5,6,3,4>
+  3298717620U,	// <1,5,6,4>: Cost 4 vmrghw <1,6,5,7>, <5,4,5,6>
+  3298717700U,	// <1,5,6,5>: Cost 4 vmrghw <1,6,5,7>, <5,5,5,5>
+  2734904120U,	// <1,5,6,6>: Cost 3 vsldoi8 <u,4,1,5>, <6,6,6,6>
+  2781040738U,	// <1,5,6,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,7,0>
+  2781040747U,	// <1,5,6,u>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,u,0>
+  2734904314U,	// <1,5,7,0>: Cost 3 vsldoi8 <u,4,1,5>, <7,0,1,2>
+  2315677210U,	// <1,5,7,1>: Cost 3 vmrglw <5,6,1,7>, <4,u,5,1>
+  3808646292U,	// <1,5,7,2>: Cost 4 vsldoi8 <u,4,1,5>, <7,2,0,3>
+  3808646371U,	// <1,5,7,3>: Cost 4 vsldoi8 <u,4,1,5>, <7,3,0,1>
+  2734904678U,	// <1,5,7,4>: Cost 3 vsldoi8 <u,4,1,5>, <7,4,5,6>
+  3389418714U,	// <1,5,7,5>: Cost 4 vmrglw <5,6,1,7>, <4,4,5,5>
+  3365528656U,	// <1,5,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,4,5,6>
+  2734904940U,	// <1,5,7,7>: Cost 3 vsldoi8 <u,4,1,5>, <7,7,7,7>
+  2734904962U,	// <1,5,7,u>: Cost 3 vsldoi8 <u,4,1,5>, <7,u,1,2>
+  2687129299U,	// <1,5,u,0>: Cost 3 vsldoi8 <0,4,1,5>, <u,0,1,2>
+  1613387566U,	// <1,5,u,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  2687129480U,	// <1,5,u,2>: Cost 3 vsldoi8 <0,4,1,5>, <u,2,3,3>
+  2687129532U,	// <1,5,u,3>: Cost 3 vsldoi8 <0,4,1,5>, <u,3,0,1>
+  1661163546U,	// <1,5,u,4>: Cost 2 vsldoi8 <u,4,1,5>, <u,4,1,5>
+  1613387930U,	// <1,5,u,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+  2687129808U,	// <1,5,u,6>: Cost 3 vsldoi8 <0,4,1,5>, <u,6,3,7>
+  2781040900U,	// <1,5,u,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,u,7,0>
+  1613388133U,	// <1,5,u,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  3759546368U,	// <1,6,0,0>: Cost 4 vsldoi8 <0,2,1,6>, <0,0,0,0>
+  2685804646U,	// <1,6,0,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+  2685804721U,	// <1,6,0,2>: Cost 3 vsldoi8 <0,2,1,6>, <0,2,1,6>
+  3861270834U,	// <1,6,0,3>: Cost 4 vsldoi12 <6,0,3,1>, <6,0,3,1>
+  3759546706U,	// <1,6,0,4>: Cost 4 vsldoi8 <0,2,1,6>, <0,4,1,5>
+  2687795620U,	// <1,6,0,5>: Cost 3 vsldoi8 <0,5,1,6>, <0,5,1,6>
+  2688459253U,	// <1,6,0,6>: Cost 3 vsldoi8 <0,6,1,6>, <0,6,1,6>
+  2283769142U,	// <1,6,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS
+  2685805213U,	// <1,6,0,u>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+  3698073702U,	// <1,6,1,0>: Cost 4 vsldoi4 <1,1,6,1>, LHS
+  3759547188U,	// <1,6,1,1>: Cost 4 vsldoi8 <0,2,1,6>, <1,1,1,1>
+  2221314554U,	// <1,6,1,2>: Cost 3 vmrghw <1,1,1,1>, <6,2,7,3>
+  3759547401U,	// <1,6,1,3>: Cost 4 vsldoi8 <0,2,1,6>, <1,3,6,7>
+  3698076982U,	// <1,6,1,4>: Cost 4 vsldoi4 <1,1,6,1>, RHS
+  3767510141U,	// <1,6,1,5>: Cost 4 vsldoi8 <1,5,1,6>, <1,5,1,6>
+  2334872376U,	// <1,6,1,6>: Cost 3 vmrglw <u,u,1,1>, <6,6,6,6>
+  1213353270U,	// <1,6,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS
+  1213353271U,	// <1,6,1,u>: Cost 2 vmrglw <0,u,1,1>, RHS
+  3704053862U,	// <1,6,2,0>: Cost 4 vsldoi4 <2,1,6,2>, LHS
+  3759547961U,	// <1,6,2,1>: Cost 4 vsldoi8 <0,2,1,6>, <2,1,6,0>
+  2222117370U,	// <1,6,2,2>: Cost 3 vmrghw <1,2,3,0>, <6,2,7,3>
+  3759548070U,	// <1,6,2,3>: Cost 4 vsldoi8 <0,2,1,6>, <2,3,0,1>
+  3704057142U,	// <1,6,2,4>: Cost 4 vsldoi4 <2,1,6,2>, RHS
+  3373451057U,	// <1,6,2,5>: Cost 4 vmrglw <3,0,1,2>, <2,4,6,5>
+  2685806522U,	// <1,6,2,6>: Cost 3 vsldoi8 <0,2,1,6>, <2,6,3,7>
+  1225968950U,	// <1,6,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS
+  1225968951U,	// <1,6,2,u>: Cost 2 vmrglw <3,0,1,2>, RHS
+  3759548566U,	// <1,6,3,0>: Cost 4 vsldoi8 <0,2,1,6>, <3,0,1,2>
+  3842912793U,	// <1,6,3,1>: Cost 4 vsldoi12 <2,u,6,1>, <6,3,1,7>
+  3759548774U,	// <1,6,3,2>: Cost 4 vsldoi8 <0,2,1,6>, <3,2,6,3>
+  3759548828U,	// <1,6,3,3>: Cost 4 vsldoi8 <0,2,1,6>, <3,3,3,3>
+  3759548930U,	// <1,6,3,4>: Cost 4 vsldoi8 <0,2,1,6>, <3,4,5,6>
+  3809315421U,	// <1,6,3,5>: Cost 4 vsldoi8 <u,5,1,6>, <3,5,6,7>
+  3386733368U,	// <1,6,3,6>: Cost 4 vmrglw <5,2,1,3>, <6,6,6,6>
+  2283130166U,	// <1,6,3,7>: Cost 3 vmrglw <0,2,1,3>, RHS
+  2283130167U,	// <1,6,3,u>: Cost 3 vmrglw <0,2,1,3>, RHS
+  3704070246U,	// <1,6,4,0>: Cost 4 vsldoi4 <2,1,6,4>, LHS
+  3862229608U,	// <1,6,4,1>: Cost 4 vsldoi12 <6,1,7,1>, <6,4,1,5>
+  3704071741U,	// <1,6,4,2>: Cost 4 vsldoi4 <2,1,6,4>, <2,1,6,4>
+  3721988610U,	// <1,6,4,3>: Cost 4 vsldoi4 <5,1,6,4>, <3,4,5,6>
+  3704073526U,	// <1,6,4,4>: Cost 4 vsldoi4 <2,1,6,4>, RHS
+  2685807926U,	// <1,6,4,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+  3865621141U,	// <1,6,4,6>: Cost 4 vsldoi12 <6,6,u,1>, <6,4,6,5>
+  2283801910U,	// <1,6,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS
+  2685808169U,	// <1,6,4,u>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+  3710050406U,	// <1,6,5,0>: Cost 4 vsldoi4 <3,1,6,5>, LHS
+  3710051571U,	// <1,6,5,1>: Cost 4 vsldoi4 <3,1,6,5>, <1,6,5,7>
+  3405989597U,	// <1,6,5,2>: Cost 4 vmrglw <u,4,1,5>, <2,3,6,2>
+  3358214502U,	// <1,6,5,3>: Cost 4 vmrglw <0,4,1,5>, <3,2,6,3>
+  3710053686U,	// <1,6,5,4>: Cost 4 vsldoi4 <3,1,6,5>, RHS
+  3721998025U,	// <1,6,5,5>: Cost 4 vsldoi4 <5,1,6,5>, <5,1,6,5>
+  2332250936U,	// <1,6,5,6>: Cost 3 vmrglw <u,4,1,5>, <6,6,6,6>
+  1210731830U,	// <1,6,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS
+  1210731831U,	// <1,6,5,u>: Cost 2 vmrglw <0,4,1,5>, RHS
+  2791289597U,	// <1,6,6,0>: Cost 3 vsldoi12 <6,6,0,1>, <6,6,0,1>
+  3698115430U,	// <1,6,6,1>: Cost 4 vsldoi4 <1,1,6,6>, <1,1,6,6>
+  3698116538U,	// <1,6,6,2>: Cost 4 vsldoi4 <1,1,6,6>, <2,6,3,7>
+  3356894132U,	// <1,6,6,3>: Cost 4 vmrglw <0,2,1,6>, <1,2,6,3>
+  3698117942U,	// <1,6,6,4>: Cost 4 vsldoi4 <1,1,6,6>, RHS
+  3722006218U,	// <1,6,6,5>: Cost 4 vsldoi4 <5,1,6,6>, <5,1,6,6>
+  2781041464U,	// <1,6,6,6>: Cost 3 vsldoi12 <4,u,5,1>, <6,6,6,6>
+  2283154742U,	// <1,6,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS
+  2283154743U,	// <1,6,6,u>: Cost 3 vmrglw <0,2,1,6>, RHS
+  1718211406U,	// <1,6,7,0>: Cost 2 vsldoi12 <6,7,0,1>, <6,7,0,1>
+  2792026967U,	// <1,6,7,1>: Cost 3 vsldoi12 <6,7,1,1>, <6,7,1,1>
+  2765411170U,	// <1,6,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <6,7,2,3>
+  3854783336U,	// <1,6,7,3>: Cost 4 vsldoi12 <4,u,5,1>, <6,7,3,0>
+  2781041526U,	// <1,6,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,7,4,5>
+  3365528664U,	// <1,6,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,4,6,5>
+  2791953290U,	// <1,6,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <6,7,6,7>
+  2291789110U,	// <1,6,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS
+  1718801302U,	// <1,6,7,u>: Cost 2 vsldoi12 <6,7,u,1>, <6,7,u,1>
+  1718875039U,	// <1,6,u,0>: Cost 2 vsldoi12 <6,u,0,1>, <6,u,0,1>
+  2685810478U,	// <1,6,u,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+  2792764337U,	// <1,6,u,2>: Cost 3 vsldoi12 <6,u,2,1>, <6,u,2,1>
+  3759552444U,	// <1,6,u,3>: Cost 4 vsldoi8 <0,2,1,6>, <u,3,0,1>
+  2781041607U,	// <1,6,u,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,u,4,5>
+  2685810842U,	// <1,6,u,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+  2689792208U,	// <1,6,u,6>: Cost 3 vsldoi8 <0,u,1,6>, <u,6,3,7>
+  1210756406U,	// <1,6,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS
+  1210756407U,	// <1,6,u,u>: Cost 2 vmrglw <0,4,1,u>, RHS
+  2793280496U,	// <1,7,0,0>: Cost 3 vsldoi12 <7,0,0,1>, <7,0,0,1>
+  2694439014U,	// <1,7,0,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+  3393343912U,	// <1,7,0,2>: Cost 4 vmrglw <6,3,1,0>, <6,1,7,2>
+  3397325306U,	// <1,7,0,3>: Cost 4 vmrglw <7,0,1,0>, <6,2,7,3>
+  2793575444U,	// <1,7,0,4>: Cost 3 vsldoi12 <7,0,4,1>, <7,0,4,1>
+  3722030797U,	// <1,7,0,5>: Cost 4 vsldoi4 <5,1,7,0>, <5,1,7,0>
+  2688467446U,	// <1,7,0,6>: Cost 3 vsldoi8 <0,6,1,7>, <0,6,1,7>
+  2689131079U,	// <1,7,0,7>: Cost 3 vsldoi8 <0,7,1,7>, <0,7,1,7>
+  2694439570U,	// <1,7,0,u>: Cost 3 vsldoi8 <1,6,1,7>, <0,u,1,1>
+  2654265354U,	// <1,7,1,0>: Cost 3 vsldoi4 <6,1,7,1>, <0,0,1,1>
+  2794017866U,	// <1,7,1,1>: Cost 3 vsldoi12 <7,1,1,1>, <7,1,1,1>
+  3768181639U,	// <1,7,1,2>: Cost 4 vsldoi8 <1,6,1,7>, <1,2,1,3>
+  2334872058U,	// <1,7,1,3>: Cost 3 vmrglw <u,u,1,1>, <6,2,7,3>
+  2654268726U,	// <1,7,1,4>: Cost 3 vsldoi4 <6,1,7,1>, RHS
+  3792069797U,	// <1,7,1,5>: Cost 4 vsldoi8 <5,6,1,7>, <1,5,6,1>
+  2694440143U,	// <1,7,1,6>: Cost 3 vsldoi8 <1,6,1,7>, <1,6,1,7>
+  2334872386U,	// <1,7,1,7>: Cost 3 vmrglw <u,u,1,1>, <6,6,7,7>
+  2695767409U,	// <1,7,1,u>: Cost 3 vsldoi8 <1,u,1,7>, <1,u,1,7>
+  2654273638U,	// <1,7,2,0>: Cost 3 vsldoi4 <6,1,7,2>, LHS
+  2222117973U,	// <1,7,2,1>: Cost 3 vmrghw <1,2,3,0>, <7,1,2,3>
+  2299711912U,	// <1,7,2,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2>
+  2654275734U,	// <1,7,2,3>: Cost 3 vsldoi4 <6,1,7,2>, <3,0,1,2>
+  2654276918U,	// <1,7,2,4>: Cost 3 vsldoi4 <6,1,7,2>, RHS
+  3385397675U,	// <1,7,2,5>: Cost 4 vmrglw <5,0,1,2>, <6,1,7,5>
+  2654278056U,	// <1,7,2,6>: Cost 3 vsldoi4 <6,1,7,2>, <6,1,7,2>
+  2323599627U,	// <1,7,2,7>: Cost 3 vmrglw <7,0,1,2>, <5,u,7,7>
+  2654279470U,	// <1,7,2,u>: Cost 3 vsldoi4 <6,1,7,2>, LHS
+  2795271395U,	// <1,7,3,0>: Cost 3 vsldoi12 <7,3,0,1>, <7,3,0,1>
+  3768183059U,	// <1,7,3,1>: Cost 4 vsldoi8 <1,6,1,7>, <3,1,6,1>
+  3728025254U,	// <1,7,3,2>: Cost 4 vsldoi4 <6,1,7,3>, <2,3,0,1>
+  3768183196U,	// <1,7,3,3>: Cost 4 vsldoi8 <1,6,1,7>, <3,3,3,3>
+  3768183298U,	// <1,7,3,4>: Cost 4 vsldoi8 <1,6,1,7>, <3,4,5,6>
+  3792071255U,	// <1,7,3,5>: Cost 4 vsldoi8 <5,6,1,7>, <3,5,6,1>
+  3780127361U,	// <1,7,3,6>: Cost 4 vsldoi8 <3,6,1,7>, <3,6,1,7>
+  3847779617U,	// <1,7,3,7>: Cost 4 vsldoi12 <3,7,0,1>, <7,3,7,0>
+  2795861291U,	// <1,7,3,u>: Cost 3 vsldoi12 <7,3,u,1>, <7,3,u,1>
+  2795935028U,	// <1,7,4,0>: Cost 3 vsldoi12 <7,4,0,1>, <7,4,0,1>
+  3728032975U,	// <1,7,4,1>: Cost 4 vsldoi4 <6,1,7,4>, <1,6,1,7>
+  3839153480U,	// <1,7,4,2>: Cost 4 vsldoi12 <2,3,0,1>, <7,4,2,3>
+  3397358074U,	// <1,7,4,3>: Cost 4 vmrglw <7,0,1,4>, <6,2,7,3>
+  3854783835U,	// <1,7,4,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,4,4,4>
+  2694442294U,	// <1,7,4,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+  3786100058U,	// <1,7,4,6>: Cost 4 vsldoi8 <4,6,1,7>, <4,6,1,7>
+  3722065254U,	// <1,7,4,7>: Cost 4 vsldoi4 <5,1,7,4>, <7,4,5,6>
+  2694442537U,	// <1,7,4,u>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+  2654298214U,	// <1,7,5,0>: Cost 3 vsldoi4 <6,1,7,5>, LHS
+  3854783893U,	// <1,7,5,1>: Cost 4 vsldoi12 <4,u,5,1>, <7,5,1,u>
+  3710126010U,	// <1,7,5,2>: Cost 4 vsldoi4 <3,1,7,5>, <2,6,3,7>
+  2332250618U,	// <1,7,5,3>: Cost 3 vmrglw <u,4,1,5>, <6,2,7,3>
+  2654301494U,	// <1,7,5,4>: Cost 3 vsldoi4 <6,1,7,5>, RHS
+  2284474795U,	// <1,7,5,5>: Cost 3 vmrglw <0,4,1,5>, <6,1,7,5>
+  2718330931U,	// <1,7,5,6>: Cost 3 vsldoi8 <5,6,1,7>, <5,6,1,7>
+  2332250946U,	// <1,7,5,7>: Cost 3 vmrglw <u,4,1,5>, <6,6,7,7>
+  2719658197U,	// <1,7,5,u>: Cost 3 vsldoi8 <5,u,1,7>, <5,u,1,7>
+  2332921954U,	// <1,7,6,0>: Cost 3 vmrglw <u,5,1,6>, <5,6,7,0>
+  3768185254U,	// <1,7,6,1>: Cost 4 vsldoi8 <1,6,1,7>, <6,1,7,0>
+  3710134202U,	// <1,7,6,2>: Cost 4 vsldoi4 <3,1,7,6>, <2,6,3,7>
+  3710134561U,	// <1,7,6,3>: Cost 4 vsldoi4 <3,1,7,6>, <3,1,7,6>
+  3710135606U,	// <1,7,6,4>: Cost 4 vsldoi4 <3,1,7,6>, RHS
+  3864884745U,	// <1,7,6,5>: Cost 4 vsldoi12 <6,5,7,1>, <7,6,5,7>
+  3854784017U,	// <1,7,6,6>: Cost 4 vsldoi12 <4,u,5,1>, <7,6,6,6>
+  2791953940U,	// <1,7,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <7,6,7,0>
+  2792617501U,	// <1,7,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <7,6,u,0>
+  2797925927U,	// <1,7,7,0>: Cost 3 vsldoi12 <7,7,0,1>, <7,7,0,1>
+  3365528426U,	// <1,7,7,1>: Cost 4 vmrglw <1,6,1,7>, <1,1,7,1>
+  3728058022U,	// <1,7,7,2>: Cost 4 vsldoi4 <6,1,7,7>, <2,3,0,1>
+  3365528509U,	// <1,7,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,7,3>
+  3854784079U,	// <1,7,7,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,7,4,5>
+  3722088148U,	// <1,7,7,5>: Cost 4 vsldoi4 <5,1,7,7>, <5,1,7,7>
+  3728060845U,	// <1,7,7,6>: Cost 4 vsldoi4 <6,1,7,7>, <6,1,7,7>
+  2781042284U,	// <1,7,7,7>: Cost 3 vsldoi12 <4,u,5,1>, <7,7,7,7>
+  2798515823U,	// <1,7,7,u>: Cost 3 vsldoi12 <7,7,u,1>, <7,7,u,1>
+  2654322705U,	// <1,7,u,0>: Cost 3 vsldoi4 <6,1,7,u>, <0,0,1,u>
+  2694444846U,	// <1,7,u,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+  2299711912U,	// <1,7,u,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2>
+  2323649018U,	// <1,7,u,3>: Cost 3 vmrglw <7,0,1,u>, <6,2,7,3>
+  2654326070U,	// <1,7,u,4>: Cost 3 vsldoi4 <6,1,7,u>, RHS
+  2694445210U,	// <1,7,u,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+  2654327214U,	// <1,7,u,6>: Cost 3 vsldoi4 <6,1,7,u>, <6,1,7,u>
+  2323649346U,	// <1,7,u,7>: Cost 3 vmrglw <7,0,1,u>, <6,6,7,7>
+  2694445413U,	// <1,7,u,u>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+  1610752017U,	// <1,u,0,0>: Cost 2 vsldoi8 <0,0,1,u>, <0,0,1,u>
+  1613406310U,	// <1,u,0,1>: Cost 2 vsldoi8 <0,4,1,u>, LHS
+  2685821107U,	// <1,u,0,2>: Cost 3 vsldoi8 <0,2,1,u>, <0,2,1,u>
+  2283765916U,	// <1,u,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS
+  1613406549U,	// <1,u,0,4>: Cost 2 vsldoi8 <0,4,1,u>, <0,4,1,u>
+  1725880054U,	// <1,u,0,5>: Cost 2 vsldoi12 <u,0,5,1>, <u,0,5,1>
+  2688475639U,	// <1,u,0,6>: Cost 3 vsldoi8 <0,6,1,u>, <0,6,1,u>
+  2283769160U,	// <1,u,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS
+  1613406877U,	// <1,u,0,u>: Cost 2 vsldoi8 <0,4,1,u>, LHS
+  1550221414U,	// <1,u,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+  269271142U,	// <1,u,1,1>: Cost 1 vspltisw1 LHS
+  1683117870U,	// <1,u,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  1213350044U,	// <1,u,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS
+  1550224694U,	// <1,u,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+  1147574426U,	// <1,u,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS
+  2687149326U,	// <1,u,1,6>: Cost 3 vsldoi8 <0,4,1,u>, <1,6,u,7>
+  1213353288U,	// <1,u,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS
+  269271142U,	// <1,u,1,u>: Cost 1 vspltisw1 LHS
+  2222118611U,	// <1,u,2,0>: Cost 3 vmrghw <1,2,3,0>, <u,0,1,2>
+  1148376878U,	// <1,u,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS
+  1148371862U,	// <1,u,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  1225965724U,	// <1,u,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS
+  2222118975U,	// <1,u,2,4>: Cost 3 vmrghw <1,2,3,0>, <u,4,5,6>
+  1148377242U,	// <1,u,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS
+  2687150010U,	// <1,u,2,6>: Cost 3 vsldoi8 <0,4,1,u>, <2,6,3,7>
+  1225968968U,	// <1,u,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS
+  1148377445U,	// <1,u,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS
+  471040156U,	// <1,u,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1544782644U,	// <1,u,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1544783464U,	// <1,u,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544784022U,	// <1,u,3,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+  471043382U,	// <1,u,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1592561668U,	// <1,u,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+  1592562170U,	// <1,u,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1592562682U,	// <1,u,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  471045934U,	// <1,u,3,u>: Cost 1 vsldoi4 LHS, LHS
+  2708384629U,	// <1,u,4,0>: Cost 3 vsldoi8 <4,0,1,u>, <4,0,1,u>
+  2687151101U,	// <1,u,4,1>: Cost 3 vsldoi8 <0,4,1,u>, <4,1,u,0>
+  2223408022U,	// <1,u,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0>
+  2283798684U,	// <1,u,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS
+  2642422785U,	// <1,u,4,4>: Cost 3 vsldoi4 <4,1,u,4>, <4,1,u,4>
+  1613409590U,	// <1,u,4,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+  2283801090U,	// <1,u,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6>
+  2283801928U,	// <1,u,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS
+  1613409833U,	// <1,u,4,u>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+  2284471235U,	// <1,u,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,0>
+  2284472046U,	// <1,u,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,u,1>
+  2284472533U,	// <1,u,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,u,2>
+  1210728604U,	// <1,u,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS
+  2284471239U,	// <1,u,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,4>
+  1210728786U,	// <1,u,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  1683118234U,	// <1,u,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  1210731848U,	// <1,u,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS
+  1210728609U,	// <1,u,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS
+  2720330023U,	// <1,u,6,0>: Cost 3 vsldoi8 <6,0,1,u>, <6,0,1,u>
+  2757376190U,	// <1,u,6,1>: Cost 3 vsldoi12 <0,u,u,1>, <u,6,1,7>
+  2726302202U,	// <1,u,6,2>: Cost 3 vsldoi8 <7,0,1,u>, <6,2,7,3>
+  2283151516U,	// <1,u,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS
+  2224972114U,	// <1,u,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5>
+  2224683162U,	// <1,u,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS
+  2726302520U,	// <1,u,6,6>: Cost 3 vsldoi8 <7,0,1,u>, <6,6,6,6>
+  2283154760U,	// <1,u,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS
+  2283151521U,	// <1,u,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS
+  1652560896U,	// <1,u,7,0>: Cost 2 vsldoi8 <7,0,1,u>, <7,0,1,u>
+  2333590225U,	// <1,u,7,1>: Cost 3 vmrglw <u,6,1,7>, <0,u,u,1>
+  2765412628U,	// <1,u,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <u,7,2,3>
+  2291785884U,	// <1,u,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS
+  2781042984U,	// <1,u,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <u,7,4,5>
+  3365527953U,	// <1,u,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,u,5>
+  2791954748U,	// <1,u,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <u,7,6,7>
+  2291789128U,	// <1,u,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS
+  1657869960U,	// <1,u,7,u>: Cost 2 vsldoi8 <7,u,1,u>, <7,u,1,u>
+  471081121U,	// <1,u,u,0>: Cost 1 vsldoi4 LHS, LHS
+  269271142U,	// <1,u,u,1>: Cost 1 vspltisw1 LHS
+  1544824424U,	// <1,u,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544824982U,	// <1,u,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+  471084342U,	// <1,u,u,4>: Cost 1 vsldoi4 LHS, RHS
+  1613412506U,	// <1,u,u,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+  1683118477U,	// <1,u,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  1210756424U,	// <1,u,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS
+  471086894U,	// <1,u,u,u>: Cost 1 vsldoi4 LHS, LHS
+  2226757632U,	// <2,0,0,0>: Cost 3 vmrghw <2,0,3,0>, <0,0,0,0>
+  2226757734U,	// <2,0,0,1>: Cost 3 vmrghw <2,0,3,0>, LHS
+  3826622483U,	// <2,0,0,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,0,2,1>
+  3843211292U,	// <2,0,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,0,3,1>
+  3300499794U,	// <2,0,0,4>: Cost 4 vmrghw <2,0,3,0>, <0,4,1,5>
+  3356256724U,	// <2,0,0,5>: Cost 4 vmrglw <0,1,2,0>, <3,4,0,5>
+  3825664056U,	// <2,0,0,6>: Cost 4 vsldoi12 <0,0,6,2>, <0,0,6,2>
+  3762889289U,	// <2,0,0,7>: Cost 4 vsldoi8 <0,7,2,0>, <0,7,2,0>
+  2226758301U,	// <2,0,0,u>: Cost 3 vmrghw <2,0,3,0>, LHS
+  2227429386U,	// <2,0,1,0>: Cost 3 vmrghw <2,1,3,1>, <0,0,1,1>
+  2227429478U,	// <2,0,1,1>: Cost 3 vmrghw <2,1,3,1>, LHS
+  1691156582U,	// <2,0,1,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+  2666358997U,	// <2,0,1,3>: Cost 3 vsldoi4 <u,2,0,1>, <3,0,u,2>
+  2227462482U,	// <2,0,1,4>: Cost 3 vmrghw <2,1,3,5>, <0,4,1,5>
+  3722186464U,	// <2,0,1,5>: Cost 4 vsldoi4 <5,2,0,1>, <5,2,0,1>
+  3867099278U,	// <2,0,1,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,1,6,7>
+  3366881912U,	// <2,0,1,7>: Cost 4 vmrglw <1,u,2,1>, <3,6,0,7>
+  1691156636U,	// <2,0,1,u>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+  2228027392U,	// <2,0,2,0>: Cost 3 vmrghw <2,2,2,2>, <0,0,0,0>
+  1154285670U,	// <2,0,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS
+  2228027565U,	// <2,0,2,2>: Cost 3 vmrghw <2,2,2,2>, <0,2,1,2>
+  3301769468U,	// <2,0,2,3>: Cost 4 vmrghw <2,2,2,2>, <0,3,1,0>
+  2228027730U,	// <2,0,2,4>: Cost 3 vmrghw <2,2,2,2>, <0,4,1,5>
+  3301769635U,	// <2,0,2,5>: Cost 4 vmrghw <2,2,2,2>, <0,5,1,5>
+  3780806586U,	// <2,0,2,6>: Cost 4 vsldoi8 <3,7,2,0>, <2,6,3,7>
+  3368880760U,	// <2,0,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,0,7>
+  1154286237U,	// <2,0,2,u>: Cost 2 vmrghw <2,2,2,2>, LHS
+  1213440000U,	// <2,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+  1213441702U,	// <2,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+  2228535470U,	// <2,0,3,2>: Cost 3 vmrghw <2,3,0,1>, <0,2,1,3>
+  2636515632U,	// <2,0,3,3>: Cost 3 vsldoi4 <3,2,0,3>, <3,2,0,3>
+  2287182962U,	// <2,0,3,4>: Cost 3 vmrglw LHS, <1,5,0,4>
+  2660405346U,	// <2,0,3,5>: Cost 3 vsldoi4 <7,2,0,3>, <5,6,7,0>
+  2228535798U,	// <2,0,3,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7>
+  2660406420U,	// <2,0,3,7>: Cost 3 vsldoi4 <7,2,0,3>, <7,2,0,3>
+  1213441709U,	// <2,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+  3368894464U,	// <2,0,4,0>: Cost 4 vmrglw <2,2,2,4>, <0,0,0,0>
+  2764898642U,	// <2,0,4,1>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,1,5>
+  3826622811U,	// <2,0,4,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,4,2,5>
+  3843211620U,	// <2,0,4,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,4,3,5>
+  3838640493U,	// <2,0,4,4>: Cost 4 vsldoi12 <2,2,2,2>, <0,4,4,5>
+  2732944694U,	// <2,0,4,5>: Cost 3 vsldoi8 <u,1,2,0>, RHS
+  3797396857U,	// <2,0,4,6>: Cost 4 vsldoi8 <6,5,2,0>, <4,6,5,2>
+  3867099528U,	// <2,0,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <0,4,7,5>
+  2764898705U,	// <2,0,4,u>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,u,5>
+  3364257792U,	// <2,0,5,0>: Cost 4 vmrglw <1,4,2,5>, <0,0,0,0>
+  2230124646U,	// <2,0,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS
+  3304235184U,	// <2,0,5,2>: Cost 4 vmrghw <2,5,u,6>, <0,2,1,5>
+  3364260144U,	// <2,0,5,3>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,3>
+  3303817554U,	// <2,0,5,4>: Cost 4 vmrghw <2,5,3,0>, <0,4,1,5>
+  3364260146U,	// <2,0,5,5>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,5>
+  3867099602U,	// <2,0,5,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,5,6,7>
+  3364260472U,	// <2,0,5,7>: Cost 4 vmrglw <1,4,2,5>, <3,6,0,7>
+  2230125213U,	// <2,0,5,u>: Cost 3 vmrghw <2,5,3,6>, LHS
+  2230796288U,	// <2,0,6,0>: Cost 3 vmrghw <2,6,3,7>, <0,0,0,0>
+  1157054566U,	// <2,0,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS
+  2230796465U,	// <2,0,6,2>: Cost 3 vmrghw <2,6,3,7>, <0,2,1,6>
+  3304538364U,	// <2,0,6,3>: Cost 4 vmrghw <2,6,3,7>, <0,3,1,0>
+  2230796626U,	// <2,0,6,4>: Cost 3 vmrghw <2,6,3,7>, <0,4,1,5>
+  3797398205U,	// <2,0,6,5>: Cost 4 vsldoi8 <6,5,2,0>, <6,5,2,0>
+  3304538614U,	// <2,0,6,6>: Cost 4 vmrghw <2,6,3,7>, <0,6,1,7>
+  3798725471U,	// <2,0,6,7>: Cost 4 vsldoi8 <6,7,2,0>, <6,7,2,0>
+  1157055133U,	// <2,0,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS
+  3371573248U,	// <2,0,7,0>: Cost 4 vmrglw <2,6,2,7>, <0,0,0,0>
+  2231189606U,	// <2,0,7,1>: Cost 3 vmrghw <2,7,0,1>, LHS
+  3801380003U,	// <2,0,7,2>: Cost 4 vsldoi8 <7,2,2,0>, <7,2,2,0>
+  3802043636U,	// <2,0,7,3>: Cost 4 vsldoi8 <7,3,2,0>, <7,3,2,0>
+  3806688614U,	// <2,0,7,4>: Cost 4 vsldoi8 <u,1,2,0>, <7,4,5,6>
+  3356317308U,	// <2,0,7,5>: Cost 4 vmrglw <0,1,2,7>, <7,u,0,5>
+  3804034535U,	// <2,0,7,6>: Cost 4 vsldoi8 <7,6,2,0>, <7,6,2,0>
+  3806688876U,	// <2,0,7,7>: Cost 4 vsldoi8 <u,1,2,0>, <7,7,7,7>
+  2231190173U,	// <2,0,7,u>: Cost 3 vmrghw <2,7,0,1>, LHS
+  1208836096U,	// <2,0,u,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+  1208837798U,	// <2,0,u,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+  1691157149U,	// <2,0,u,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+  2636556597U,	// <2,0,u,3>: Cost 3 vsldoi4 <3,2,0,u>, <3,2,0,u>
+  2282579625U,	// <2,0,u,4>: Cost 3 vmrglw LHS, <2,3,0,4>
+  2660446306U,	// <2,0,u,5>: Cost 3 vsldoi4 <7,2,0,u>, <5,6,7,0>
+  2228535798U,	// <2,0,u,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7>
+  2660447385U,	// <2,0,u,7>: Cost 3 vsldoi4 <7,2,0,u>, <7,2,0,u>
+  1208837805U,	// <2,0,u,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+  3692388523U,	// <2,1,0,0>: Cost 4 vsldoi4 <0,2,1,0>, <0,2,1,0>
+  2757526244U,	// <2,1,0,1>: Cost 3 vsldoi12 <1,0,1,2>, <1,0,1,2>
+  2330290974U,	// <2,1,0,2>: Cost 3 vmrglw <u,1,2,0>, <3,u,1,2>
+  3843212020U,	// <2,1,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <1,0,3,0>
+  3692391734U,	// <2,1,0,4>: Cost 4 vsldoi4 <0,2,1,0>, RHS
+  3300533362U,	// <2,1,0,5>: Cost 4 vmrghw <2,0,3,4>, <1,5,0,4>
+  3794084337U,	// <2,1,0,6>: Cost 4 vsldoi8 <6,0,2,1>, <0,6,1,2>
+  3374170614U,	// <2,1,0,7>: Cost 5 vmrglw <3,1,2,0>, <0,6,1,7>
+  2758042403U,	// <2,1,0,u>: Cost 3 vsldoi12 <1,0,u,2>, <1,0,u,2>
+  2690482924U,	// <2,1,1,0>: Cost 3 vsldoi8 <1,0,2,1>, <1,0,2,1>
+  2764899124U,	// <2,1,1,1>: Cost 3 vsldoi12 <2,2,2,2>, <1,1,1,1>
+  2695791510U,	// <2,1,1,2>: Cost 3 vsldoi8 <1,u,2,1>, <1,2,3,0>
+  3362235271U,	// <2,1,1,3>: Cost 4 vmrglw <1,1,2,1>, <1,2,1,3>
+  3692399926U,	// <2,1,1,4>: Cost 4 vsldoi4 <0,2,1,1>, RHS
+  3832226649U,	// <2,1,1,5>: Cost 4 vsldoi12 <1,1,5,2>, <1,1,5,2>
+  3301205235U,	// <2,1,1,6>: Cost 4 vmrghw <2,1,3,5>, <1,6,5,7>
+  3768870179U,	// <2,1,1,7>: Cost 4 vsldoi8 <1,7,2,1>, <1,7,2,1>
+  2695791988U,	// <2,1,1,u>: Cost 3 vsldoi8 <1,u,2,1>, <1,u,2,1>
+  2618663085U,	// <2,1,2,0>: Cost 3 vsldoi4 <0,2,1,2>, <0,2,1,2>
+  2228028212U,	// <2,1,2,1>: Cost 3 vmrghw <2,2,2,2>, <1,1,1,1>
+  2618664552U,	// <2,1,2,2>: Cost 3 vsldoi4 <0,2,1,2>, <2,2,2,2>
+  2759000984U,	// <2,1,2,3>: Cost 3 vsldoi12 <1,2,3,2>, <1,2,3,2>
+  2618666294U,	// <2,1,2,4>: Cost 3 vsldoi4 <0,2,1,2>, RHS
+  2295136594U,	// <2,1,2,5>: Cost 3 vmrglw <2,2,2,2>, <0,4,1,5>
+  3769534376U,	// <2,1,2,6>: Cost 4 vsldoi8 <1,u,2,1>, <2,6,1,7>
+  2793358266U,	// <2,1,2,7>: Cost 3 vsldoi12 <7,0,1,2>, <1,2,7,0>
+  2618668846U,	// <2,1,2,u>: Cost 3 vsldoi4 <0,2,1,2>, LHS
+  2282536969U,	// <2,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+  1208795146U,	// <2,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+  1213442198U,	// <2,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  2287181998U,	// <2,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+  2618674486U,	// <2,1,3,4>: Cost 3 vsldoi4 <0,2,1,3>, RHS
+  1208795474U,	// <2,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2287182001U,	// <2,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  2287183055U,	// <2,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+  1208795153U,	// <2,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+  3692421295U,	// <2,1,4,0>: Cost 4 vsldoi4 <0,2,1,4>, <0,2,1,4>
+  3838641195U,	// <2,1,4,1>: Cost 4 vsldoi12 <2,2,2,2>, <1,4,1,5>
+  2330323742U,	// <2,1,4,2>: Cost 3 vmrglw <u,1,2,4>, <3,u,1,2>
+  3692423318U,	// <2,1,4,3>: Cost 5 vsldoi4 <0,2,1,4>, <3,0,1,2>
+  3692424502U,	// <2,1,4,4>: Cost 4 vsldoi4 <0,2,1,4>, RHS
+  2695793974U,	// <2,1,4,5>: Cost 3 vsldoi8 <1,u,2,1>, RHS
+  3799395705U,	// <2,1,4,6>: Cost 4 vsldoi8 <6,u,2,1>, <4,6,5,2>
+  3368895695U,	// <2,1,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,1,7>
+  2695794217U,	// <2,1,4,u>: Cost 3 vsldoi8 <1,u,2,1>, RHS
+  3692429488U,	// <2,1,5,0>: Cost 4 vsldoi4 <0,2,1,5>, <0,2,1,5>
+  3364257802U,	// <2,1,5,1>: Cost 4 vmrglw <1,4,2,5>, <0,0,1,1>
+  3692431253U,	// <2,1,5,2>: Cost 4 vsldoi4 <0,2,1,5>, <2,5,u,6>
+  3692431874U,	// <2,1,5,3>: Cost 4 vsldoi4 <0,2,1,5>, <3,4,5,6>
+  3692432694U,	// <2,1,5,4>: Cost 4 vsldoi4 <0,2,1,5>, RHS
+  3364258130U,	// <2,1,5,5>: Cost 4 vmrglw <1,4,2,5>, <0,4,1,5>
+  3303875827U,	// <2,1,5,6>: Cost 4 vmrghw <2,5,3,7>, <1,6,5,7>
+  3867100333U,	// <2,1,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <1,5,7,0>
+  3692435246U,	// <2,1,5,u>: Cost 4 vsldoi4 <0,2,1,5>, LHS
+  2618695857U,	// <2,1,6,0>: Cost 3 vsldoi4 <0,2,1,6>, <0,2,1,6>
+  2230797108U,	// <2,1,6,1>: Cost 3 vmrghw <2,6,3,7>, <1,1,1,1>
+  2618697658U,	// <2,1,6,2>: Cost 3 vsldoi4 <0,2,1,6>, <2,6,3,7>
+  3692439702U,	// <2,1,6,3>: Cost 4 vsldoi4 <0,2,1,6>, <3,0,1,2>
+  2618699062U,	// <2,1,6,4>: Cost 3 vsldoi4 <0,2,1,6>, RHS
+  3364929874U,	// <2,1,6,5>: Cost 4 vmrglw <1,5,2,6>, <0,4,1,5>
+  3692442424U,	// <2,1,6,6>: Cost 4 vsldoi4 <0,2,1,6>, <6,6,6,6>
+  3798733664U,	// <2,1,6,7>: Cost 4 vsldoi8 <6,7,2,1>, <6,7,2,1>
+  2618701614U,	// <2,1,6,u>: Cost 3 vsldoi4 <0,2,1,6>, LHS
+  3799397370U,	// <2,1,7,0>: Cost 4 vsldoi8 <6,u,2,1>, <7,0,1,2>
+  3371573258U,	// <2,1,7,1>: Cost 4 vmrglw <2,6,2,7>, <0,0,1,1>
+  2330351234U,	// <2,1,7,2>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2>
+  3799397658U,	// <2,1,7,3>: Cost 4 vsldoi8 <6,u,2,1>, <7,3,6,2>
+  3799397734U,	// <2,1,7,4>: Cost 4 vsldoi8 <6,u,2,1>, <7,4,5,6>
+  3371573586U,	// <2,1,7,5>: Cost 4 vmrglw <2,6,2,7>, <0,4,1,5>
+  3799397870U,	// <2,1,7,6>: Cost 4 vsldoi8 <6,u,2,1>, <7,6,2,7>
+  3799397956U,	// <2,1,7,7>: Cost 4 vsldoi8 <6,u,2,1>, <7,7,3,3>
+  2330351234U,	// <2,1,7,u>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2>
+  2282577929U,	// <2,1,u,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+  1208836106U,	// <2,1,u,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+  1208838294U,	// <2,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  2282578094U,	// <2,1,u,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+  2282577933U,	// <2,1,u,4>: Cost 3 vmrglw LHS, <0,0,1,4>
+  1208836434U,	// <2,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2282578097U,	// <2,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  2287224015U,	// <2,1,u,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+  1208836113U,	// <2,1,u,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+  2226759117U,	// <2,2,0,0>: Cost 3 vmrghw <2,0,3,0>, <2,0,3,0>
+  1624047718U,	// <2,2,0,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+  2697789613U,	// <2,2,0,2>: Cost 3 vsldoi8 <2,2,2,2>, <0,2,1,2>
+  2226767526U,	// <2,2,0,3>: Cost 3 vmrghw <2,0,3,1>, <2,3,0,1>
+  2697789778U,	// <2,2,0,4>: Cost 3 vsldoi8 <2,2,2,2>, <0,4,1,5>
+  3300657000U,	// <2,2,0,5>: Cost 4 vmrghw <2,0,5,1>, <2,5,3,6>
+  2226988986U,	// <2,2,0,6>: Cost 3 vmrghw <2,0,6,1>, <2,6,3,7>
+  3734271139U,	// <2,2,0,7>: Cost 4 vsldoi4 <7,2,2,0>, <7,2,2,0>
+  1624048285U,	// <2,2,0,u>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+  3831268868U,	// <2,2,1,0>: Cost 4 vsldoi12 <1,0,1,2>, <2,1,0,1>
+  2293138804U,	// <2,2,1,1>: Cost 3 vmrglw <1,u,2,1>, <1,u,2,1>
+  2697790358U,	// <2,2,1,2>: Cost 3 vsldoi8 <2,2,2,2>, <1,2,3,0>
+  2293137510U,	// <2,2,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS
+  3771532331U,	// <2,2,1,4>: Cost 4 vsldoi8 <2,2,2,2>, <1,4,1,5>
+  3767551106U,	// <2,2,1,5>: Cost 4 vsldoi8 <1,5,2,2>, <1,5,2,2>
+  3301173178U,	// <2,2,1,6>: Cost 4 vmrghw <2,1,3,1>, <2,6,3,7>
+  3372853169U,	// <2,2,1,7>: Cost 4 vmrglw <2,u,2,1>, <2,6,2,7>
+  2293137515U,	// <2,2,1,u>: Cost 3 vmrglw <1,u,2,1>, LHS
+  1556938854U,	// <2,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  2295137733U,	// <2,2,2,1>: Cost 3 vmrglw <2,2,2,2>, <2,0,2,1>
+  336380006U,	// <2,2,2,2>: Cost 1 vspltisw2 LHS
+  1221394534U,	// <2,2,2,3>: Cost 2 vmrglw <2,2,2,2>, LHS
+  1556942134U,	// <2,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  2295138061U,	// <2,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5>
+  2228029370U,	// <2,2,2,6>: Cost 3 vmrghw <2,2,2,2>, <2,6,3,7>
+  2660545701U,	// <2,2,2,7>: Cost 3 vsldoi4 <7,2,2,2>, <7,2,2,2>
+  336380006U,	// <2,2,2,u>: Cost 1 vspltisw2 LHS
+  2697791638U,	// <2,2,3,0>: Cost 3 vsldoi8 <2,2,2,2>, <3,0,1,2>
+  2765489840U,	// <2,2,3,1>: Cost 3 vsldoi12 <2,3,1,2>, <2,3,1,2>
+  1213441640U,	// <2,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2>
+  135053414U,	// <2,2,3,3>: Cost 1 vmrglw LHS, LHS
+  2697792002U,	// <2,2,3,4>: Cost 3 vsldoi8 <2,2,2,2>, <3,4,5,6>
+  2330313780U,	// <2,2,3,5>: Cost 3 vmrglw LHS, <1,4,2,5>
+  2287183549U,	// <2,2,3,6>: Cost 3 vmrglw LHS, <2,3,2,6>
+  2660553894U,	// <2,2,3,7>: Cost 3 vsldoi4 <7,2,2,3>, <7,2,2,3>
+  135053419U,	// <2,2,3,u>: Cost 1 vmrglw LHS, LHS
+  2630697062U,	// <2,2,4,0>: Cost 3 vsldoi4 <2,2,2,4>, LHS
+  3771534282U,	// <2,2,4,1>: Cost 4 vsldoi8 <2,2,2,2>, <4,1,2,3>
+  2764900109U,	// <2,2,4,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,4,2,5>
+  2295152742U,	// <2,2,4,3>: Cost 3 vmrglw <2,2,2,4>, LHS
+  2295154282U,	// <2,2,4,4>: Cost 3 vmrglw <2,2,2,4>, <2,2,2,4>
+  1624050998U,	// <2,2,4,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+  2229675962U,	// <2,2,4,6>: Cost 3 vmrghw <2,4,6,5>, <2,6,3,7>
+  3368896433U,	// <2,2,4,7>: Cost 4 vmrglw <2,2,2,4>, <2,6,2,7>
+  1624051241U,	// <2,2,4,u>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+  3771534920U,	// <2,2,5,0>: Cost 4 vsldoi8 <2,2,2,2>, <5,0,1,2>
+  3364258540U,	// <2,2,5,1>: Cost 4 vmrglw <1,4,2,5>, <1,0,2,1>
+  2296489576U,	// <2,2,5,2>: Cost 3 vmrglw <2,4,2,5>, <2,2,2,2>
+  2290516070U,	// <2,2,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS
+  3771535284U,	// <2,2,5,4>: Cost 4 vsldoi8 <2,2,2,2>, <5,4,5,6>
+  2290517044U,	// <2,2,5,5>: Cost 3 vmrglw <1,4,2,5>, <1,4,2,5>
+  2697793634U,	// <2,2,5,6>: Cost 3 vsldoi8 <2,2,2,2>, <5,6,7,0>
+  3370231729U,	// <2,2,5,7>: Cost 4 vmrglw <2,4,2,5>, <2,6,2,7>
+  2290516075U,	// <2,2,5,u>: Cost 3 vmrglw <1,4,2,5>, LHS
+  2230797801U,	// <2,2,6,0>: Cost 3 vmrghw <2,6,3,7>, <2,0,6,1>
+  3304539679U,	// <2,2,6,1>: Cost 4 vmrghw <2,6,3,7>, <2,1,3,1>
+  2764900273U,	// <2,2,6,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,2,7>
+  2764900282U,	// <2,2,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,3,7>
+  2230798129U,	// <2,2,6,4>: Cost 3 vmrghw <2,6,3,7>, <2,4,6,5>
+  3304540008U,	// <2,2,6,5>: Cost 4 vmrghw <2,6,3,7>, <2,5,3,6>
+  1157056442U,	// <2,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2725000033U,	// <2,2,6,7>: Cost 3 vsldoi8 <6,7,2,2>, <6,7,2,2>
+  1157056442U,	// <2,2,6,u>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2793359338U,	// <2,2,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <2,7,0,1>
+  3371574725U,	// <2,2,7,1>: Cost 4 vmrglw <2,6,2,7>, <2,0,2,1>
+  2297833064U,	// <2,2,7,2>: Cost 3 vmrglw <2,6,2,7>, <2,2,2,2>
+  2297831526U,	// <2,2,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS
+  2697794918U,	// <2,2,7,4>: Cost 3 vsldoi8 <2,2,2,2>, <7,4,5,6>
+  3371575053U,	// <2,2,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,2,5>
+  3304933297U,	// <2,2,7,6>: Cost 4 vmrghw <2,7,0,1>, <2,6,2,7>
+  2297833393U,	// <2,2,7,7>: Cost 3 vmrglw <2,6,2,7>, <2,6,2,7>
+  2297831531U,	// <2,2,7,u>: Cost 3 vmrglw <2,6,2,7>, LHS
+  1556938854U,	// <2,2,u,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  1624053550U,	// <2,2,u,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+  336380006U,	// <2,2,u,2>: Cost 1 vspltisw2 LHS
+  135094374U,	// <2,2,u,3>: Cost 1 vmrglw LHS, LHS
+  1556942134U,	// <2,2,u,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  1624053914U,	// <2,2,u,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+  1157056442U,	// <2,2,u,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2660594859U,	// <2,2,u,7>: Cost 3 vsldoi4 <7,2,2,u>, <7,2,2,u>
+  135094379U,	// <2,2,u,u>: Cost 1 vmrglw LHS, LHS
+  1611448320U,	// <2,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+  537706598U,	// <2,3,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2689835181U,	// <2,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  2689835260U,	// <2,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+  1611448658U,	// <2,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  2732966354U,	// <2,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7>
+  2732966390U,	// <2,3,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7>
+  2660603052U,	// <2,3,0,7>: Cost 3 vsldoi4 <7,2,3,0>, <7,2,3,0>
+  537707165U,	// <2,3,0,u>: Cost 1 vsldoi8 LHS, LHS
+  2689835748U,	// <2,3,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2>
+  1611449140U,	// <2,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+  1611449238U,	// <2,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+  3763577805U,	// <2,3,1,3>: Cost 4 vsldoi8 LHS, <1,3,0,1>
+  2689836112U,	// <2,3,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6>
+  2689836143U,	// <2,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+  2689836239U,	// <2,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+  3366881210U,	// <2,3,1,7>: Cost 4 vmrglw <1,u,2,1>, <2,6,3,7>
+  1616094588U,	// <2,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+  2689836493U,	// <2,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,3,0>
+  2685191711U,	// <2,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1>
+  1611449960U,	// <2,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2>
+  1611450022U,	// <2,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+  2689836822U,	// <2,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,3,5>
+  2689836904U,	// <2,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,3,6>
+  1611450298U,	// <2,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  2295138234U,	// <2,3,2,7>: Cost 3 vmrglw <2,2,2,2>, <2,6,3,7>
+  1611450456U,	// <2,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,3,3>
+  1213440918U,	// <2,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+  2282538527U,	// <2,3,3,1>: Cost 3 vmrglw LHS, <2,1,3,1>
+  1557022322U,	// <2,3,3,2>: Cost 2 vsldoi4 <2,2,3,3>, <2,2,3,3>
+  1208796786U,	// <2,3,3,3>: Cost 2 vmrglw LHS, <2,2,3,3>
+  1213440922U,	// <2,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+  2282538531U,	// <2,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5>
+  2287188094U,	// <2,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6>
+  1213441978U,	// <2,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  1208796791U,	// <2,3,3,u>: Cost 2 vmrglw LHS, <2,2,3,u>
+  1551056998U,	// <2,3,4,0>: Cost 2 vsldoi4 <1,2,3,4>, LHS
+  1551057818U,	// <2,3,4,1>: Cost 2 vsldoi4 <1,2,3,4>, <1,2,3,4>
+  2624800360U,	// <2,3,4,2>: Cost 3 vsldoi4 <1,2,3,4>, <2,2,2,2>
+  2624800918U,	// <2,3,4,3>: Cost 3 vsldoi4 <1,2,3,4>, <3,0,1,2>
+  1551060278U,	// <2,3,4,4>: Cost 2 vsldoi4 <1,2,3,4>, RHS
+  537709878U,	// <2,3,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2732969337U,	// <2,3,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2>
+  2660635824U,	// <2,3,4,7>: Cost 3 vsldoi4 <7,2,3,4>, <7,2,3,4>
+  537710121U,	// <2,3,4,u>: Cost 1 vsldoi8 LHS, RHS
+  2689838664U,	// <2,3,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2>
+  2732969615U,	// <2,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1>
+  2732969707U,	// <2,3,5,2>: Cost 3 vsldoi8 LHS, <5,2,1,3>
+  3763580721U,	// <2,3,5,3>: Cost 4 vsldoi8 LHS, <5,3,0,1>
+  2689839028U,	// <2,3,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6>
+  1659228164U,	// <2,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+  1659228258U,	// <2,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0>
+  3364259770U,	// <2,3,5,7>: Cost 4 vmrglw <1,4,2,5>, <2,6,3,7>
+  1659228420U,	// <2,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0>
+  2230798486U,	// <2,3,6,0>: Cost 3 vmrghw <2,6,3,7>, <3,0,1,2>
+  2732970407U,	// <2,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1>
+  1659228666U,	// <2,3,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3>
+  2230798748U,	// <2,3,6,3>: Cost 3 vmrghw <2,6,3,7>, <3,3,3,3>
+  2230798850U,	// <2,3,6,4>: Cost 3 vmrghw <2,6,3,7>, <3,4,5,6>
+  2732970731U,	// <2,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1>
+  1659228984U,	// <2,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+  1659229006U,	// <2,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+  1659229087U,	// <2,3,6,u>: Cost 2 vsldoi8 LHS, <6,u,0,1>
+  1659229178U,	// <2,3,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2>
+  2726999125U,	// <2,3,7,1>: Cost 3 vsldoi8 <7,1,2,3>, <7,1,2,3>
+  2727662758U,	// <2,3,7,2>: Cost 3 vsldoi8 <7,2,2,3>, <7,2,2,3>
+  2732971235U,	// <2,3,7,3>: Cost 3 vsldoi8 LHS, <7,3,0,1>
+  1659229542U,	// <2,3,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6>
+  2732971446U,	// <2,3,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5>
+  2732971484U,	// <2,3,7,6>: Cost 3 vsldoi8 LHS, <7,6,0,7>
+  1659229804U,	// <2,3,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7>
+  1659229826U,	// <2,3,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2>
+  1208837014U,	// <2,3,u,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+  537712430U,	// <2,3,u,1>: Cost 1 vsldoi8 LHS, LHS
+  1616099205U,	// <2,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,0>
+  1208837746U,	// <2,3,u,3>: Cost 2 vmrglw LHS, <2,2,3,3>
+  1208837018U,	// <2,3,u,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+  537712794U,	// <2,3,u,5>: Cost 1 vsldoi8 LHS, RHS
+  1616099536U,	// <2,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7>
+  1208838074U,	// <2,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  537712997U,	// <2,3,u,u>: Cost 1 vsldoi8 LHS, LHS
+  3771547648U,	// <2,4,0,0>: Cost 4 vsldoi8 <2,2,2,4>, <0,0,0,0>
+  2697805926U,	// <2,4,0,1>: Cost 3 vsldoi8 <2,2,2,4>, LHS
+  3770884269U,	// <2,4,0,2>: Cost 4 vsldoi8 <2,1,2,4>, <0,2,1,2>
+  3806716164U,	// <2,4,0,3>: Cost 4 vsldoi8 <u,1,2,4>, <0,3,1,u>
+  3771547986U,	// <2,4,0,4>: Cost 4 vsldoi8 <2,2,2,4>, <0,4,1,5>
+  2226761014U,	// <2,4,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS
+  3853462427U,	// <2,4,0,6>: Cost 4 vsldoi12 <4,6,5,2>, <4,0,6,1>
+  3867102116U,	// <2,4,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,0,7,1>
+  2226761257U,	// <2,4,0,u>: Cost 3 vmrghw <2,0,3,0>, RHS
+  3849186231U,	// <2,4,1,0>: Cost 4 vsldoi12 <4,0,1,2>, <4,1,0,2>
+  3301207010U,	// <2,4,1,1>: Cost 4 vmrghw <2,1,3,5>, <4,1,5,0>
+  3766240150U,	// <2,4,1,2>: Cost 4 vsldoi8 <1,3,2,4>, <1,2,3,0>
+  3766240226U,	// <2,4,1,3>: Cost 4 vsldoi8 <1,3,2,4>, <1,3,2,4>
+  3301207248U,	// <2,4,1,4>: Cost 4 vmrghw <2,1,3,5>, <4,4,4,4>
+  2227432758U,	// <2,4,1,5>: Cost 3 vmrghw <2,1,3,1>, RHS
+  3758941400U,	// <2,4,1,6>: Cost 4 vsldoi8 <0,1,2,4>, <1,6,2,7>
+  3768894758U,	// <2,4,1,7>: Cost 4 vsldoi8 <1,7,2,4>, <1,7,2,4>
+  2227433001U,	// <2,4,1,u>: Cost 3 vmrghw <2,1,3,1>, RHS
+  2228030354U,	// <2,4,2,0>: Cost 3 vmrghw <2,2,2,2>, <4,0,5,1>
+  3770885657U,	// <2,4,2,1>: Cost 4 vsldoi8 <2,1,2,4>, <2,1,2,4>
+  2697807466U,	// <2,4,2,2>: Cost 3 vsldoi8 <2,2,2,4>, <2,2,2,4>
+  3368880468U,	// <2,4,2,3>: Cost 4 vmrglw <2,2,2,2>, <3,2,4,3>
+  2228030672U,	// <2,4,2,4>: Cost 3 vmrghw <2,2,2,2>, <4,4,4,4>
+  1154288950U,	// <2,4,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS
+  3771549617U,	// <2,4,2,6>: Cost 4 vsldoi8 <2,2,2,4>, <2,6,2,7>
+  3368880796U,	// <2,4,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,4,7>
+  1154289193U,	// <2,4,2,u>: Cost 2 vmrghw <2,2,2,2>, RHS
+  2636808294U,	// <2,4,3,0>: Cost 3 vsldoi4 <3,2,4,3>, LHS
+  2287181861U,	// <2,4,3,1>: Cost 3 vmrglw LHS, <0,0,4,1>
+  2228866102U,	// <2,4,3,2>: Cost 3 vmrghw <2,3,4,5>, <4,2,5,3>
+  2636810580U,	// <2,4,3,3>: Cost 3 vsldoi4 <3,2,4,3>, <3,2,4,3>
+  1256574160U,	// <2,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+  1213441742U,	// <2,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+  2228866430U,	// <2,4,3,6>: Cost 3 vmrghw <2,3,4,5>, <4,6,5,7>
+  2660701368U,	// <2,4,3,7>: Cost 3 vsldoi4 <7,2,4,3>, <7,2,4,3>
+  1213441745U,	// <2,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+  3704586342U,	// <2,4,4,0>: Cost 4 vsldoi4 <2,2,4,4>, LHS
+  3782831051U,	// <2,4,4,1>: Cost 4 vsldoi8 <4,1,2,4>, <4,1,2,4>
+  3704587900U,	// <2,4,4,2>: Cost 4 vsldoi4 <2,2,4,4>, <2,2,4,4>
+  3368896123U,	// <2,4,4,3>: Cost 4 vmrglw <2,2,2,4>, <2,2,4,3>
+  2793360592U,	// <2,4,4,4>: Cost 3 vsldoi12 <7,0,1,2>, <4,4,4,4>
+  2697809206U,	// <2,4,4,5>: Cost 3 vsldoi8 <2,2,2,4>, RHS
+  3303198078U,	// <2,4,4,6>: Cost 4 vmrghw <2,4,3,5>, <4,6,5,7>
+  3867102444U,	// <2,4,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,4,7,5>
+  2697809449U,	// <2,4,4,u>: Cost 3 vsldoi8 <2,2,2,4>, RHS
+  2630852710U,	// <2,4,5,0>: Cost 3 vsldoi4 <2,2,4,5>, LHS
+  2624881572U,	// <2,4,5,1>: Cost 3 vsldoi4 <1,2,4,5>, <1,2,4,5>
+  2630854269U,	// <2,4,5,2>: Cost 3 vsldoi4 <2,2,4,5>, <2,2,4,5>
+  2666686677U,	// <2,4,5,3>: Cost 3 vsldoi4 <u,2,4,5>, <3,0,u,2>
+  2630855990U,	// <2,4,5,4>: Cost 3 vsldoi4 <2,2,4,5>, RHS
+  2230127926U,	// <2,4,5,5>: Cost 3 vmrghw <2,5,3,6>, RHS
+  1691159862U,	// <2,4,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  3867102520U,	// <2,4,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,5,7,0>
+  1691159880U,	// <2,4,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2230799250U,	// <2,4,6,0>: Cost 3 vmrghw <2,6,3,7>, <4,0,5,1>
+  3304541130U,	// <2,4,6,1>: Cost 4 vmrghw <2,6,3,7>, <4,1,2,3>
+  2230799417U,	// <2,4,6,2>: Cost 3 vmrghw <2,6,3,7>, <4,2,5,6>
+  3304541323U,	// <2,4,6,3>: Cost 4 vmrghw <2,6,3,7>, <4,3,5,7>
+  2230799568U,	// <2,4,6,4>: Cost 3 vmrghw <2,6,3,7>, <4,4,4,4>
+  1157057846U,	// <2,4,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS
+  3304541566U,	// <2,4,6,6>: Cost 4 vmrghw <2,6,3,7>, <4,6,5,7>
+  3798758243U,	// <2,4,6,7>: Cost 4 vsldoi8 <6,7,2,4>, <6,7,2,4>
+  1157058089U,	// <2,4,6,u>: Cost 2 vmrghw <2,6,3,7>, RHS
+  3806721018U,	// <2,4,7,0>: Cost 4 vsldoi8 <u,1,2,4>, <7,0,1,2>
+  3853831590U,	// <2,4,7,1>: Cost 4 vsldoi12 <4,7,1,2>, <4,7,1,2>
+  3801412775U,	// <2,4,7,2>: Cost 4 vsldoi8 <7,2,2,4>, <7,2,2,4>
+  3802076408U,	// <2,4,7,3>: Cost 4 vsldoi8 <7,3,2,4>, <7,3,2,4>
+  3401436368U,	// <2,4,7,4>: Cost 4 vmrglw <7,6,2,7>, <4,4,4,4>
+  2793360840U,	// <2,4,7,5>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,5,0>
+  3804067307U,	// <2,4,7,6>: Cost 4 vsldoi8 <7,6,2,4>, <7,6,2,4>
+  3867102682U,	// <2,4,7,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,7,7,0>
+  2793360867U,	// <2,4,7,u>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,u,0>
+  2630877286U,	// <2,4,u,0>: Cost 3 vsldoi4 <2,2,4,u>, LHS
+  2282580144U,	// <2,4,u,1>: Cost 3 vmrglw LHS, <3,0,4,1>
+  2630878848U,	// <2,4,u,2>: Cost 3 vsldoi4 <2,2,4,u>, <2,2,4,u>
+  2636851545U,	// <2,4,u,3>: Cost 3 vsldoi4 <3,2,4,u>, <3,2,4,u>
+  1256615120U,	// <2,4,u,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+  1208837838U,	// <2,4,u,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+  1691160105U,	// <2,4,u,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2660742333U,	// <2,4,u,7>: Cost 3 vsldoi4 <7,2,4,u>, <7,2,4,u>
+  1208837841U,	// <2,4,u,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+  3766910976U,	// <2,5,0,0>: Cost 4 vsldoi8 <1,4,2,5>, <0,0,0,0>
+  2693169254U,	// <2,5,0,1>: Cost 3 vsldoi8 <1,4,2,5>, LHS
+  3760939181U,	// <2,5,0,2>: Cost 4 vsldoi8 <0,4,2,5>, <0,2,1,2>
+  3843214936U,	// <2,5,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <5,0,3,0>
+  3760939355U,	// <2,5,0,4>: Cost 4 vsldoi8 <0,4,2,5>, <0,4,2,5>
+  3867102827U,	// <2,5,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,5,1>
+  3867102836U,	// <2,5,0,6>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,6,1>
+  3867102844U,	// <2,5,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,7,0>
+  2693169821U,	// <2,5,0,u>: Cost 3 vsldoi8 <1,4,2,5>, LHS
+  3766911724U,	// <2,5,1,0>: Cost 4 vsldoi8 <1,4,2,5>, <1,0,2,1>
+  3766911796U,	// <2,5,1,1>: Cost 4 vsldoi8 <1,4,2,5>, <1,1,1,1>
+  2693170070U,	// <2,5,1,2>: Cost 3 vsldoi8 <1,4,2,5>, <1,2,3,0>
+  3384798262U,	// <2,5,1,3>: Cost 4 vmrglw <4,u,2,1>, <4,2,5,3>
+  2693170228U,	// <2,5,1,4>: Cost 3 vsldoi8 <1,4,2,5>, <1,4,2,5>
+  3301208068U,	// <2,5,1,5>: Cost 4 vmrghw <2,1,3,5>, <5,5,5,5>
+  3366879607U,	// <2,5,1,6>: Cost 4 vmrglw <1,u,2,1>, <0,4,5,6>
+  3867102925U,	// <2,5,1,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,1,7,0>
+  2695824760U,	// <2,5,1,u>: Cost 3 vsldoi8 <1,u,2,5>, <1,u,2,5>
+  2642845798U,	// <2,5,2,0>: Cost 3 vsldoi4 <4,2,5,2>, LHS
+  2295139218U,	// <2,5,2,1>: Cost 3 vmrglw <2,2,2,2>, <4,0,5,1>
+  2699142760U,	// <2,5,2,2>: Cost 3 vsldoi8 <2,4,2,5>, <2,2,2,2>
+  3766912678U,	// <2,5,2,3>: Cost 4 vsldoi8 <1,4,2,5>, <2,3,0,1>
+  2699142925U,	// <2,5,2,4>: Cost 3 vsldoi8 <2,4,2,5>, <2,4,2,5>
+  2228031492U,	// <2,5,2,5>: Cost 3 vmrghw <2,2,2,2>, <5,5,5,5>
+  2295138818U,	// <2,5,2,6>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,6>
+  3368879347U,	// <2,5,2,7>: Cost 4 vmrglw <2,2,2,2>, <1,6,5,7>
+  2295138820U,	// <2,5,2,u>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,u>
+  2287184866U,	// <2,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+  1256573842U,	// <2,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+  2642855630U,	// <2,5,3,2>: Cost 3 vsldoi4 <4,2,5,3>, <2,3,4,5>
+  2287182763U,	// <2,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+  2287184870U,	// <2,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+  1256574170U,	// <2,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+  1213442562U,	// <2,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  2287183091U,	// <2,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+  1213442564U,	// <2,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+  3716604006U,	// <2,5,4,0>: Cost 4 vsldoi4 <4,2,5,4>, LHS
+  3716604822U,	// <2,5,4,1>: Cost 4 vsldoi4 <4,2,5,4>, <1,2,3,0>
+  3766914099U,	// <2,5,4,2>: Cost 4 vsldoi8 <1,4,2,5>, <4,2,5,0>
+  3368895403U,	// <2,5,4,3>: Cost 5 vmrglw <2,2,2,4>, <1,2,5,3>
+  3716607031U,	// <2,5,4,4>: Cost 4 vsldoi4 <4,2,5,4>, <4,2,5,4>
+  2693172534U,	// <2,5,4,5>: Cost 3 vsldoi8 <1,4,2,5>, RHS
+  3363588610U,	// <2,5,4,6>: Cost 4 vmrglw <1,3,2,4>, <3,4,5,6>
+  3368895731U,	// <2,5,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,5,7>
+  2693172777U,	// <2,5,4,u>: Cost 3 vsldoi8 <1,4,2,5>, RHS
+  3704668262U,	// <2,5,5,0>: Cost 4 vsldoi4 <2,2,5,5>, LHS
+  3704669078U,	// <2,5,5,1>: Cost 4 vsldoi4 <2,2,5,5>, <1,2,3,0>
+  3704669830U,	// <2,5,5,2>: Cost 4 vsldoi4 <2,2,5,5>, <2,2,5,5>
+  3364259460U,	// <2,5,5,3>: Cost 4 vmrglw <1,4,2,5>, <2,2,5,3>
+  3704671542U,	// <2,5,5,4>: Cost 4 vsldoi4 <2,2,5,5>, RHS
+  2793361412U,	// <2,5,5,5>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5>
+  3364258167U,	// <2,5,5,6>: Cost 4 vmrglw <1,4,2,5>, <0,4,5,6>
+  3867103249U,	// <2,5,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,5,7,0>
+  2793361412U,	// <2,5,5,u>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5>
+  2642878566U,	// <2,5,6,0>: Cost 3 vsldoi4 <4,2,5,6>, LHS
+  3386166810U,	// <2,5,6,1>: Cost 4 vmrglw <5,1,2,6>, <4,u,5,1>
+  2723033594U,	// <2,5,6,2>: Cost 3 vsldoi8 <6,4,2,5>, <6,2,7,3>
+  3848523842U,	// <2,5,6,3>: Cost 4 vsldoi12 <3,u,1,2>, <5,6,3,4>
+  2723033713U,	// <2,5,6,4>: Cost 3 vsldoi8 <6,4,2,5>, <6,4,2,5>
+  2230800388U,	// <2,5,6,5>: Cost 3 vmrghw <2,6,3,7>, <5,5,5,5>
+  2230800482U,	// <2,5,6,6>: Cost 3 vmrghw <2,6,3,7>, <5,6,7,0>
+  2785841252U,	// <2,5,6,7>: Cost 3 vsldoi12 <5,6,7,2>, <5,6,7,2>
+  2785914989U,	// <2,5,6,u>: Cost 3 vsldoi12 <5,6,u,2>, <5,6,u,2>
+  3796775930U,	// <2,5,7,0>: Cost 4 vsldoi8 <6,4,2,5>, <7,0,1,2>
+  3800757335U,	// <2,5,7,1>: Cost 4 vsldoi8 <7,1,2,5>, <7,1,2,5>
+  3853463689U,	// <2,5,7,2>: Cost 4 vsldoi12 <4,6,5,2>, <5,7,2,3>
+  3796776218U,	// <2,5,7,3>: Cost 4 vsldoi8 <6,4,2,5>, <7,3,6,2>
+  3796776294U,	// <2,5,7,4>: Cost 4 vsldoi8 <6,4,2,5>, <7,4,5,6>
+  3803411867U,	// <2,5,7,5>: Cost 4 vsldoi8 <7,5,2,5>, <7,5,2,5>
+  3371575081U,	// <2,5,7,6>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,6>
+  3796776516U,	// <2,5,7,7>: Cost 4 vsldoi8 <6,4,2,5>, <7,7,3,3>
+  3371575083U,	// <2,5,7,u>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,u>
+  2287225826U,	// <2,5,u,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+  1256614802U,	// <2,5,u,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+  2642896590U,	// <2,5,u,2>: Cost 3 vsldoi4 <4,2,5,u>, <2,3,4,5>
+  2287223723U,	// <2,5,u,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+  2287225830U,	// <2,5,u,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+  1256615130U,	// <2,5,u,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+  1208838658U,	// <2,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  2287224051U,	// <2,5,u,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+  1208838660U,	// <2,5,u,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+  3772227584U,	// <2,6,0,0>: Cost 4 vsldoi8 <2,3,2,6>, <0,0,0,0>
+  2698485862U,	// <2,6,0,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+  3759620282U,	// <2,6,0,2>: Cost 4 vsldoi8 <0,2,2,6>, <0,2,2,6>
+  3710675299U,	// <2,6,0,3>: Cost 4 vsldoi4 <3,2,6,0>, <3,2,6,0>
+  3767583058U,	// <2,6,0,4>: Cost 4 vsldoi8 <1,5,2,6>, <0,4,1,5>
+  3378153265U,	// <2,6,0,5>: Cost 5 vmrglw <3,7,2,0>, <2,4,6,5>
+  3865186637U,	// <2,6,0,6>: Cost 4 vsldoi12 <6,6,2,2>, <6,0,6,1>
+  2330291510U,	// <2,6,0,7>: Cost 3 vmrglw <u,1,2,0>, RHS
+  2698486429U,	// <2,6,0,u>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+  3734569062U,	// <2,6,1,0>: Cost 4 vsldoi4 <7,2,6,1>, LHS
+  3764929346U,	// <2,6,1,1>: Cost 4 vsldoi8 <1,1,2,6>, <1,1,2,6>
+  3772228502U,	// <2,6,1,2>: Cost 4 vsldoi8 <2,3,2,6>, <1,2,3,0>
+  3734571158U,	// <2,6,1,3>: Cost 4 vsldoi4 <7,2,6,1>, <3,0,1,2>
+  3734572342U,	// <2,6,1,4>: Cost 4 vsldoi4 <7,2,6,1>, RHS
+  3767583878U,	// <2,6,1,5>: Cost 4 vsldoi8 <1,5,2,6>, <1,5,2,6>
+  3768247511U,	// <2,6,1,6>: Cost 4 vsldoi8 <1,6,2,6>, <1,6,2,6>
+  2293140790U,	// <2,6,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS
+  2293140791U,	// <2,6,1,u>: Cost 3 vmrglw <1,u,2,1>, RHS
+  3704717414U,	// <2,6,2,0>: Cost 4 vsldoi4 <2,2,6,2>, LHS
+  3395424589U,	// <2,6,2,1>: Cost 4 vmrglw <6,6,2,2>, <6,0,6,1>
+  2228031993U,	// <2,6,2,2>: Cost 3 vmrghw <2,2,2,2>, <6,2,7,2>
+  2698487485U,	// <2,6,2,3>: Cost 3 vsldoi8 <2,3,2,6>, <2,3,2,6>
+  3704720694U,	// <2,6,2,4>: Cost 4 vsldoi4 <2,2,6,2>, RHS
+  3773556575U,	// <2,6,2,5>: Cost 4 vsldoi8 <2,5,2,6>, <2,5,2,6>
+  2698487738U,	// <2,6,2,6>: Cost 3 vsldoi8 <2,3,2,6>, <2,6,3,7>
+  1221397814U,	// <2,6,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS
+  1221397815U,	// <2,6,2,u>: Cost 2 vmrglw <2,2,2,2>, RHS
+  2636955750U,	// <2,6,3,0>: Cost 3 vsldoi4 <3,2,6,3>, LHS
+  2330314217U,	// <2,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1>
+  2636957626U,	// <2,6,3,2>: Cost 3 vsldoi4 <3,2,6,3>, <2,6,3,7>
+  2287184230U,	// <2,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+  2636959030U,	// <2,6,3,4>: Cost 3 vsldoi4 <3,2,6,3>, RHS
+  2648903448U,	// <2,6,3,5>: Cost 3 vsldoi4 <5,2,6,3>, <5,2,6,3>
+  1256575800U,	// <2,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+  135056694U,	// <2,6,3,7>: Cost 1 vmrglw LHS, RHS
+  135056695U,	// <2,6,3,u>: Cost 1 vmrglw LHS, RHS
+  3710705766U,	// <2,6,4,0>: Cost 4 vsldoi4 <3,2,6,4>, LHS
+  3698762677U,	// <2,6,4,1>: Cost 5 vsldoi4 <1,2,6,4>, <1,2,6,4>
+  3710707389U,	// <2,6,4,2>: Cost 4 vsldoi4 <3,2,6,4>, <2,3,2,6>
+  3710708071U,	// <2,6,4,3>: Cost 4 vsldoi4 <3,2,6,4>, <3,2,6,4>
+  3710709046U,	// <2,6,4,4>: Cost 4 vsldoi4 <3,2,6,4>, RHS
+  2698489142U,	// <2,6,4,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS
+  3796782457U,	// <2,6,4,6>: Cost 4 vsldoi8 <6,4,2,6>, <4,6,5,2>
+  2295156022U,	// <2,6,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS
+  2295156023U,	// <2,6,4,u>: Cost 3 vmrglw <2,2,2,4>, RHS
+  3303870753U,	// <2,6,5,0>: Cost 4 vmrghw <2,5,3,6>, <6,0,1,2>
+  3788820134U,	// <2,6,5,1>: Cost 4 vsldoi8 <5,1,2,6>, <5,1,2,6>
+  3779530520U,	// <2,6,5,2>: Cost 4 vsldoi8 <3,5,2,6>, <5,2,6,3>
+  3303871026U,	// <2,6,5,3>: Cost 4 vmrghw <2,5,3,6>, <6,3,4,5>
+  3303871117U,	// <2,6,5,4>: Cost 4 vmrghw <2,5,3,6>, <6,4,5,6>
+  3791474666U,	// <2,6,5,5>: Cost 4 vsldoi8 <5,5,2,6>, <5,5,2,6>
+  3792138299U,	// <2,6,5,6>: Cost 4 vsldoi8 <5,6,2,6>, <5,6,2,6>
+  2290519350U,	// <2,6,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS
+  2290519351U,	// <2,6,5,u>: Cost 3 vmrglw <1,4,2,5>, RHS
+  2631008358U,	// <2,6,6,0>: Cost 3 vsldoi4 <2,2,6,6>, LHS
+  3372893673U,	// <2,6,6,1>: Cost 4 vmrglw <2,u,2,6>, <2,0,6,1>
+  2791445264U,	// <2,6,6,2>: Cost 3 vsldoi12 <6,6,2,2>, <6,6,2,2>
+  2230800968U,	// <2,6,6,3>: Cost 3 vmrghw <2,6,3,7>, <6,3,7,0>
+  2631011638U,	// <2,6,6,4>: Cost 3 vsldoi4 <2,2,6,6>, RHS
+  3372894001U,	// <2,6,6,5>: Cost 4 vmrglw <2,u,2,6>, <2,4,6,5>
+  2793362232U,	// <2,6,6,6>: Cost 3 vsldoi12 <7,0,1,2>, <6,6,6,6>
+  2295835958U,	// <2,6,6,7>: Cost 3 vmrglw <2,3,2,6>, RHS
+  2295835959U,	// <2,6,6,u>: Cost 3 vmrglw <2,3,2,6>, RHS
+  2793362254U,	// <2,6,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,0,1>
+  2792035160U,	// <2,6,7,1>: Cost 3 vsldoi12 <6,7,1,2>, <6,7,1,2>
+  2792108897U,	// <2,6,7,2>: Cost 3 vsldoi12 <6,7,2,2>, <6,7,2,2>
+  2769474408U,	// <2,6,7,3>: Cost 3 vsldoi12 <3,0,1,2>, <6,7,3,0>
+  2793362294U,	// <2,6,7,4>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,4,5>
+  3371575089U,	// <2,6,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,6,5>
+  2792403845U,	// <2,6,7,6>: Cost 3 vsldoi12 <6,7,6,2>, <6,7,6,2>
+  2297834806U,	// <2,6,7,7>: Cost 3 vmrglw <2,6,2,7>, RHS
+  2297834807U,	// <2,6,7,u>: Cost 3 vmrglw <2,6,2,7>, RHS
+  2636996710U,	// <2,6,u,0>: Cost 3 vsldoi4 <3,2,6,u>, LHS
+  2698491694U,	// <2,6,u,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+  2636998631U,	// <2,6,u,2>: Cost 3 vsldoi4 <3,2,6,u>, <2,6,u,7>
+  2282580326U,	// <2,6,u,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+  2636999990U,	// <2,6,u,4>: Cost 3 vsldoi4 <3,2,6,u>, RHS
+  2698492058U,	// <2,6,u,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS
+  1256616760U,	// <2,6,u,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+  135097654U,	// <2,6,u,7>: Cost 1 vmrglw LHS, RHS
+  135097655U,	// <2,6,u,u>: Cost 1 vmrglw LHS, RHS
+  2666864742U,	// <2,7,0,0>: Cost 3 vsldoi4 <u,2,7,0>, LHS
+  1719620602U,	// <2,7,0,1>: Cost 2 vsldoi12 <7,0,1,2>, <7,0,1,2>
+  3768254637U,	// <2,7,0,2>: Cost 4 vsldoi8 <1,6,2,7>, <0,2,1,2>
+  3393417722U,	// <2,7,0,3>: Cost 4 vmrglw <6,3,2,0>, <6,2,7,3>
+  2666868022U,	// <2,7,0,4>: Cost 3 vsldoi4 <u,2,7,0>, RHS
+  3867104290U,	// <2,7,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,0,5,6>
+  3728667127U,	// <2,7,0,6>: Cost 4 vsldoi4 <6,2,7,0>, <6,2,7,0>
+  2666869817U,	// <2,7,0,7>: Cost 3 vsldoi4 <u,2,7,0>, <7,0,u,2>
+  1720136761U,	// <2,7,0,u>: Cost 2 vsldoi12 <7,0,u,2>, <7,0,u,2>
+  3728670822U,	// <2,7,1,0>: Cost 4 vsldoi4 <6,2,7,1>, LHS
+  3774227252U,	// <2,7,1,1>: Cost 4 vsldoi8 <2,6,2,7>, <1,1,1,1>
+  3774227350U,	// <2,7,1,2>: Cost 4 vsldoi8 <2,6,2,7>, <1,2,3,0>
+  2323001850U,	// <2,7,1,3>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3>
+  3728674102U,	// <2,7,1,4>: Cost 4 vsldoi4 <6,2,7,1>, RHS
+  3774227567U,	// <2,7,1,5>: Cost 5 vsldoi8 <2,6,2,7>, <1,5,0,1>
+  2694513880U,	// <2,7,1,6>: Cost 3 vsldoi8 <1,6,2,7>, <1,6,2,7>
+  3396744002U,	// <2,7,1,7>: Cost 4 vmrglw <6,u,2,1>, <6,6,7,7>
+  2323001850U,	// <2,7,1,u>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3>
+  2654937190U,	// <2,7,2,0>: Cost 3 vsldoi4 <6,2,7,2>, LHS
+  3728679732U,	// <2,7,2,1>: Cost 4 vsldoi4 <6,2,7,2>, <1,1,1,1>
+  2700486248U,	// <2,7,2,2>: Cost 3 vsldoi8 <2,6,2,7>, <2,2,2,2>
+  2321682938U,	// <2,7,2,3>: Cost 3 vmrglw <6,6,2,2>, <6,2,7,3>
+  2654940470U,	// <2,7,2,4>: Cost 3 vsldoi4 <6,2,7,2>, RHS
+  3859584196U,	// <2,7,2,5>: Cost 4 vsldoi12 <5,6,7,2>, <7,2,5,6>
+  2700486577U,	// <2,7,2,6>: Cost 3 vsldoi8 <2,6,2,7>, <2,6,2,7>
+  2228033132U,	// <2,7,2,7>: Cost 3 vmrghw <2,2,2,2>, <7,7,7,7>
+  2701813843U,	// <2,7,2,u>: Cost 3 vsldoi8 <2,u,2,7>, <2,u,2,7>
+  1581203558U,	// <2,7,3,0>: Cost 2 vsldoi4 <6,2,7,3>, LHS
+  2654946100U,	// <2,7,3,1>: Cost 3 vsldoi4 <6,2,7,3>, <1,1,1,1>
+  2637031354U,	// <2,7,3,2>: Cost 3 vsldoi4 <3,2,7,3>, <2,6,3,7>
+  1256575482U,	// <2,7,3,3>: Cost 2 vmrglw LHS, <6,2,7,3>
+  1581206838U,	// <2,7,3,4>: Cost 2 vsldoi4 <6,2,7,3>, RHS
+  2654949380U,	// <2,7,3,5>: Cost 3 vsldoi4 <6,2,7,3>, <5,5,5,5>
+  1581208058U,	// <2,7,3,6>: Cost 2 vsldoi4 <6,2,7,3>, <6,2,7,3>
+  1256575810U,	// <2,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+  1581209390U,	// <2,7,3,u>: Cost 2 vsldoi4 <6,2,7,3>, LHS
+  3728695398U,	// <2,7,4,0>: Cost 4 vsldoi4 <6,2,7,4>, LHS
+  3869758782U,	// <2,7,4,1>: Cost 4 vsldoi12 <7,4,1,2>, <7,4,1,2>
+  3728696936U,	// <2,7,4,2>: Cost 4 vsldoi4 <6,2,7,4>, <2,2,2,2>
+  3393450490U,	// <2,7,4,3>: Cost 4 vmrglw <6,3,2,4>, <6,2,7,3>
+  3728698678U,	// <2,7,4,4>: Cost 4 vsldoi4 <6,2,7,4>, RHS
+  2700487990U,	// <2,7,4,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+  3728699899U,	// <2,7,4,6>: Cost 4 vsldoi4 <6,2,7,4>, <6,2,7,4>
+  3867104626U,	// <2,7,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <7,4,7,0>
+  2700488233U,	// <2,7,4,u>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+  3855160709U,	// <2,7,5,0>: Cost 4 vsldoi12 <5,0,1,2>, <7,5,0,1>
+  3728704406U,	// <2,7,5,1>: Cost 4 vsldoi4 <6,2,7,5>, <1,2,3,0>
+  3370233956U,	// <2,7,5,2>: Cost 4 vmrglw <2,4,2,5>, <5,6,7,2>
+  2320380410U,	// <2,7,5,3>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3>
+  3728706870U,	// <2,7,5,4>: Cost 4 vsldoi4 <6,2,7,5>, RHS
+  3867104694U,	// <2,7,5,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,5,5,5>
+  3792146492U,	// <2,7,5,6>: Cost 4 vsldoi8 <5,6,2,7>, <5,6,2,7>
+  3394122562U,	// <2,7,5,7>: Cost 4 vmrglw <6,4,2,5>, <6,6,7,7>
+  2320380410U,	// <2,7,5,u>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3>
+  2230801402U,	// <2,7,6,0>: Cost 3 vmrghw <2,6,3,7>, <7,0,1,2>
+  3768258984U,	// <2,7,6,1>: Cost 4 vsldoi8 <1,6,2,7>, <6,1,7,2>
+  2730349050U,	// <2,7,6,2>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3>
+  3372894575U,	// <2,7,6,3>: Cost 4 vmrglw <2,u,2,6>, <3,2,7,3>
+  2230801766U,	// <2,7,6,4>: Cost 3 vmrghw <2,6,3,7>, <7,4,5,6>
+  3304543670U,	// <2,7,6,5>: Cost 4 vmrghw <2,6,3,7>, <7,5,5,5>
+  3728716285U,	// <2,7,6,6>: Cost 4 vsldoi4 <6,2,7,6>, <6,2,7,6>
+  2230802028U,	// <2,7,6,7>: Cost 3 vmrghw <2,6,3,7>, <7,7,7,7>
+  2730349050U,	// <2,7,6,u>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3>
+  2793362983U,	// <2,7,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,0,1>
+  3728721112U,	// <2,7,7,1>: Cost 4 vsldoi4 <6,2,7,7>, <1,6,2,7>
+  3371574933U,	// <2,7,7,2>: Cost 4 vmrglw <2,6,2,7>, <2,2,7,2>
+  2327695866U,	// <2,7,7,3>: Cost 3 vmrglw <7,6,2,7>, <6,2,7,3>
+  3728723254U,	// <2,7,7,4>: Cost 4 vsldoi4 <6,2,7,7>, RHS
+  3371574855U,	// <2,7,7,5>: Cost 5 vmrglw <2,6,2,7>, <2,1,7,5>
+  2730350062U,	// <2,7,7,6>: Cost 3 vsldoi8 <7,6,2,7>, <7,6,2,7>
+  2793363052U,	// <2,7,7,7>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,7,7>
+  2798671471U,	// <2,7,7,u>: Cost 3 vsldoi12 <7,u,1,2>, <7,7,u,1>
+  1581244518U,	// <2,7,u,0>: Cost 2 vsldoi4 <6,2,7,u>, LHS
+  1724929666U,	// <2,7,u,1>: Cost 2 vsldoi12 <7,u,1,2>, <7,u,1,2>
+  2637072314U,	// <2,7,u,2>: Cost 3 vsldoi4 <3,2,7,u>, <2,6,3,7>
+  1256616442U,	// <2,7,u,3>: Cost 2 vmrglw LHS, <6,2,7,3>
+  1581247798U,	// <2,7,u,4>: Cost 2 vsldoi4 <6,2,7,u>, RHS
+  2700490906U,	// <2,7,u,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+  1581249023U,	// <2,7,u,6>: Cost 2 vsldoi4 <6,2,7,u>, <6,2,7,u>
+  1256616770U,	// <2,7,u,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+  1581250350U,	// <2,7,u,u>: Cost 2 vsldoi4 <6,2,7,u>, LHS
+  1611489280U,	// <2,u,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+  537747563U,	// <2,u,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2685231277U,	// <2,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  2685231356U,	// <2,u,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+  1611489618U,	// <2,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  2226763930U,	// <2,u,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS
+  2733007350U,	// <2,u,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7>
+  2660971737U,	// <2,u,0,7>: Cost 3 vsldoi4 <7,2,u,0>, <7,2,u,0>
+  537748125U,	// <2,u,0,u>: Cost 1 vsldoi8 LHS, LHS
+  2689876708U,	// <2,u,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2>
+  1611490100U,	// <2,u,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+  1611490198U,	// <2,u,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+  2293137564U,	// <2,u,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS
+  2689877072U,	// <2,u,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6>
+  2689877103U,	// <2,u,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+  2689877199U,	// <2,u,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+  2293140808U,	// <2,u,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS
+  1616135548U,	// <2,u,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+  1556938854U,	// <2,u,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  1154291502U,	// <2,u,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS
+  336380006U,	// <2,u,2,2>: Cost 1 vspltisw2 LHS
+  1611490982U,	// <2,u,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+  1556942134U,	// <2,u,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  1154291866U,	// <2,u,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS
+  1611491258U,	// <2,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  1221397832U,	// <2,u,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS
+  336380006U,	// <2,u,2,u>: Cost 1 vspltisw2 LHS
+  1611491478U,	// <2,u,3,0>: Cost 2 vsldoi8 LHS, <3,0,1,2>
+  1213440073U,	// <2,u,3,1>: Cost 2 vmrglw LHS, <0,0,u,1>
+  1213442261U,	// <2,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2>
+  135053468U,	// <2,u,3,3>: Cost 1 vmrglw LHS, LHS
+  1611491842U,	// <2,u,3,4>: Cost 2 vsldoi8 LHS, <3,4,5,6>
+  1213440401U,	// <2,u,3,5>: Cost 2 vmrglw LHS, <0,4,u,5>
+  1213442589U,	// <2,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+  135056712U,	// <2,u,3,7>: Cost 1 vmrglw LHS, RHS
+  135053473U,	// <2,u,3,u>: Cost 1 vmrglw LHS, LHS
+  1551425638U,	// <2,u,4,0>: Cost 2 vsldoi4 <1,2,u,4>, LHS
+  1551426503U,	// <2,u,4,1>: Cost 2 vsldoi4 <1,2,u,4>, <1,2,u,4>
+  2625169000U,	// <2,u,4,2>: Cost 3 vsldoi4 <1,2,u,4>, <2,2,2,2>
+  2625169558U,	// <2,u,4,3>: Cost 3 vsldoi4 <1,2,u,4>, <3,0,1,2>
+  1551428918U,	// <2,u,4,4>: Cost 2 vsldoi4 <1,2,u,4>, RHS
+  537750838U,	// <2,u,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2733010297U,	// <2,u,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2>
+  2295156040U,	// <2,u,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS
+  537751081U,	// <2,u,4,u>: Cost 1 vsldoi8 LHS, RHS
+  2689879624U,	// <2,u,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2>
+  2230130478U,	// <2,u,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS
+  2631149217U,	// <2,u,5,2>: Cost 3 vsldoi4 <2,2,u,5>, <2,2,u,5>
+  2290516124U,	// <2,u,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS
+  2689879988U,	// <2,u,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6>
+  1659269124U,	// <2,u,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+  1691162778U,	// <2,u,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2290519368U,	// <2,u,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS
+  1691162796U,	// <2,u,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2230802131U,	// <2,u,6,0>: Cost 3 vmrghw <2,6,3,7>, <u,0,1,2>
+  1157060398U,	// <2,u,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS
+  1659269626U,	// <2,u,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3>
+  2764904656U,	// <2,u,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <u,6,3,7>
+  2230802495U,	// <2,u,6,4>: Cost 3 vmrghw <2,6,3,7>, <u,4,5,6>
+  1157060762U,	// <2,u,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS
+  1659269944U,	// <2,u,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+  1659269966U,	// <2,u,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+  1157060965U,	// <2,u,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS
+  1659270138U,	// <2,u,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2>
+  2727040090U,	// <2,u,7,1>: Cost 3 vsldoi8 <7,1,2,u>, <7,1,2,u>
+  2727703723U,	// <2,u,7,2>: Cost 3 vsldoi8 <7,2,2,u>, <7,2,2,u>
+  2297831580U,	// <2,u,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS
+  1659270502U,	// <2,u,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6>
+  2733012406U,	// <2,u,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5>
+  2730358255U,	// <2,u,7,6>: Cost 3 vsldoi8 <7,6,2,u>, <7,6,2,u>
+  1659270764U,	// <2,u,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7>
+  1659270786U,	// <2,u,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2>
+  1213481923U,	// <2,u,u,0>: Cost 2 vmrglw LHS, <1,2,u,0>
+  537753390U,	// <2,u,u,1>: Cost 1 vsldoi8 LHS, LHS
+  336380006U,	// <2,u,u,2>: Cost 1 vspltisw2 LHS
+  135094428U,	// <2,u,u,3>: Cost 1 vmrglw LHS, LHS
+  1213481927U,	// <2,u,u,4>: Cost 2 vmrglw LHS, <1,2,u,4>
+  537753754U,	// <2,u,u,5>: Cost 1 vsldoi8 LHS, RHS
+  1208838685U,	// <2,u,u,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+  135097672U,	// <2,u,u,7>: Cost 1 vmrglw LHS, RHS
+  135094433U,	// <2,u,u,u>: Cost 1 vmrglw LHS, LHS
+  1678557184U,	// <3,0,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0>
+  1678557194U,	// <3,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1>
+  2631181989U,	// <3,0,0,2>: Cost 3 vsldoi4 <2,3,0,0>, <2,3,0,0>
+  2289223984U,	// <3,0,0,3>: Cost 3 vmrglw <1,2,3,0>, <3,2,0,3>
+  2756943909U,	// <3,0,0,4>: Cost 3 vsldoi12 LHS, <0,0,4,1>
+  3362965729U,	// <3,0,0,5>: Cost 4 vmrglw <1,2,3,0>, <3,1,0,5>
+  3362966054U,	// <3,0,0,6>: Cost 4 vmrglw <1,2,3,0>, <3,5,0,6>
+  2289224312U,	// <3,0,0,7>: Cost 3 vmrglw <1,2,3,0>, <3,6,0,7>
+  1683202121U,	// <3,0,0,u>: Cost 2 vsldoi12 LHS, <0,0,u,1>
+  1557446758U,	// <3,0,1,0>: Cost 2 vsldoi4 <2,3,0,1>, LHS
+  2752741467U,	// <3,0,1,1>: Cost 3 vsldoi12 LHS, <0,1,1,1>
+  604815462U,	// <3,0,1,2>: Cost 1 vsldoi12 LHS, LHS
+  2631190676U,	// <3,0,1,3>: Cost 3 vsldoi4 <2,3,0,1>, <3,0,1,0>
+  1557450038U,	// <3,0,1,4>: Cost 2 vsldoi4 <2,3,0,1>, RHS
+  2667024388U,	// <3,0,1,5>: Cost 3 vsldoi4 <u,3,0,1>, <5,5,5,5>
+  2800074894U,	// <3,0,1,6>: Cost 3 vsldoi12 LHS, <0,1,6,7>
+  2661053667U,	// <3,0,1,7>: Cost 3 vsldoi4 <7,3,0,1>, <7,3,0,1>
+  604815516U,	// <3,0,1,u>: Cost 1 vsldoi12 LHS, LHS
+  2696521165U,	// <3,0,2,0>: Cost 3 vsldoi8 <2,0,3,0>, <2,0,3,0>
+  2752741549U,	// <3,0,2,1>: Cost 3 vsldoi12 LHS, <0,2,1,2>
+  2691876456U,	// <3,0,2,2>: Cost 3 vsldoi8 <1,2,3,0>, <2,2,2,2>
+  2691876518U,	// <3,0,2,3>: Cost 3 vsldoi8 <1,2,3,0>, <2,3,0,1>
+  3830685895U,	// <3,0,2,4>: Cost 4 vsldoi12 LHS, <0,2,4,1>
+  3765618536U,	// <3,0,2,5>: Cost 4 vsldoi8 <1,2,3,0>, <2,5,3,6>
+  2691876794U,	// <3,0,2,6>: Cost 3 vsldoi8 <1,2,3,0>, <2,6,3,7>
+  2701166596U,	// <3,0,2,7>: Cost 3 vsldoi8 <2,7,3,0>, <2,7,3,0>
+  2756944108U,	// <3,0,2,u>: Cost 3 vsldoi12 LHS, <0,2,u,2>
+  2691877014U,	// <3,0,3,0>: Cost 3 vsldoi8 <1,2,3,0>, <3,0,1,2>
+  1161003110U,	// <3,0,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS
+  2691877168U,	// <3,0,3,2>: Cost 3 vsldoi8 <1,2,3,0>, <3,2,0,3>
+  2691877246U,	// <3,0,3,3>: Cost 3 vsldoi8 <1,2,3,0>, <3,3,0,0>
+  2691877378U,	// <3,0,3,4>: Cost 3 vsldoi8 <1,2,3,0>, <3,4,5,6>
+  3765619238U,	// <3,0,3,5>: Cost 4 vsldoi8 <1,2,3,0>, <3,5,0,6>
+  2691877496U,	// <3,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7>
+  3368962680U,	// <3,0,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,0,7>
+  1161003677U,	// <3,0,3,u>: Cost 2 vmrghw <3,3,3,3>, LHS
+  2289254400U,	// <3,0,4,0>: Cost 3 vmrglw <1,2,3,4>, <0,0,0,0>
+  1678557522U,	// <3,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5>
+  2631214761U,	// <3,0,4,2>: Cost 3 vsldoi4 <2,3,0,4>, <2,3,0,4>
+  2235580672U,	// <3,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4>
+  2756944237U,	// <3,0,4,4>: Cost 3 vsldoi12 LHS, <0,4,4,5>
+  1618136374U,	// <3,0,4,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS
+  3309322742U,	// <3,0,4,6>: Cost 4 vmrghw <3,4,5,6>, <0,6,1,7>
+  3362998904U,	// <3,0,4,7>: Cost 4 vmrglw <1,2,3,4>, <3,6,0,7>
+  1683202449U,	// <3,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5>
+  3765620296U,	// <3,0,5,0>: Cost 4 vsldoi8 <1,2,3,0>, <5,0,1,2>
+  2752299427U,	// <3,0,5,1>: Cost 3 vsldoi12 LHS, <0,5,1,5>
+  3789508346U,	// <3,0,5,2>: Cost 4 vsldoi8 <5,2,3,0>, <5,2,3,0>
+  3403486842U,	// <3,0,5,3>: Cost 4 vmrglw <u,0,3,5>, <7,u,0,3>
+  3765620660U,	// <3,0,5,4>: Cost 4 vsldoi8 <1,2,3,0>, <5,4,5,6>
+  2733682692U,	// <3,0,5,5>: Cost 3 vsldoi8 <u,2,3,0>, <5,5,5,5>
+  2800075218U,	// <3,0,5,6>: Cost 3 vsldoi12 LHS, <0,5,6,7>
+  3873817044U,	// <3,0,5,7>: Cost 4 vsldoi12 LHS, <0,5,7,0>
+  2800075234U,	// <3,0,5,u>: Cost 3 vsldoi12 LHS, <0,5,u,5>
+  2752299501U,	// <3,0,6,0>: Cost 3 vsldoi12 LHS, <0,6,0,7>
+  2236547174U,	// <3,0,6,1>: Cost 3 vmrghw <3,6,0,7>, LHS
+  2733683194U,	// <3,0,6,2>: Cost 3 vsldoi8 <u,2,3,0>, <6,2,7,3>
+  3844473352U,	// <3,0,6,3>: Cost 4 vsldoi12 <3,2,0,3>, <0,6,3,7>
+  3310289234U,	// <3,0,6,4>: Cost 4 vmrghw <3,6,0,7>, <0,4,1,5>
+  3873817114U,	// <3,0,6,5>: Cost 4 vsldoi12 LHS, <0,6,5,7>
+  2733683512U,	// <3,0,6,6>: Cost 3 vsldoi8 <u,2,3,0>, <6,6,6,6>
+  2725057384U,	// <3,0,6,7>: Cost 3 vsldoi8 <6,7,3,0>, <6,7,3,0>
+  2236547741U,	// <3,0,6,u>: Cost 3 vmrghw <3,6,0,7>, LHS
+  2297905152U,	// <3,0,7,0>: Cost 3 vmrglw <2,6,3,7>, <0,0,0,0>
+  2297906854U,	// <3,0,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,1>
+  2727711916U,	// <3,0,7,2>: Cost 3 vsldoi8 <7,2,3,0>, <7,2,3,0>
+  3371649328U,	// <3,0,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,0,3>
+  2733684070U,	// <3,0,7,4>: Cost 3 vsldoi8 <u,2,3,0>, <7,4,5,6>
+  3734843490U,	// <3,0,7,5>: Cost 4 vsldoi4 <7,3,0,7>, <5,6,7,0>
+  3798799895U,	// <3,0,7,6>: Cost 4 vsldoi8 <6,7,3,0>, <7,6,7,3>
+  2733684332U,	// <3,0,7,7>: Cost 3 vsldoi8 <u,2,3,0>, <7,7,7,7>
+  2297906861U,	// <3,0,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,u>
+  1557504102U,	// <3,0,u,0>: Cost 2 vsldoi4 <2,3,0,u>, LHS
+  1678557842U,	// <3,0,u,1>: Cost 2 vsldoi12 LHS, <0,u,1,1>
+  604816029U,	// <3,0,u,2>: Cost 1 vsldoi12 LHS, LHS
+  2691880892U,	// <3,0,u,3>: Cost 3 vsldoi8 <1,2,3,0>, <u,3,0,1>
+  1557507382U,	// <3,0,u,4>: Cost 2 vsldoi4 <2,3,0,u>, RHS
+  1618139290U,	// <3,0,u,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS
+  2691881168U,	// <3,0,u,6>: Cost 3 vsldoi8 <1,2,3,0>, <u,6,3,7>
+  2661111018U,	// <3,0,u,7>: Cost 3 vsldoi4 <7,3,0,u>, <7,3,0,u>
+  604816083U,	// <3,0,u,u>: Cost 1 vsldoi12 LHS, LHS
+  2619310332U,	// <3,1,0,0>: Cost 3 vsldoi4 <0,3,1,0>, <0,3,1,0>
+  2756944612U,	// <3,1,0,1>: Cost 3 vsldoi12 LHS, <1,0,1,2>
+  2289221724U,	// <3,1,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,1,2>
+  2619312278U,	// <3,1,0,3>: Cost 3 vsldoi4 <0,3,1,0>, <3,0,1,2>
+  2619313462U,	// <3,1,0,4>: Cost 3 vsldoi4 <0,3,1,0>, RHS
+  2289221970U,	// <3,1,0,5>: Cost 3 vmrglw <1,2,3,0>, <0,4,1,5>
+  2232599768U,	// <3,1,0,6>: Cost 3 vmrghw <3,0,1,2>, <1,6,2,7>
+  3362964687U,	// <3,1,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,1,7>
+  2619316014U,	// <3,1,0,u>: Cost 3 vsldoi4 <0,3,1,0>, LHS
+  2756944683U,	// <3,1,1,0>: Cost 3 vsldoi12 LHS, <1,1,0,1>
+  1678558004U,	// <3,1,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  2691883927U,	// <3,1,1,2>: Cost 3 vsldoi8 <1,2,3,1>, <1,2,3,1>
+  3826631496U,	// <3,1,1,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,1,3,3>
+  2756944723U,	// <3,1,1,4>: Cost 3 vsldoi12 LHS, <1,1,4,5>
+  2756944732U,	// <3,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5>
+  3830686561U,	// <3,1,1,6>: Cost 4 vsldoi12 LHS, <1,1,6,1>
+  3734869228U,	// <3,1,1,7>: Cost 4 vsldoi4 <7,3,1,1>, <7,3,1,1>
+  1678558004U,	// <3,1,1,u>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  2696529358U,	// <3,1,2,0>: Cost 3 vsldoi8 <2,0,3,1>, <2,0,3,1>
+  2756944775U,	// <3,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3>
+  2294548630U,	// <3,1,2,2>: Cost 3 vmrglw <2,1,3,2>, <3,0,1,2>
+  1678558102U,	// <3,1,2,3>: Cost 2 vsldoi12 LHS, <1,2,3,0>
+  2631273782U,	// <3,1,2,4>: Cost 3 vsldoi4 <2,3,1,2>, RHS
+  2756944811U,	// <3,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3>
+  3830686644U,	// <3,1,2,6>: Cost 4 vsldoi12 LHS, <1,2,6,3>
+  2800075706U,	// <3,1,2,7>: Cost 3 vsldoi12 LHS, <1,2,7,0>
+  1679000515U,	// <3,1,2,u>: Cost 2 vsldoi12 LHS, <1,2,u,0>
+  2619334911U,	// <3,1,3,0>: Cost 3 vsldoi4 <0,3,1,3>, <0,3,1,3>
+  2295218186U,	// <3,1,3,1>: Cost 3 vmrglw <2,2,3,3>, <0,0,1,1>
+  2293229718U,	// <3,1,3,2>: Cost 3 vmrglw <1,u,3,3>, <3,0,1,2>
+  2619337116U,	// <3,1,3,3>: Cost 3 vsldoi4 <0,3,1,3>, <3,3,3,3>
+  2619338038U,	// <3,1,3,4>: Cost 3 vsldoi4 <0,3,1,3>, RHS
+  2295218514U,	// <3,1,3,5>: Cost 3 vmrglw <2,2,3,3>, <0,4,1,5>
+  3830686729U,	// <3,1,3,6>: Cost 4 vsldoi12 LHS, <1,3,6,7>
+  3368961231U,	// <3,1,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,1,7>
+  2619340590U,	// <3,1,3,u>: Cost 3 vsldoi4 <0,3,1,3>, LHS
+  2619343104U,	// <3,1,4,0>: Cost 3 vsldoi4 <0,3,1,4>, <0,3,1,4>
+  2289254410U,	// <3,1,4,1>: Cost 3 vmrglw <1,2,3,4>, <0,0,1,1>
+  2289256598U,	// <3,1,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,1,2>
+  2619345410U,	// <3,1,4,3>: Cost 3 vsldoi4 <0,3,1,4>, <3,4,5,6>
+  2619346230U,	// <3,1,4,4>: Cost 3 vsldoi4 <0,3,1,4>, RHS
+  2756944976U,	// <3,1,4,5>: Cost 3 vsldoi12 LHS, <1,4,5,6>
+  3362996401U,	// <3,1,4,6>: Cost 4 vmrglw <1,2,3,4>, <0,2,1,6>
+  3362997455U,	// <3,1,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,1,7>
+  2619348782U,	// <3,1,4,u>: Cost 3 vsldoi4 <0,3,1,4>, LHS
+  2756945007U,	// <3,1,5,0>: Cost 3 vsldoi12 LHS, <1,5,0,1>
+  3830686840U,	// <3,1,5,1>: Cost 4 vsldoi12 LHS, <1,5,1,1>
+  3358361750U,	// <3,1,5,2>: Cost 4 vmrglw <0,4,3,5>, <3,0,1,2>
+  3830686857U,	// <3,1,5,3>: Cost 4 vsldoi12 LHS, <1,5,3,0>
+  2756945047U,	// <3,1,5,4>: Cost 3 vsldoi12 LHS, <1,5,4,5>
+  2294571346U,	// <3,1,5,5>: Cost 3 vmrglw <2,1,3,5>, <0,4,1,5>
+  3806105698U,	// <3,1,5,6>: Cost 4 vsldoi8 <u,0,3,1>, <5,6,7,0>
+  3873817774U,	// <3,1,5,7>: Cost 4 vsldoi12 LHS, <1,5,7,1>
+  2756945079U,	// <3,1,5,u>: Cost 3 vsldoi12 LHS, <1,5,u,1>
+  3830686912U,	// <3,1,6,0>: Cost 4 vsldoi12 LHS, <1,6,0,1>
+  2756945103U,	// <3,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7>
+  2236547990U,	// <3,1,6,2>: Cost 3 vmrghw <3,6,0,7>, <1,2,3,0>
+  3826631905U,	// <3,1,6,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,6,3,7>
+  3830686952U,	// <3,1,6,4>: Cost 4 vsldoi12 LHS, <1,6,4,5>
+  2756945139U,	// <3,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7>
+  3830686972U,	// <3,1,6,6>: Cost 4 vsldoi12 LHS, <1,6,6,7>
+  2800076030U,	// <3,1,6,7>: Cost 3 vsldoi12 LHS, <1,6,7,0>
+  2756945166U,	// <3,1,6,u>: Cost 3 vsldoi12 LHS, <1,6,u,7>
+  3699081318U,	// <3,1,7,0>: Cost 4 vsldoi4 <1,3,1,7>, LHS
+  2297905162U,	// <3,1,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,1>
+  2297907350U,	// <3,1,7,2>: Cost 3 vmrglw <2,6,3,7>, <3,0,1,2>
+  3365675182U,	// <3,1,7,3>: Cost 4 vmrglw <1,6,3,7>, <0,2,1,3>
+  3699084598U,	// <3,1,7,4>: Cost 4 vsldoi4 <1,3,1,7>, RHS
+  2297905490U,	// <3,1,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,1,5>
+  2297905329U,	// <3,1,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6>
+  3368330447U,	// <3,1,7,7>: Cost 4 vmrglw <2,1,3,7>, <1,6,1,7>
+  2297905169U,	// <3,1,7,u>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,u>
+  2619375876U,	// <3,1,u,0>: Cost 3 vsldoi4 <0,3,1,u>, <0,3,1,u>
+  1678558004U,	// <3,1,u,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  2289289366U,	// <3,1,u,2>: Cost 3 vmrglw <1,2,3,u>, <3,0,1,2>
+  1679000956U,	// <3,1,u,3>: Cost 2 vsldoi12 LHS, <1,u,3,0>
+  2619378998U,	// <3,1,u,4>: Cost 3 vsldoi4 <0,3,1,u>, RHS
+  2756945297U,	// <3,1,u,5>: Cost 3 vsldoi12 LHS, <1,u,5,3>
+  2297905329U,	// <3,1,u,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6>
+  2800076192U,	// <3,1,u,7>: Cost 3 vsldoi12 LHS, <1,u,7,0>
+  1683203497U,	// <3,1,u,u>: Cost 2 vsldoi12 LHS, <1,u,u,0>
+  3362964203U,	// <3,2,0,0>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,0>
+  2289222380U,	// <3,2,0,1>: Cost 3 vmrglw <1,2,3,0>, <1,0,2,1>
+  2289222462U,	// <3,2,0,2>: Cost 3 vmrglw <1,2,3,0>, <1,1,2,2>
+  1215479910U,	// <3,2,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS
+  3362964207U,	// <3,2,0,4>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,4>
+  2289222708U,	// <3,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5>
+  2232600506U,	// <3,2,0,6>: Cost 3 vmrghw <3,0,1,2>, <2,6,3,7>
+  3396142296U,	// <3,2,0,7>: Cost 4 vmrglw <6,7,3,0>, <1,6,2,7>
+  1215479915U,	// <3,2,0,u>: Cost 2 vmrglw <1,2,3,0>, LHS
+  3699105894U,	// <3,2,1,0>: Cost 4 vsldoi4 <1,3,2,1>, LHS
+  3765633844U,	// <3,2,1,1>: Cost 4 vsldoi8 <1,2,3,2>, <1,1,1,1>
+  2691892120U,	// <3,2,1,2>: Cost 3 vsldoi8 <1,2,3,2>, <1,2,3,2>
+  2752300575U,	// <3,2,1,3>: Cost 3 vsldoi12 LHS, <2,1,3,1>
+  3699109174U,	// <3,2,1,4>: Cost 4 vsldoi4 <1,3,2,1>, RHS
+  3830687280U,	// <3,2,1,5>: Cost 5 vsldoi12 LHS, <2,1,5,0>
+  3830687289U,	// <3,2,1,6>: Cost 4 vsldoi12 LHS, <2,1,6,0>
+  3874260548U,	// <3,2,1,7>: Cost 4 vsldoi12 LHS, <2,1,7,2>
+  2752742988U,	// <3,2,1,u>: Cost 3 vsldoi12 LHS, <2,1,u,1>
+  2631344230U,	// <3,2,2,0>: Cost 3 vsldoi4 <2,3,2,2>, LHS
+  2697201184U,	// <3,2,2,1>: Cost 3 vsldoi8 <2,1,3,2>, <2,1,3,2>
+  1678558824U,	// <3,2,2,2>: Cost 2 vsldoi12 LHS, <2,2,2,2>
+  1678558834U,	// <3,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3>
+  2631347510U,	// <3,2,2,4>: Cost 3 vsldoi4 <2,3,2,2>, RHS
+  3368953613U,	// <3,2,2,5>: Cost 4 vmrglw <2,2,3,2>, <2,4,2,5>
+  2234304442U,	// <3,2,2,6>: Cost 3 vmrghw <3,2,6,3>, <2,6,3,7>
+  3368953777U,	// <3,2,2,7>: Cost 4 vmrglw <2,2,3,2>, <2,6,2,7>
+  1679001247U,	// <3,2,2,u>: Cost 2 vsldoi12 LHS, <2,2,u,3>
+  1678558886U,	// <3,2,3,0>: Cost 2 vsldoi12 LHS, <2,3,0,1>
+  2752300719U,	// <3,2,3,1>: Cost 3 vsldoi12 LHS, <2,3,1,1>
+  2752300729U,	// <3,2,3,2>: Cost 3 vsldoi12 LHS, <2,3,2,2>
+  1221476454U,	// <3,2,3,3>: Cost 2 vmrglw <2,2,3,3>, LHS
+  1678558926U,	// <3,2,3,4>: Cost 2 vsldoi12 LHS, <2,3,4,5>
+  2800076503U,	// <3,2,3,5>: Cost 3 vsldoi12 LHS, <2,3,5,5>
+  2234746810U,	// <3,2,3,6>: Cost 3 vmrghw <3,3,3,3>, <2,6,3,7>
+  2800076516U,	// <3,2,3,7>: Cost 3 vsldoi12 LHS, <2,3,7,0>
+  1678558958U,	// <3,2,3,u>: Cost 2 vsldoi12 LHS, <2,3,u,1>
+  3699130470U,	// <3,2,4,0>: Cost 4 vsldoi4 <1,3,2,4>, LHS
+  3362996972U,	// <3,2,4,1>: Cost 4 vmrglw <1,2,3,4>, <1,0,2,1>
+  2289256040U,	// <3,2,4,2>: Cost 3 vmrglw <1,2,3,4>, <2,2,2,2>
+  1215512678U,	// <3,2,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS
+  3362998676U,	// <3,2,4,4>: Cost 4 vmrglw <1,2,3,4>, <3,3,2,4>
+  2691894582U,	// <3,2,4,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS
+  2235582394U,	// <3,2,4,6>: Cost 3 vmrghw <3,4,5,6>, <2,6,3,7>
+  3734967544U,	// <3,2,4,7>: Cost 4 vsldoi4 <7,3,2,4>, <7,3,2,4>
+  1215512683U,	// <3,2,4,u>: Cost 2 vmrglw <1,2,3,4>, LHS
+  3705110630U,	// <3,2,5,0>: Cost 4 vsldoi4 <2,3,2,5>, LHS
+  3368313985U,	// <3,2,5,1>: Cost 4 vmrglw <2,1,3,5>, <1,5,2,1>
+  3368314472U,	// <3,2,5,2>: Cost 4 vmrglw <2,1,3,5>, <2,2,2,2>
+  2756945768U,	// <3,2,5,3>: Cost 3 vsldoi12 LHS, <2,5,3,6>
+  3705113910U,	// <3,2,5,4>: Cost 4 vsldoi4 <2,3,2,5>, RHS
+  3310061416U,	// <3,2,5,5>: Cost 4 vmrghw <3,5,6,6>, <2,5,3,6>
+  3310135226U,	// <3,2,5,6>: Cost 4 vmrghw <3,5,7,6>, <2,6,3,7>
+  3370305457U,	// <3,2,5,7>: Cost 5 vmrglw <2,4,3,5>, <2,6,2,7>
+  2752743317U,	// <3,2,5,u>: Cost 3 vsldoi12 LHS, <2,5,u,6>
+  2631376998U,	// <3,2,6,0>: Cost 3 vsldoi4 <2,3,2,6>, LHS
+  3705119540U,	// <3,2,6,1>: Cost 4 vsldoi4 <2,3,2,6>, <1,1,1,1>
+  2631378621U,	// <3,2,6,2>: Cost 3 vsldoi4 <2,3,2,6>, <2,3,2,6>
+  1678559162U,	// <3,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7>
+  2631380278U,	// <3,2,6,4>: Cost 3 vsldoi4 <2,3,2,6>, RHS
+  3370976956U,	// <3,2,6,5>: Cost 4 vmrglw <2,5,3,6>, <2,3,2,5>
+  2237065146U,	// <3,2,6,6>: Cost 3 vmrghw <3,6,7,7>, <2,6,3,7>
+  3798815594U,	// <3,2,6,7>: Cost 4 vsldoi8 <6,7,3,2>, <6,7,3,2>
+  1679001575U,	// <3,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7>
+  2800076778U,	// <3,2,7,0>: Cost 3 vsldoi12 LHS, <2,7,0,1>
+  3371647724U,	// <3,2,7,1>: Cost 4 vmrglw <2,6,3,7>, <1,0,2,1>
+  2297906792U,	// <3,2,7,2>: Cost 3 vmrglw <2,6,3,7>, <2,2,2,2>
+  1224163430U,	// <3,2,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS
+  3705130294U,	// <3,2,7,4>: Cost 4 vsldoi4 <2,3,2,7>, RHS
+  3371648052U,	// <3,2,7,5>: Cost 4 vmrglw <2,6,3,7>, <1,4,2,5>
+  2297906877U,	// <3,2,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,3,2,6>
+  3371648702U,	// <3,2,7,7>: Cost 4 vmrglw <2,6,3,7>, <2,3,2,7>
+  1224163435U,	// <3,2,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS
+  1679001659U,	// <3,2,u,0>: Cost 2 vsldoi12 LHS, <2,u,0,1>
+  2752743492U,	// <3,2,u,1>: Cost 3 vsldoi12 LHS, <2,u,1,1>
+  1678558824U,	// <3,2,u,2>: Cost 2 vsldoi12 LHS, <2,2,2,2>
+  1678559320U,	// <3,2,u,3>: Cost 2 vsldoi12 LHS, <2,u,3,3>
+  1679001699U,	// <3,2,u,4>: Cost 2 vsldoi12 LHS, <2,u,4,5>
+  2691897498U,	// <3,2,u,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS
+  2237908922U,	// <3,2,u,6>: Cost 3 vmrghw <3,u,1,2>, <2,6,3,7>
+  2800519289U,	// <3,2,u,7>: Cost 3 vsldoi12 LHS, <2,u,7,0>
+  1679001731U,	// <3,2,u,u>: Cost 2 vsldoi12 LHS, <2,u,u,1>
+  1215480726U,	// <3,3,0,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0>
+  1678559382U,	// <3,3,0,1>: Cost 2 vsldoi12 LHS, <3,0,1,2>
+  2631403200U,	// <3,3,0,2>: Cost 3 vsldoi4 <2,3,3,0>, <2,3,3,0>
+  2289223282U,	// <3,3,0,3>: Cost 3 vmrglw <1,2,3,0>, <2,2,3,3>
+  2752301232U,	// <3,3,0,4>: Cost 3 vsldoi12 LHS, <3,0,4,1>
+  3362965027U,	// <3,3,0,5>: Cost 4 vmrglw <1,2,3,0>, <2,1,3,5>
+  3362965352U,	// <3,3,0,6>: Cost 4 vmrglw <1,2,3,0>, <2,5,3,6>
+  2289223610U,	// <3,3,0,7>: Cost 3 vmrglw <1,2,3,0>, <2,6,3,7>
+  1678559445U,	// <3,3,0,u>: Cost 2 vsldoi12 LHS, <3,0,u,2>
+  3830687964U,	// <3,3,1,0>: Cost 4 vsldoi12 LHS, <3,1,0,0>
+  2752301286U,	// <3,3,1,1>: Cost 3 vsldoi12 LHS, <3,1,1,1>
+  2752301297U,	// <3,3,1,2>: Cost 3 vsldoi12 LHS, <3,1,2,3>
+  2305157532U,	// <3,3,1,3>: Cost 3 vmrglw <3,u,3,1>, <3,3,3,3>
+  3830688000U,	// <3,3,1,4>: Cost 4 vsldoi12 LHS, <3,1,4,0>
+  3830688009U,	// <3,3,1,5>: Cost 4 vsldoi12 LHS, <3,1,5,0>
+  3830688019U,	// <3,3,1,6>: Cost 4 vsldoi12 LHS, <3,1,6,1>
+  3362973626U,	// <3,3,1,7>: Cost 4 vmrglw <1,2,3,1>, <2,6,3,7>
+  2752743719U,	// <3,3,1,u>: Cost 3 vsldoi12 LHS, <3,1,u,3>
+  2631417958U,	// <3,3,2,0>: Cost 3 vsldoi4 <2,3,3,2>, LHS
+  3826043193U,	// <3,3,2,1>: Cost 4 vsldoi12 LHS, <3,2,1,3>
+  1624131186U,	// <3,3,2,2>: Cost 2 vsldoi8 <2,2,3,3>, <2,2,3,3>
+  2752301384U,	// <3,3,2,3>: Cost 3 vsldoi12 LHS, <3,2,3,0>
+  2631421238U,	// <3,3,2,4>: Cost 3 vsldoi4 <2,3,3,2>, RHS
+  3826485602U,	// <3,3,2,5>: Cost 4 vsldoi12 LHS, <3,2,5,u>
+  2752301414U,	// <3,3,2,6>: Cost 3 vsldoi12 LHS, <3,2,6,3>
+  2771249519U,	// <3,3,2,7>: Cost 3 vsldoi12 <3,2,7,3>, <3,2,7,3>
+  1628112984U,	// <3,3,2,u>: Cost 2 vsldoi8 <2,u,3,3>, <2,u,3,3>
+  1563656294U,	// <3,3,3,0>: Cost 2 vsldoi4 <3,3,3,3>, LHS
+  2301855911U,	// <3,3,3,1>: Cost 3 vmrglw <3,3,3,3>, <3,0,3,1>
+  2697873730U,	// <3,3,3,2>: Cost 3 vsldoi8 <2,2,3,3>, <3,2,2,3>
+  403488870U,	// <3,3,3,3>: Cost 1 vspltisw3 LHS
+  1563659574U,	// <3,3,3,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS
+  2301856239U,	// <3,3,3,5>: Cost 3 vmrglw <3,3,3,3>, <3,4,3,5>
+  2697874067U,	// <3,3,3,6>: Cost 3 vsldoi8 <2,2,3,3>, <3,6,3,7>
+  2295220154U,	// <3,3,3,7>: Cost 3 vmrglw <2,2,3,3>, <2,6,3,7>
+  403488870U,	// <3,3,3,u>: Cost 1 vspltisw3 LHS
+  2289255318U,	// <3,3,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,3,0>
+  2631435162U,	// <3,3,4,1>: Cost 3 vsldoi4 <2,3,3,4>, <1,2,3,4>
+  2631435972U,	// <3,3,4,2>: Cost 3 vsldoi4 <2,3,3,4>, <2,3,3,4>
+  2289256050U,	// <3,3,4,3>: Cost 3 vmrglw <1,2,3,4>, <2,2,3,3>
+  1215513498U,	// <3,3,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4>
+  1679002114U,	// <3,3,4,5>: Cost 2 vsldoi12 LHS, <3,4,5,6>
+  3362998120U,	// <3,3,4,6>: Cost 4 vmrglw <1,2,3,4>, <2,5,3,6>
+  2289256378U,	// <3,3,4,7>: Cost 3 vmrglw <1,2,3,4>, <2,6,3,7>
+  1679002141U,	// <3,3,4,u>: Cost 2 vsldoi12 LHS, <3,4,u,6>
+  3831130657U,	// <3,3,5,0>: Cost 4 vsldoi12 LHS, <3,5,0,1>
+  3376277671U,	// <3,3,5,1>: Cost 4 vmrglw <3,4,3,5>, <3,0,3,1>
+  3771617012U,	// <3,3,5,2>: Cost 4 vsldoi8 <2,2,3,3>, <5,2,2,3>
+  2302536092U,	// <3,3,5,3>: Cost 3 vmrglw <3,4,3,5>, <3,3,3,3>
+  3831130697U,	// <3,3,5,4>: Cost 4 vsldoi12 LHS, <3,5,4,5>
+  2294572579U,	// <3,3,5,5>: Cost 3 vmrglw <2,1,3,5>, <2,1,3,5>
+  2800519773U,	// <3,3,5,6>: Cost 3 vsldoi12 LHS, <3,5,6,7>
+  3368314810U,	// <3,3,5,7>: Cost 4 vmrglw <2,1,3,5>, <2,6,3,7>
+  2800519791U,	// <3,3,5,u>: Cost 3 vsldoi12 LHS, <3,5,u,7>
+  2800077432U,	// <3,3,6,0>: Cost 3 vsldoi12 LHS, <3,6,0,7>
+  3310291185U,	// <3,3,6,1>: Cost 4 vmrghw <3,6,0,7>, <3,1,2,3>
+  2789165706U,	// <3,3,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <3,6,2,7>
+  2764982931U,	// <3,3,6,3>: Cost 3 vsldoi12 <2,2,3,3>, <3,6,3,7>
+  2800077468U,	// <3,3,6,4>: Cost 3 vsldoi12 LHS, <3,6,4,7>
+  3873819301U,	// <3,3,6,5>: Cost 4 vsldoi12 LHS, <3,6,5,7>
+  2297235304U,	// <3,3,6,6>: Cost 3 vmrglw <2,5,3,6>, <2,5,3,6>
+  2725081963U,	// <3,3,6,7>: Cost 3 vsldoi8 <6,7,3,3>, <6,7,3,3>
+  2725745596U,	// <3,3,6,u>: Cost 3 vsldoi8 <6,u,3,3>, <6,u,3,3>
+  2631458918U,	// <3,3,7,0>: Cost 3 vsldoi4 <2,3,3,7>, LHS
+  3705201460U,	// <3,3,7,1>: Cost 4 vsldoi4 <2,3,3,7>, <1,1,1,1>
+  2631460551U,	// <3,3,7,2>: Cost 3 vsldoi4 <2,3,3,7>, <2,3,3,7>
+  2297906802U,	// <3,3,7,3>: Cost 3 vmrglw <2,6,3,7>, <2,2,3,3>
+  2631462198U,	// <3,3,7,4>: Cost 3 vsldoi4 <2,3,3,7>, RHS
+  3371648547U,	// <3,3,7,5>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,5>
+  3371648548U,	// <3,3,7,6>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,6>
+  1224165306U,	// <3,3,7,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+  1224165306U,	// <3,3,7,u>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+  1215480726U,	// <3,3,u,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0>
+  1679002398U,	// <3,3,u,1>: Cost 2 vsldoi12 LHS, <3,u,1,2>
+  1659967368U,	// <3,3,u,2>: Cost 2 vsldoi8 <u,2,3,3>, <u,2,3,3>
+  403488870U,	// <3,3,u,3>: Cost 1 vspltisw3 LHS
+  1563659574U,	// <3,3,u,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS
+  1679002438U,	// <3,3,u,5>: Cost 2 vsldoi12 LHS, <3,u,5,6>
+  2756946764U,	// <3,3,u,6>: Cost 3 vsldoi12 LHS, <3,u,6,3>
+  1224165306U,	// <3,3,u,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+  403488870U,	// <3,3,u,u>: Cost 1 vspltisw3 LHS
+  2691907584U,	// <3,4,0,0>: Cost 3 vsldoi8 <1,2,3,4>, <0,0,0,0>
+  1618165862U,	// <3,4,0,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+  2631476937U,	// <3,4,0,2>: Cost 3 vsldoi4 <2,3,4,0>, <2,3,4,0>
+  2232601732U,	// <3,4,0,3>: Cost 3 vmrghw <3,0,1,2>, <4,3,5,0>
+  2691907922U,	// <3,4,0,4>: Cost 3 vsldoi8 <1,2,3,4>, <0,4,1,5>
+  1158860086U,	// <3,4,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS
+  3306343806U,	// <3,4,0,6>: Cost 4 vmrghw <3,0,1,2>, <4,6,5,7>
+  3366947484U,	// <3,4,0,7>: Cost 4 vmrglw <1,u,3,0>, <3,6,4,7>
+  1618166429U,	// <3,4,0,u>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+  2631483494U,	// <3,4,1,0>: Cost 3 vsldoi4 <2,3,4,1>, LHS
+  2691908404U,	// <3,4,1,1>: Cost 3 vsldoi8 <1,2,3,4>, <1,1,1,1>
+  1618166682U,	// <3,4,1,2>: Cost 2 vsldoi8 <1,2,3,4>, <1,2,3,4>
+  3765650393U,	// <3,4,1,3>: Cost 4 vsldoi8 <1,2,3,4>, <1,3,1,4>
+  2631486774U,	// <3,4,1,4>: Cost 3 vsldoi4 <2,3,4,1>, RHS
+  2756946914U,	// <3,4,1,5>: Cost 3 vsldoi12 LHS, <4,1,5,0>
+  3765650639U,	// <3,4,1,6>: Cost 4 vsldoi8 <1,2,3,4>, <1,6,1,7>
+  3735090439U,	// <3,4,1,7>: Cost 4 vsldoi4 <7,3,4,1>, <7,3,4,1>
+  1622148480U,	// <3,4,1,u>: Cost 2 vsldoi8 <1,u,3,4>, <1,u,3,4>
+  3765650893U,	// <3,4,2,0>: Cost 4 vsldoi8 <1,2,3,4>, <2,0,3,0>
+  3831131154U,	// <3,4,2,1>: Cost 4 vsldoi12 LHS, <4,2,1,3>
+  2691909224U,	// <3,4,2,2>: Cost 3 vsldoi8 <1,2,3,4>, <2,2,2,2>
+  2691909286U,	// <3,4,2,3>: Cost 3 vsldoi8 <1,2,3,4>, <2,3,0,1>
+  2699208469U,	// <3,4,2,4>: Cost 3 vsldoi8 <2,4,3,4>, <2,4,3,4>
+  2233863478U,	// <3,4,2,5>: Cost 3 vmrghw <3,2,0,3>, RHS
+  2691909562U,	// <3,4,2,6>: Cost 3 vsldoi8 <1,2,3,4>, <2,6,3,7>
+  2701199368U,	// <3,4,2,7>: Cost 3 vsldoi8 <2,7,3,4>, <2,7,3,4>
+  2691909691U,	// <3,4,2,u>: Cost 3 vsldoi8 <1,2,3,4>, <2,u,0,1>
+  2691909782U,	// <3,4,3,0>: Cost 3 vsldoi8 <1,2,3,4>, <3,0,1,2>
+  3765651686U,	// <3,4,3,1>: Cost 4 vsldoi8 <1,2,3,4>, <3,1,1,1>
+  2691909972U,	// <3,4,3,2>: Cost 3 vsldoi8 <1,2,3,4>, <3,2,4,3>
+  2691910044U,	// <3,4,3,3>: Cost 3 vsldoi8 <1,2,3,4>, <3,3,3,3>
+  2691910096U,	// <3,4,3,4>: Cost 3 vsldoi8 <1,2,3,4>, <3,4,0,1>
+  1161006390U,	// <3,4,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS
+  2691910300U,	// <3,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7>
+  3368962716U,	// <3,4,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,4,7>
+  1161006633U,	// <3,4,3,u>: Cost 2 vmrghw <3,3,3,3>, RHS
+  2631508070U,	// <3,4,4,0>: Cost 3 vsldoi4 <2,3,4,4>, LHS
+  2631508890U,	// <3,4,4,1>: Cost 3 vsldoi4 <2,3,4,4>, <1,2,3,4>
+  2631509709U,	// <3,4,4,2>: Cost 3 vsldoi4 <2,3,4,4>, <2,3,4,4>
+  2289256788U,	// <3,4,4,3>: Cost 3 vmrglw <1,2,3,4>, <3,2,4,3>
+  1726336208U,	// <3,4,4,4>: Cost 2 vsldoi12 LHS, <4,4,4,4>
+  1618169142U,	// <3,4,4,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+  3362998858U,	// <3,4,4,6>: Cost 4 vmrglw <1,2,3,4>, <3,5,4,6>
+  2289257116U,	// <3,4,4,7>: Cost 3 vmrglw <1,2,3,4>, <3,6,4,7>
+  1618169385U,	// <3,4,4,u>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+  1557774438U,	// <3,4,5,0>: Cost 2 vsldoi4 <2,3,4,5>, LHS
+  2631516980U,	// <3,4,5,1>: Cost 3 vsldoi4 <2,3,4,5>, <1,1,1,1>
+  1557776078U,	// <3,4,5,2>: Cost 2 vsldoi4 <2,3,4,5>, <2,3,4,5>
+  2631518358U,	// <3,4,5,3>: Cost 3 vsldoi4 <2,3,4,5>, <3,0,1,2>
+  1557777718U,	// <3,4,5,4>: Cost 2 vsldoi4 <2,3,4,5>, RHS
+  2296563406U,	// <3,4,5,5>: Cost 3 vmrglw <2,4,3,5>, <2,3,4,5>
+  604818742U,	// <3,4,5,6>: Cost 1 vsldoi12 LHS, RHS
+  2661381387U,	// <3,4,5,7>: Cost 3 vsldoi4 <7,3,4,5>, <7,3,4,5>
+  604818760U,	// <3,4,5,u>: Cost 1 vsldoi12 LHS, RHS
+  3705266278U,	// <3,4,6,0>: Cost 4 vsldoi4 <2,3,4,6>, LHS
+  3831131482U,	// <3,4,6,1>: Cost 4 vsldoi12 LHS, <4,6,1,7>
+  2733715962U,	// <3,4,6,2>: Cost 3 vsldoi8 <u,2,3,4>, <6,2,7,3>
+  3844771180U,	// <3,4,6,3>: Cost 4 vsldoi12 <3,2,4,3>, <4,6,3,7>
+  2800078197U,	// <3,4,6,4>: Cost 3 vsldoi12 LHS, <4,6,4,7>
+  2236550454U,	// <3,4,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS
+  2733716280U,	// <3,4,6,6>: Cost 3 vsldoi8 <u,2,3,4>, <6,6,6,6>
+  2725090156U,	// <3,4,6,7>: Cost 3 vsldoi8 <6,7,3,4>, <6,7,3,4>
+  2236550697U,	// <3,4,6,u>: Cost 3 vmrghw <3,6,0,7>, RHS
+  2733716474U,	// <3,4,7,0>: Cost 3 vsldoi8 <u,2,3,4>, <7,0,1,2>
+  3371647013U,	// <3,4,7,1>: Cost 4 vmrglw <2,6,3,7>, <0,0,4,1>
+  2727744688U,	// <3,4,7,2>: Cost 3 vsldoi8 <7,2,3,4>, <7,2,3,4>
+  3371649364U,	// <3,4,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,4,3>
+  2733716838U,	// <3,4,7,4>: Cost 3 vsldoi8 <u,2,3,4>, <7,4,5,6>
+  2297906894U,	// <3,4,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,5>
+  3371647180U,	// <3,4,7,6>: Cost 4 vmrglw <2,6,3,7>, <0,2,4,6>
+  2733717100U,	// <3,4,7,7>: Cost 3 vsldoi8 <u,2,3,4>, <7,7,7,7>
+  2297906897U,	// <3,4,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,u>
+  1557799014U,	// <3,4,u,0>: Cost 2 vsldoi4 <2,3,4,u>, LHS
+  1618171694U,	// <3,4,u,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+  1557800657U,	// <3,4,u,2>: Cost 2 vsldoi4 <2,3,4,u>, <2,3,4,u>
+  2691913660U,	// <3,4,u,3>: Cost 3 vsldoi8 <1,2,3,4>, <u,3,0,1>
+  1557802294U,	// <3,4,u,4>: Cost 2 vsldoi4 <2,3,4,u>, RHS
+  1618172058U,	// <3,4,u,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+  604818985U,	// <3,4,u,6>: Cost 1 vsldoi12 LHS, RHS
+  2661405966U,	// <3,4,u,7>: Cost 3 vsldoi4 <7,3,4,u>, <7,3,4,u>
+  604819003U,	// <3,4,u,u>: Cost 1 vsldoi12 LHS, RHS
+  2643492966U,	// <3,5,0,0>: Cost 3 vsldoi4 <4,3,5,0>, LHS
+  2756947528U,	// <3,5,0,1>: Cost 3 vsldoi12 LHS, <5,0,1,2>
+  2331029019U,	// <3,5,0,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2>
+  2643495062U,	// <3,5,0,3>: Cost 3 vsldoi4 <4,3,5,0>, <3,0,1,2>
+  2756947554U,	// <3,5,0,4>: Cost 3 vsldoi12 LHS, <5,0,4,1>
+  2800078443U,	// <3,5,0,5>: Cost 3 vsldoi12 LHS, <5,0,5,1>
+  2289224194U,	// <3,5,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,5,6>
+  3362964723U,	// <3,5,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,5,7>
+  2756947590U,	// <3,5,0,u>: Cost 3 vsldoi12 LHS, <5,0,u,1>
+  2800078479U,	// <3,5,1,0>: Cost 3 vsldoi12 LHS, <5,1,0,1>
+  2333027218U,	// <3,5,1,1>: Cost 3 vmrglw <u,5,3,1>, <4,0,5,1>
+  2691916699U,	// <3,5,1,2>: Cost 3 vsldoi8 <1,2,3,5>, <1,2,3,5>
+  3832901294U,	// <3,5,1,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,1,3,5>
+  2800078519U,	// <3,5,1,4>: Cost 3 vsldoi12 LHS, <5,1,4,5>
+  3830689467U,	// <3,5,1,5>: Cost 4 vsldoi12 LHS, <5,1,5,0>
+  3830689481U,	// <3,5,1,6>: Cost 4 vsldoi12 LHS, <5,1,6,5>
+  3873820365U,	// <3,5,1,7>: Cost 4 vsldoi12 LHS, <5,1,7,0>
+  2800078551U,	// <3,5,1,u>: Cost 3 vsldoi12 LHS, <5,1,u,1>
+  3770967487U,	// <3,5,2,0>: Cost 4 vsldoi8 <2,1,3,5>, <2,0,1,4>
+  2697225763U,	// <3,5,2,1>: Cost 3 vsldoi8 <2,1,3,5>, <2,1,3,5>
+  3830689523U,	// <3,5,2,2>: Cost 4 vsldoi12 LHS, <5,2,2,2>
+  2699216590U,	// <3,5,2,3>: Cost 3 vsldoi8 <2,4,3,5>, <2,3,4,5>
+  2699216662U,	// <3,5,2,4>: Cost 3 vsldoi8 <2,4,3,5>, <2,4,3,5>
+  2783047439U,	// <3,5,2,5>: Cost 3 vsldoi12 <5,2,5,3>, <5,2,5,3>
+  2783121176U,	// <3,5,2,6>: Cost 3 vsldoi12 <5,2,6,3>, <5,2,6,3>
+  3856936737U,	// <3,5,2,7>: Cost 4 vsldoi12 <5,2,7,3>, <5,2,7,3>
+  2701871194U,	// <3,5,2,u>: Cost 3 vsldoi8 <2,u,3,5>, <2,u,3,5>
+  2643517542U,	// <3,5,3,0>: Cost 3 vsldoi4 <4,3,5,3>, LHS
+  2331052946U,	// <3,5,3,1>: Cost 3 vmrglw <u,2,3,3>, <4,0,5,1>
+  3699345010U,	// <3,5,3,2>: Cost 4 vsldoi4 <1,3,5,3>, <2,2,3,3>
+  2705189276U,	// <3,5,3,3>: Cost 3 vsldoi8 <3,4,3,5>, <3,3,3,3>
+  2705189359U,	// <3,5,3,4>: Cost 3 vsldoi8 <3,4,3,5>, <3,4,3,5>
+  2331053274U,	// <3,5,3,5>: Cost 3 vmrglw <u,2,3,3>, <4,4,5,5>
+  2295220738U,	// <3,5,3,6>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,6>
+  3368961267U,	// <3,5,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,5,7>
+  2295220740U,	// <3,5,3,u>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,u>
+  2643525734U,	// <3,5,4,0>: Cost 3 vsldoi4 <4,3,5,4>, LHS
+  2331061138U,	// <3,5,4,1>: Cost 3 vmrglw <u,2,3,4>, <4,0,5,1>
+  2235584280U,	// <3,5,4,2>: Cost 3 vmrghw <3,4,5,6>, <5,2,6,3>
+  2643528194U,	// <3,5,4,3>: Cost 3 vsldoi4 <4,3,5,4>, <3,4,5,6>
+  2735713498U,	// <3,5,4,4>: Cost 3 vsldoi8 <u,5,3,5>, <4,4,5,5>
+  2756947892U,	// <3,5,4,5>: Cost 3 vsldoi12 LHS, <5,4,5,6>
+  2289256962U,	// <3,5,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,5,6>
+  3362997491U,	// <3,5,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,5,7>
+  2756947919U,	// <3,5,4,u>: Cost 3 vsldoi12 LHS, <5,4,u,6>
+  2800078803U,	// <3,5,5,0>: Cost 3 vsldoi12 LHS, <5,5,0,1>
+  2800078812U,	// <3,5,5,1>: Cost 3 vsldoi12 LHS, <5,5,1,1>
+  2631591639U,	// <3,5,5,2>: Cost 3 vsldoi4 <2,3,5,5>, <2,3,5,5>
+  3832901616U,	// <3,5,5,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,5,3,3>
+  2800078843U,	// <3,5,5,4>: Cost 3 vsldoi12 LHS, <5,5,4,5>
+  1726337028U,	// <3,5,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  2800078862U,	// <3,5,5,6>: Cost 3 vsldoi12 LHS, <5,5,6,6>
+  3368314099U,	// <3,5,5,7>: Cost 4 vmrglw <2,1,3,5>, <1,6,5,7>
+  1726337028U,	// <3,5,5,u>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  2800078884U,	// <3,5,6,0>: Cost 3 vsldoi12 LHS, <5,6,0,1>
+  2800078899U,	// <3,5,6,1>: Cost 3 vsldoi12 LHS, <5,6,1,7>
+  2631599832U,	// <3,5,6,2>: Cost 3 vsldoi4 <2,3,5,6>, <2,3,5,6>
+  2800078914U,	// <3,5,6,3>: Cost 3 vsldoi12 LHS, <5,6,3,4>
+  2800078924U,	// <3,5,6,4>: Cost 3 vsldoi12 LHS, <5,6,4,5>
+  2800078935U,	// <3,5,6,5>: Cost 3 vsldoi12 LHS, <5,6,5,7>
+  2297235970U,	// <3,5,6,6>: Cost 3 vmrglw <2,5,3,6>, <3,4,5,6>
+  1726337122U,	// <3,5,6,7>: Cost 2 vsldoi12 LHS, <5,6,7,0>
+  1726337131U,	// <3,5,6,u>: Cost 2 vsldoi12 LHS, <5,6,u,0>
+  3699376230U,	// <3,5,7,0>: Cost 4 vsldoi4 <1,3,5,7>, LHS
+  2333739922U,	// <3,5,7,1>: Cost 3 vmrglw <u,6,3,7>, <4,0,5,1>
+  3699378106U,	// <3,5,7,2>: Cost 4 vsldoi4 <1,3,5,7>, <2,6,3,7>
+  3371647915U,	// <3,5,7,3>: Cost 4 vmrglw <2,6,3,7>, <1,2,5,3>
+  3699379510U,	// <3,5,7,4>: Cost 4 vsldoi4 <1,3,5,7>, RHS
+  2333740250U,	// <3,5,7,5>: Cost 3 vmrglw <u,6,3,7>, <4,4,5,5>
+  2297907714U,	// <3,5,7,6>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,6>
+  3370984691U,	// <3,5,7,7>: Cost 4 vmrglw <2,5,3,7>, <1,6,5,7>
+  2297907716U,	// <3,5,7,u>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,u>
+  2800079046U,	// <3,5,u,0>: Cost 3 vsldoi12 LHS, <5,u,0,1>
+  2756948176U,	// <3,5,u,1>: Cost 3 vsldoi12 LHS, <5,u,1,2>
+  2331029019U,	// <3,5,u,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2>
+  2800079076U,	// <3,5,u,3>: Cost 3 vsldoi12 LHS, <5,u,3,4>
+  2800079085U,	// <3,5,u,4>: Cost 3 vsldoi12 LHS, <5,u,4,4>
+  1726337028U,	// <3,5,u,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  2289289730U,	// <3,5,u,6>: Cost 3 vmrglw <1,2,3,u>, <3,4,5,6>
+  1726337284U,	// <3,5,u,7>: Cost 2 vsldoi12 LHS, <5,u,7,0>
+  1726337293U,	// <3,5,u,u>: Cost 2 vsldoi12 LHS, <5,u,u,0>
+  3773628416U,	// <3,6,0,0>: Cost 4 vsldoi8 <2,5,3,6>, <0,0,0,0>
+  2699886694U,	// <3,6,0,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS
+  2789167401U,	// <3,6,0,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,0,2,1>
+  3362965862U,	// <3,6,0,3>: Cost 4 vmrglw <1,2,3,0>, <3,2,6,3>
+  3773628754U,	// <3,6,0,4>: Cost 4 vsldoi8 <2,5,3,6>, <0,4,1,5>
+  3723284326U,	// <3,6,0,5>: Cost 4 vsldoi4 <5,3,6,0>, <5,3,6,0>
+  2800079181U,	// <3,6,0,6>: Cost 3 vsldoi12 LHS, <6,0,6,1>
+  1215483190U,	// <3,6,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS
+  1215483191U,	// <3,6,0,u>: Cost 2 vmrglw <1,2,3,0>, RHS
+  3873821032U,	// <3,6,1,0>: Cost 4 vsldoi12 LHS, <6,1,0,1>
+  3773629236U,	// <3,6,1,1>: Cost 4 vsldoi8 <2,5,3,6>, <1,1,1,1>
+  2691924892U,	// <3,6,1,2>: Cost 3 vsldoi8 <1,2,3,6>, <1,2,3,6>
+  3830690184U,	// <3,6,1,3>: Cost 5 vsldoi12 LHS, <6,1,3,6>
+  3873821072U,	// <3,6,1,4>: Cost 4 vsldoi12 LHS, <6,1,4,5>
+  3873821082U,	// <3,6,1,5>: Cost 4 vsldoi12 LHS, <6,1,5,6>
+  3403453240U,	// <3,6,1,6>: Cost 4 vmrglw <u,0,3,1>, <6,6,6,6>
+  2289233206U,	// <3,6,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS
+  2289233207U,	// <3,6,1,u>: Cost 3 vmrglw <1,2,3,1>, RHS
+  2661498982U,	// <3,6,2,0>: Cost 3 vsldoi4 <7,3,6,2>, LHS
+  3770975780U,	// <3,6,2,1>: Cost 4 vsldoi8 <2,1,3,6>, <2,1,3,6>
+  2631640797U,	// <3,6,2,2>: Cost 3 vsldoi4 <2,3,6,2>, <2,3,6,2>
+  3771639485U,	// <3,6,2,3>: Cost 4 vsldoi8 <2,2,3,6>, <2,3,2,6>
+  2661502262U,	// <3,6,2,4>: Cost 3 vsldoi4 <7,3,6,2>, RHS
+  2699888488U,	// <3,6,2,5>: Cost 3 vsldoi8 <2,5,3,6>, <2,5,3,6>
+  2661503482U,	// <3,6,2,6>: Cost 3 vsldoi4 <7,3,6,2>, <6,2,7,3>
+  1715425786U,	// <3,6,2,7>: Cost 2 vsldoi12 <6,2,7,3>, <6,2,7,3>
+  1715499523U,	// <3,6,2,u>: Cost 2 vsldoi12 <6,2,u,3>, <6,2,u,3>
+  3773630614U,	// <3,6,3,0>: Cost 4 vsldoi8 <2,5,3,6>, <3,0,1,2>
+  3372942825U,	// <3,6,3,1>: Cost 4 vmrglw <2,u,3,3>, <2,0,6,1>
+  2234749434U,	// <3,6,3,2>: Cost 3 vmrghw <3,3,3,3>, <6,2,7,3>
+  3368962406U,	// <3,6,3,3>: Cost 4 vmrglw <2,2,3,3>, <3,2,6,3>
+  2699889154U,	// <3,6,3,4>: Cost 3 vsldoi8 <2,5,3,6>, <3,4,5,6>
+  3773631068U,	// <3,6,3,5>: Cost 4 vsldoi8 <2,5,3,6>, <3,5,6,6>
+  2331054904U,	// <3,6,3,6>: Cost 3 vmrglw <u,2,3,3>, <6,6,6,6>
+  1221479734U,	// <3,6,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS
+  1221479735U,	// <3,6,3,u>: Cost 2 vmrglw <2,2,3,3>, RHS
+  2235584801U,	// <3,6,4,0>: Cost 3 vmrghw <3,4,5,6>, <6,0,1,2>
+  3717342106U,	// <3,6,4,1>: Cost 4 vsldoi4 <4,3,6,4>, <1,2,3,4>
+  2789167729U,	// <3,6,4,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,4,2,5>
+  2235585074U,	// <3,6,4,3>: Cost 3 vmrghw <3,4,5,6>, <6,3,4,5>
+  2235585165U,	// <3,6,4,4>: Cost 3 vmrghw <3,4,5,6>, <6,4,5,6>
+  2699889974U,	// <3,6,4,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS
+  2800079509U,	// <3,6,4,6>: Cost 3 vsldoi12 LHS, <6,4,6,5>
+  1215515958U,	// <3,6,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS
+  1215515959U,	// <3,6,4,u>: Cost 2 vmrglw <1,2,3,4>, RHS
+  3873821356U,	// <3,6,5,0>: Cost 4 vsldoi12 LHS, <6,5,0,1>
+  3372959209U,	// <3,6,5,1>: Cost 5 vmrglw <2,u,3,5>, <2,0,6,1>
+  3862909629U,	// <3,6,5,2>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,2,0>
+  3773632358U,	// <3,6,5,3>: Cost 4 vsldoi8 <2,5,3,6>, <5,3,6,0>
+  3873821396U,	// <3,6,5,4>: Cost 4 vsldoi12 LHS, <6,5,4,5>
+  3873821405U,	// <3,6,5,5>: Cost 4 vsldoi12 LHS, <6,5,5,5>
+  3862909672U,	// <3,6,5,6>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,6,7>
+  2294574390U,	// <3,6,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS
+  2294574391U,	// <3,6,5,u>: Cost 3 vmrglw <2,1,3,5>, RHS
+  2800079613U,	// <3,6,6,0>: Cost 3 vsldoi12 LHS, <6,6,0,1>
+  3873821446U,	// <3,6,6,1>: Cost 4 vsldoi12 LHS, <6,6,1,1>
+  2789167888U,	// <3,6,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,6,2,2>
+  3844920090U,	// <3,6,6,3>: Cost 4 vsldoi12 <3,2,6,3>, <6,6,3,3>
+  2800079653U,	// <3,6,6,4>: Cost 3 vsldoi12 LHS, <6,6,4,5>
+  3723333484U,	// <3,6,6,5>: Cost 4 vsldoi4 <5,3,6,6>, <5,3,6,6>
+  1726337848U,	// <3,6,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+  1726337858U,	// <3,6,6,7>: Cost 2 vsldoi12 LHS, <6,6,7,7>
+  1726337867U,	// <3,6,6,u>: Cost 2 vsldoi12 LHS, <6,6,u,7>
+  1726337870U,	// <3,6,7,0>: Cost 2 vsldoi12 LHS, <6,7,0,1>
+  2297906665U,	// <3,6,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,0,6,1>
+  2792117090U,	// <3,6,7,2>: Cost 3 vsldoi12 <6,7,2,3>, <6,7,2,3>
+  2297907558U,	// <3,6,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,6,3>
+  1726337910U,	// <3,6,7,4>: Cost 2 vsldoi12 LHS, <6,7,4,5>
+  2297906993U,	// <3,6,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,4,6,5>
+  2297906832U,	// <3,6,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,2,6,6>
+  1224166710U,	// <3,6,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS
+  1224166711U,	// <3,6,7,u>: Cost 2 vmrglw <2,6,3,7>, RHS
+  1726337951U,	// <3,6,u,0>: Cost 2 vsldoi12 LHS, <6,u,0,1>
+  2699892526U,	// <3,6,u,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS
+  2789168049U,	// <3,6,u,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,u,2,1>
+  2792854460U,	// <3,6,u,3>: Cost 3 vsldoi12 <6,u,3,3>, <6,u,3,3>
+  1726337991U,	// <3,6,u,4>: Cost 2 vsldoi12 LHS, <6,u,4,5>
+  2699892890U,	// <3,6,u,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS
+  1726337848U,	// <3,6,u,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+  1215548726U,	// <3,6,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS
+  1215548727U,	// <3,6,u,u>: Cost 2 vmrglw <1,2,3,u>, RHS
+  2700558336U,	// <3,7,0,0>: Cost 3 vsldoi8 <2,6,3,7>, <0,0,0,0>
+  1626816614U,	// <3,7,0,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  2700558513U,	// <3,7,0,2>: Cost 3 vsldoi8 <2,6,3,7>, <0,2,1,6>
+  2331030010U,	// <3,7,0,3>: Cost 3 vmrglw <u,2,3,0>, <6,2,7,3>
+  2700558674U,	// <3,7,0,4>: Cost 3 vsldoi8 <2,6,3,7>, <0,4,1,5>
+  2800079906U,	// <3,7,0,5>: Cost 3 vsldoi12 LHS, <7,0,5,6>
+  2655588936U,	// <3,7,0,6>: Cost 3 vsldoi4 <6,3,7,0>, <6,3,7,0>
+  2800079919U,	// <3,7,0,7>: Cost 3 vsldoi12 LHS, <7,0,7,1>
+  1626817181U,	// <3,7,0,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  3774300899U,	// <3,7,1,0>: Cost 4 vsldoi8 <2,6,3,7>, <1,0,1,1>
+  2700559156U,	// <3,7,1,1>: Cost 3 vsldoi8 <2,6,3,7>, <1,1,1,1>
+  2700559254U,	// <3,7,1,2>: Cost 3 vsldoi8 <2,6,3,7>, <1,2,3,0>
+  3774301148U,	// <3,7,1,3>: Cost 4 vsldoi8 <2,6,3,7>, <1,3,1,7>
+  3774301227U,	// <3,7,1,4>: Cost 4 vsldoi8 <2,6,3,7>, <1,4,1,5>
+  3774301295U,	// <3,7,1,5>: Cost 4 vsldoi8 <2,6,3,7>, <1,5,0,1>
+  3768329441U,	// <3,7,1,6>: Cost 4 vsldoi8 <1,6,3,7>, <1,6,3,7>
+  3403453250U,	// <3,7,1,7>: Cost 4 vmrglw <u,0,3,1>, <6,6,7,7>
+  2700559740U,	// <3,7,1,u>: Cost 3 vsldoi8 <2,6,3,7>, <1,u,3,0>
+  2700559849U,	// <3,7,2,0>: Cost 3 vsldoi8 <2,6,3,7>, <2,0,6,1>
+  3770983973U,	// <3,7,2,1>: Cost 4 vsldoi8 <2,1,3,7>, <2,1,3,7>
+  2700559976U,	// <3,7,2,2>: Cost 3 vsldoi8 <2,6,3,7>, <2,2,2,2>
+  2698569415U,	// <3,7,2,3>: Cost 3 vsldoi8 <2,3,3,7>, <2,3,3,7>
+  2700560177U,	// <3,7,2,4>: Cost 3 vsldoi8 <2,6,3,7>, <2,4,6,5>
+  3773638505U,	// <3,7,2,5>: Cost 4 vsldoi8 <2,5,3,7>, <2,5,3,7>
+  1626818490U,	// <3,7,2,6>: Cost 2 vsldoi8 <2,6,3,7>, <2,6,3,7>
+  2795140307U,	// <3,7,2,7>: Cost 3 vsldoi12 <7,2,7,3>, <7,2,7,3>
+  1628145756U,	// <3,7,2,u>: Cost 2 vsldoi8 <2,u,3,7>, <2,u,3,7>
+  2700560534U,	// <3,7,3,0>: Cost 3 vsldoi8 <2,6,3,7>, <3,0,1,2>
+  3774302438U,	// <3,7,3,1>: Cost 4 vsldoi8 <2,6,3,7>, <3,1,1,1>
+  2700560742U,	// <3,7,3,2>: Cost 3 vsldoi8 <2,6,3,7>, <3,2,6,3>
+  2700560796U,	// <3,7,3,3>: Cost 3 vsldoi8 <2,6,3,7>, <3,3,3,3>
+  2700560898U,	// <3,7,3,4>: Cost 3 vsldoi8 <2,6,3,7>, <3,4,5,6>
+  3774302821U,	// <3,7,3,5>: Cost 4 vsldoi8 <2,6,3,7>, <3,5,7,6>
+  2700561079U,	// <3,7,3,6>: Cost 3 vsldoi8 <2,6,3,7>, <3,6,7,7>
+  2700561091U,	// <3,7,3,7>: Cost 3 vsldoi8 <2,6,3,7>, <3,7,0,1>
+  2700561182U,	// <3,7,3,u>: Cost 3 vsldoi8 <2,6,3,7>, <3,u,1,2>
+  2655617126U,	// <3,7,4,0>: Cost 3 vsldoi4 <6,3,7,4>, LHS
+  3774303178U,	// <3,7,4,1>: Cost 4 vsldoi8 <2,6,3,7>, <4,1,2,3>
+  2655619002U,	// <3,7,4,2>: Cost 3 vsldoi4 <6,3,7,4>, <2,6,3,7>
+  2331062778U,	// <3,7,4,3>: Cost 3 vmrglw <u,2,3,4>, <6,2,7,3>
+  2655620406U,	// <3,7,4,4>: Cost 3 vsldoi4 <6,3,7,4>, RHS
+  1626819894U,	// <3,7,4,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+  2655621708U,	// <3,7,4,6>: Cost 3 vsldoi4 <6,3,7,4>, <6,3,7,4>
+  2800080247U,	// <3,7,4,7>: Cost 3 vsldoi12 LHS, <7,4,7,5>
+  1626820137U,	// <3,7,4,u>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+  3774303816U,	// <3,7,5,0>: Cost 4 vsldoi8 <2,6,3,7>, <5,0,1,2>
+  3873822093U,	// <3,7,5,1>: Cost 4 vsldoi12 LHS, <7,5,1,0>
+  3774303998U,	// <3,7,5,2>: Cost 4 vsldoi8 <2,6,3,7>, <5,2,3,4>
+  3862910368U,	// <3,7,5,3>: Cost 4 vsldoi12 <6,2,7,3>, <7,5,3,1>
+  3774304180U,	// <3,7,5,4>: Cost 4 vsldoi8 <2,6,3,7>, <5,4,5,6>
+  2800080310U,	// <3,7,5,5>: Cost 3 vsldoi12 LHS, <7,5,5,5>
+  2800080321U,	// <3,7,5,6>: Cost 3 vsldoi12 LHS, <7,5,6,7>
+  3873822147U,	// <3,7,5,7>: Cost 4 vsldoi12 LHS, <7,5,7,0>
+  2800080339U,	// <3,7,5,u>: Cost 3 vsldoi12 LHS, <7,5,u,7>
+  2800080348U,	// <3,7,6,0>: Cost 3 vsldoi12 LHS, <7,6,0,7>
+  3873822181U,	// <3,7,6,1>: Cost 4 vsldoi12 LHS, <7,6,1,7>
+  2789168622U,	// <3,7,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <7,6,2,7>
+  2700563016U,	// <3,7,6,3>: Cost 3 vsldoi8 <2,6,3,7>, <6,3,7,0>
+  2800080384U,	// <3,7,6,4>: Cost 3 vsldoi12 LHS, <7,6,4,7>
+  3862910472U,	// <3,7,6,5>: Cost 4 vsldoi12 <6,2,7,3>, <7,6,5,6>
+  2700563256U,	// <3,7,6,6>: Cost 3 vsldoi8 <2,6,3,7>, <6,6,6,6>
+  2800080404U,	// <3,7,6,7>: Cost 3 vsldoi12 LHS, <7,6,7,0>
+  2793149988U,	// <3,7,6,u>: Cost 3 vsldoi12 <6,u,7,3>, <7,6,u,7>
+  2637725798U,	// <3,7,7,0>: Cost 3 vsldoi4 <3,3,7,7>, LHS
+  3371649227U,	// <3,7,7,1>: Cost 4 vmrglw <2,6,3,7>, <3,0,7,1>
+  2637727674U,	// <3,7,7,2>: Cost 3 vsldoi4 <3,3,7,7>, <2,6,3,7>
+  2297907567U,	// <3,7,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,7,3>
+  2637729078U,	// <3,7,7,4>: Cost 3 vsldoi4 <3,3,7,7>, RHS
+  3371649312U,	// <3,7,7,5>: Cost 4 vmrglw <2,6,3,7>, <3,1,7,5>
+  2655646287U,	// <3,7,7,6>: Cost 3 vsldoi4 <6,3,7,7>, <6,3,7,7>
+  1726338668U,	// <3,7,7,7>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+  1726338668U,	// <3,7,7,u>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+  2700564179U,	// <3,7,u,0>: Cost 3 vsldoi8 <2,6,3,7>, <u,0,1,2>
+  1626822446U,	// <3,7,u,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  2700564357U,	// <3,7,u,2>: Cost 3 vsldoi8 <2,6,3,7>, <u,2,3,0>
+  2700564412U,	// <3,7,u,3>: Cost 3 vsldoi8 <2,6,3,7>, <u,3,0,1>
+  2700564543U,	// <3,7,u,4>: Cost 3 vsldoi8 <2,6,3,7>, <u,4,5,6>
+  1626822810U,	// <3,7,u,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+  1662654672U,	// <3,7,u,6>: Cost 2 vsldoi8 <u,6,3,7>, <u,6,3,7>
+  1726338668U,	// <3,7,u,7>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+  1626823013U,	// <3,7,u,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  1678557184U,	// <3,u,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0>
+  1679005395U,	// <3,u,0,1>: Cost 2 vsldoi12 LHS, <u,0,1,2>
+  2289221787U,	// <3,u,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,u,2>
+  1215479964U,	// <3,u,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS
+  2752747245U,	// <3,u,0,4>: Cost 3 vsldoi12 LHS, <u,0,4,1>
+  1158863002U,	// <3,u,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS
+  2289224221U,	// <3,u,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,u,6>
+  1215483208U,	// <3,u,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS
+  1679005458U,	// <3,u,0,u>: Cost 2 vsldoi12 LHS, <u,0,u,2>
+  1558036582U,	// <3,u,1,0>: Cost 2 vsldoi4 <2,3,u,1>, LHS
+  1678558004U,	// <3,u,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  604821294U,	// <3,u,1,2>: Cost 1 vsldoi12 LHS, LHS
+  2752747317U,	// <3,u,1,3>: Cost 3 vsldoi12 LHS, <u,1,3,1>
+  1558039862U,	// <3,u,1,4>: Cost 2 vsldoi4 <2,3,u,1>, RHS
+  2756949830U,	// <3,u,1,5>: Cost 3 vsldoi12 LHS, <u,1,5,0>
+  2800080726U,	// <3,u,1,6>: Cost 3 vsldoi12 LHS, <u,1,6,7>
+  2289233224U,	// <3,u,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS
+  604821348U,	// <3,u,1,u>: Cost 1 vsldoi12 LHS, LHS
+  2696586709U,	// <3,u,2,0>: Cost 3 vsldoi8 <2,0,3,u>, <2,0,3,u>
+  2757392246U,	// <3,u,2,1>: Cost 3 vsldoi12 LHS, <u,2,1,3>
+  1624172151U,	// <3,u,2,2>: Cost 2 vsldoi8 <2,2,3,u>, <2,2,3,u>
+  1679005576U,	// <3,u,2,3>: Cost 2 vsldoi12 LHS, <u,2,3,3>
+  2631789878U,	// <3,u,2,4>: Cost 3 vsldoi4 <2,3,u,2>, RHS
+  2699904874U,	// <3,u,2,5>: Cost 3 vsldoi8 <2,5,3,u>, <2,5,3,u>
+  1626826683U,	// <3,u,2,6>: Cost 2 vsldoi8 <2,6,3,u>, <2,6,3,u>
+  1726338988U,	// <3,u,2,7>: Cost 2 vsldoi12 LHS, <u,2,7,3>
+  1683208117U,	// <3,u,2,u>: Cost 2 vsldoi12 LHS, <u,2,u,3>
+  1679005628U,	// <3,u,3,0>: Cost 2 vsldoi12 LHS, <u,3,0,1>
+  1161008942U,	// <3,u,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS
+  2752747471U,	// <3,u,3,2>: Cost 3 vsldoi12 LHS, <u,3,2,2>
+  403488870U,	// <3,u,3,3>: Cost 1 vspltisw3 LHS
+  1679005668U,	// <3,u,3,4>: Cost 2 vsldoi12 LHS, <u,3,4,5>
+  1161009306U,	// <3,u,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS
+  2691943104U,	// <3,u,3,6>: Cost 3 vsldoi8 <1,2,3,u>, <3,6,u,7>
+  1221479752U,	// <3,u,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS
+  403488870U,	// <3,u,3,u>: Cost 1 vspltisw3 LHS
+  2289255363U,	// <3,u,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,u,0>
+  1161844526U,	// <3,u,4,1>: Cost 2 vmrghw <3,4,5,6>, LHS
+  2289256661U,	// <3,u,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,u,2>
+  1215512732U,	// <3,u,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS
+  1215513498U,	// <3,u,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4>
+  1679005759U,	// <3,u,4,5>: Cost 2 vsldoi12 LHS, <u,4,5,6>
+  2289256989U,	// <3,u,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,u,6>
+  1215515976U,	// <3,u,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS
+  1679005786U,	// <3,u,4,u>: Cost 2 vsldoi12 LHS, <u,4,u,6>
+  1558069350U,	// <3,u,5,0>: Cost 2 vsldoi4 <2,3,u,5>, LHS
+  2631811892U,	// <3,u,5,1>: Cost 3 vsldoi4 <2,3,u,5>, <1,1,1,1>
+  1558071026U,	// <3,u,5,2>: Cost 2 vsldoi4 <2,3,u,5>, <2,3,u,5>
+  2752747646U,	// <3,u,5,3>: Cost 3 vsldoi12 LHS, <u,5,3,6>
+  1558072630U,	// <3,u,5,4>: Cost 2 vsldoi4 <2,3,u,5>, RHS
+  1726337028U,	// <3,u,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  604821658U,	// <3,u,5,6>: Cost 1 vsldoi12 LHS, RHS
+  2294574408U,	// <3,u,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS
+  604821676U,	// <3,u,5,u>: Cost 1 vsldoi12 LHS, RHS
+  2631819366U,	// <3,u,6,0>: Cost 3 vsldoi4 <2,3,u,6>, LHS
+  2757392574U,	// <3,u,6,1>: Cost 3 vsldoi12 LHS, <u,6,1,7>
+  2631821043U,	// <3,u,6,2>: Cost 3 vsldoi4 <2,3,u,6>, <2,3,u,6>
+  1679005904U,	// <3,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7>
+  2631822646U,	// <3,u,6,4>: Cost 3 vsldoi4 <2,3,u,6>, RHS
+  2236553370U,	// <3,u,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS
+  1726337848U,	// <3,u,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+  1726339309U,	// <3,u,6,7>: Cost 2 vsldoi12 LHS, <u,6,7,0>
+  1683208445U,	// <3,u,6,u>: Cost 2 vsldoi12 LHS, <u,6,u,7>
+  1726339328U,	// <3,u,7,0>: Cost 2 vsldoi12 LHS, <u,7,0,1>
+  2297905225U,	// <3,u,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,u,1>
+  2631829236U,	// <3,u,7,2>: Cost 3 vsldoi4 <2,3,u,7>, <2,3,u,7>
+  1224163484U,	// <3,u,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS
+  1726339368U,	// <3,u,7,4>: Cost 2 vsldoi12 LHS, <u,7,4,5>
+  2297905553U,	// <3,u,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,u,5>
+  2297905392U,	// <3,u,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,u,6>
+  1224166728U,	// <3,u,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS
+  1224163489U,	// <3,u,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS
+  1683208529U,	// <3,u,u,0>: Cost 2 vsldoi12 LHS, <u,u,0,1>
+  1679006043U,	// <3,u,u,1>: Cost 2 vsldoi12 LHS, <u,u,1,2>
+  604821861U,	// <3,u,u,2>: Cost 1 vsldoi12 LHS, LHS
+  403488870U,	// <3,u,u,3>: Cost 1 vspltisw3 LHS
+  1683208569U,	// <3,u,u,4>: Cost 2 vsldoi12 LHS, <u,u,4,5>
+  1679006083U,	// <3,u,u,5>: Cost 2 vsldoi12 LHS, <u,u,5,6>
+  604821901U,	// <3,u,u,6>: Cost 1 vsldoi12 LHS, RHS
+  1215548744U,	// <3,u,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS
+  604821915U,	// <3,u,u,u>: Cost 1 vsldoi12 LHS, LHS
+  2759016448U,	// <4,0,0,0>: Cost 3 vsldoi12 <1,2,3,4>, <0,0,0,0>
+  1165115494U,	// <4,0,0,1>: Cost 2 vmrghw <4,0,5,1>, LHS
+  3717531337U,	// <4,0,0,2>: Cost 4 vsldoi4 <4,4,0,0>, <2,3,4,0>
+  3369675785U,	// <4,0,0,3>: Cost 4 vmrglw <2,3,4,0>, <4,2,0,3>
+  2751791144U,	// <4,0,0,4>: Cost 3 vsldoi12 <0,0,4,4>, <0,0,4,4>
+  2238857630U,	// <4,0,0,5>: Cost 3 vmrghw <4,0,5,1>, <0,5,1,0>
+  3312591341U,	// <4,0,0,6>: Cost 4 vmrghw <4,0,5,0>, <0,6,0,7>
+  3369676113U,	// <4,0,0,7>: Cost 4 vmrglw <2,3,4,0>, <4,6,0,7>
+  1165116061U,	// <4,0,0,u>: Cost 2 vmrghw <4,0,5,1>, LHS
+  2637824102U,	// <4,0,1,0>: Cost 3 vsldoi4 <3,4,0,1>, LHS
+  2637824922U,	// <4,0,1,1>: Cost 3 vsldoi4 <3,4,0,1>, <1,2,3,4>
+  1685274726U,	// <4,0,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2637826512U,	// <4,0,1,3>: Cost 3 vsldoi4 <3,4,0,1>, <3,4,0,1>
+  2637827382U,	// <4,0,1,4>: Cost 3 vsldoi4 <3,4,0,1>, RHS
+  2661716070U,	// <4,0,1,5>: Cost 3 vsldoi4 <7,4,0,1>, <5,6,7,4>
+  3729486427U,	// <4,0,1,6>: Cost 4 vsldoi4 <6,4,0,1>, <6,4,0,1>
+  2661717300U,	// <4,0,1,7>: Cost 3 vsldoi4 <7,4,0,1>, <7,4,0,1>
+  1685274780U,	// <4,0,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  3711574118U,	// <4,0,2,0>: Cost 4 vsldoi4 <3,4,0,2>, LHS
+  2240200806U,	// <4,0,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS
+  3771663992U,	// <4,0,2,2>: Cost 4 vsldoi8 <2,2,4,0>, <2,2,4,0>
+  2698585801U,	// <4,0,2,3>: Cost 3 vsldoi8 <2,3,4,0>, <2,3,4,0>
+  3373672105U,	// <4,0,2,4>: Cost 4 vmrglw <3,0,4,2>, <2,3,0,4>
+  3810813795U,	// <4,0,2,5>: Cost 4 vsldoi8 <u,7,4,0>, <2,5,3,1>
+  3772327866U,	// <4,0,2,6>: Cost 4 vsldoi8 <2,3,4,0>, <2,6,3,7>
+  3386280568U,	// <4,0,2,7>: Cost 5 vmrglw <5,1,4,2>, <3,6,0,7>
+  2701903966U,	// <4,0,2,u>: Cost 3 vsldoi8 <2,u,4,0>, <2,u,4,0>
+  3699638374U,	// <4,0,3,0>: Cost 4 vsldoi4 <1,4,0,3>, LHS
+  2753560832U,	// <4,0,3,1>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4>
+  3772328276U,	// <4,0,3,2>: Cost 4 vsldoi8 <2,3,4,0>, <3,2,4,3>
+  3827302674U,	// <4,0,3,3>: Cost 4 vsldoi12 <0,3,1,4>, <0,3,3,4>
+  3699641654U,	// <4,0,3,4>: Cost 4 vsldoi4 <1,4,0,3>, RHS
+  3779627588U,	// <4,0,3,5>: Cost 4 vsldoi8 <3,5,4,0>, <3,5,4,0>
+  3772328604U,	// <4,0,3,6>: Cost 4 vsldoi8 <2,3,4,0>, <3,6,4,7>
+  3780954854U,	// <4,0,3,7>: Cost 4 vsldoi8 <3,7,4,0>, <3,7,4,0>
+  2753560832U,	// <4,0,3,u>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4>
+  2725129106U,	// <4,0,4,0>: Cost 3 vsldoi8 <6,7,4,0>, <4,0,5,1>
+  1167720550U,	// <4,0,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS
+  3839172953U,	// <4,0,4,2>: Cost 4 vsldoi12 <2,3,0,4>, <0,4,2,3>
+  3772329051U,	// <4,0,4,3>: Cost 4 vsldoi8 <2,3,4,0>, <4,3,0,4>
+  2241462610U,	// <4,0,4,4>: Cost 3 vmrghw <4,4,4,4>, <0,4,1,5>
+  2698587446U,	// <4,0,4,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS
+  3772329297U,	// <4,0,4,6>: Cost 4 vsldoi8 <2,3,4,0>, <4,6,0,7>
+  3735483703U,	// <4,0,4,7>: Cost 4 vsldoi4 <7,4,0,4>, <7,4,0,4>
+  1167721117U,	// <4,0,4,u>: Cost 2 vmrghw <4,4,4,4>, LHS
+  1168556032U,	// <4,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+  94814310U,	// <4,0,5,1>: Cost 1 vmrghw RHS, LHS
+  2242298029U,	// <4,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2>
+  2637859284U,	// <4,0,5,3>: Cost 3 vsldoi4 <3,4,0,5>, <3,4,0,5>
+  1168556370U,	// <4,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+  2242306530U,	// <4,0,5,5>: Cost 3 vmrghw RHS, <0,5,u,5>
+  2242298358U,	// <4,0,5,6>: Cost 3 vmrghw RHS, <0,6,1,7>
+  2661750072U,	// <4,0,5,7>: Cost 3 vsldoi4 <7,4,0,5>, <7,4,0,5>
+  94814877U,	// <4,0,5,u>: Cost 1 vmrghw RHS, LHS
+  3316580362U,	// <4,0,6,0>: Cost 4 vmrghw <4,6,5,1>, <0,0,1,1>
+  2242846822U,	// <4,0,6,1>: Cost 3 vmrghw <4,6,5,2>, LHS
+  3798872570U,	// <4,0,6,2>: Cost 4 vsldoi8 <6,7,4,0>, <6,2,7,3>
+  3796218413U,	// <4,0,6,3>: Cost 4 vsldoi8 <6,3,4,0>, <6,3,4,0>
+  3834528273U,	// <4,0,6,4>: Cost 4 vsldoi12 <1,5,0,4>, <0,6,4,7>
+  3798872811U,	// <4,0,6,5>: Cost 4 vsldoi8 <6,7,4,0>, <6,5,7,1>
+  3316621876U,	// <4,0,6,6>: Cost 4 vmrghw <4,6,5,6>, <0,6,u,6>
+  2725131121U,	// <4,0,6,7>: Cost 3 vsldoi8 <6,7,4,0>, <6,7,4,0>
+  2242847389U,	// <4,0,6,u>: Cost 3 vmrghw <4,6,5,2>, LHS
+  3377692672U,	// <4,0,7,0>: Cost 4 vmrglw <3,6,4,7>, <0,0,0,0>
+  2243493990U,	// <4,0,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS
+  3775648970U,	// <4,0,7,2>: Cost 5 vsldoi8 <2,u,4,0>, <7,2,6,3>
+  3802191110U,	// <4,0,7,3>: Cost 4 vsldoi8 <7,3,4,0>, <7,3,4,0>
+  3317236050U,	// <4,0,7,4>: Cost 4 vmrghw <4,7,5,0>, <0,4,1,5>
+  3803518376U,	// <4,0,7,5>: Cost 4 vsldoi8 <7,5,4,0>, <7,5,4,0>
+  3317236214U,	// <4,0,7,6>: Cost 5 vmrghw <4,7,5,0>, <0,6,1,7>
+  3798873708U,	// <4,0,7,7>: Cost 4 vsldoi8 <6,7,4,0>, <7,7,7,7>
+  2243494557U,	// <4,0,7,u>: Cost 3 vmrghw <4,7,5,0>, LHS
+  1170546688U,	// <4,0,u,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+  96804966U,	// <4,0,u,1>: Cost 1 vmrghw RHS, LHS
+  1685275293U,	// <4,0,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2637883863U,	// <4,0,u,3>: Cost 3 vsldoi4 <3,4,0,u>, <3,4,0,u>
+  1170547026U,	// <4,0,u,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+  2698590362U,	// <4,0,u,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS
+  2244289014U,	// <4,0,u,6>: Cost 3 vmrghw RHS, <0,6,1,7>
+  2661774651U,	// <4,0,u,7>: Cost 3 vsldoi4 <7,4,0,u>, <7,4,0,u>
+  96805533U,	// <4,0,u,u>: Cost 1 vmrghw RHS, LHS
+  2667749478U,	// <4,1,0,0>: Cost 3 vsldoi4 <u,4,1,0>, LHS
+  2689966182U,	// <4,1,0,1>: Cost 3 vsldoi8 <0,u,4,1>, LHS
+  2238571418U,	// <4,1,0,2>: Cost 3 vmrghw <4,0,1,2>, <1,2,3,4>
+  3711633880U,	// <4,1,0,3>: Cost 4 vsldoi4 <3,4,1,0>, <3,4,1,0>
+  2689966418U,	// <4,1,0,4>: Cost 3 vsldoi8 <0,u,4,1>, <0,4,1,5>
+  3361046866U,	// <4,1,0,5>: Cost 4 vmrglw <0,u,4,0>, <0,4,1,5>
+  3741495802U,	// <4,1,0,6>: Cost 4 vsldoi4 <u,4,1,0>, <6,2,7,3>
+  3741496314U,	// <4,1,0,7>: Cost 4 vsldoi4 <u,4,1,0>, <7,0,1,2>
+  2689966765U,	// <4,1,0,u>: Cost 3 vsldoi8 <0,u,4,1>, <0,u,4,1>
+  3764372222U,	// <4,1,1,0>: Cost 4 vsldoi8 <1,0,4,1>, <1,0,4,1>
+  2758206263U,	// <4,1,1,1>: Cost 3 vsldoi12 <1,1,1,4>, <1,1,1,4>
+  2698593178U,	// <4,1,1,2>: Cost 3 vsldoi8 <2,3,4,1>, <1,2,3,4>
+  3361057810U,	// <4,1,1,3>: Cost 4 vmrglw <0,u,4,1>, <4,2,1,3>
+  3827303250U,	// <4,1,1,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,1,4,4>
+  2287313234U,	// <4,1,1,5>: Cost 3 vmrglw <0,u,4,1>, <0,4,1,5>
+  3763709171U,	// <4,1,1,6>: Cost 4 vsldoi8 <0,u,4,1>, <1,6,5,7>
+  3361058138U,	// <4,1,1,7>: Cost 4 vmrglw <0,u,4,1>, <4,6,1,7>
+  2239759744U,	// <4,1,1,u>: Cost 3 vmrghw <4,1,u,3>, <1,u,3,4>
+  2637906022U,	// <4,1,2,0>: Cost 3 vsldoi4 <3,4,1,2>, LHS
+  2637906842U,	// <4,1,2,1>: Cost 3 vsldoi4 <3,4,1,2>, <1,2,3,4>
+  3763709544U,	// <4,1,2,2>: Cost 4 vsldoi8 <0,u,4,1>, <2,2,2,2>
+  1685275546U,	// <4,1,2,3>: Cost 2 vsldoi12 <1,2,3,4>, <1,2,3,4>
+  2637909302U,	// <4,1,2,4>: Cost 3 vsldoi4 <3,4,1,2>, RHS
+  3361063250U,	// <4,1,2,5>: Cost 4 vmrglw <0,u,4,2>, <0,4,1,5>
+  3763709882U,	// <4,1,2,6>: Cost 4 vsldoi8 <0,u,4,1>, <2,6,3,7>
+  3735541054U,	// <4,1,2,7>: Cost 4 vsldoi4 <7,4,1,2>, <7,4,1,2>
+  1685644231U,	// <4,1,2,u>: Cost 2 vsldoi12 <1,2,u,4>, <1,2,u,4>
+  2702575792U,	// <4,1,3,0>: Cost 3 vsldoi8 <3,0,4,1>, <3,0,4,1>
+  3832759257U,	// <4,1,3,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,3,1,4>
+  3833349090U,	// <4,1,3,2>: Cost 4 vsldoi12 <1,3,2,4>, <1,3,2,4>
+  3763710364U,	// <4,1,3,3>: Cost 4 vsldoi8 <0,u,4,1>, <3,3,3,3>
+  2707884546U,	// <4,1,3,4>: Cost 3 vsldoi8 <3,u,4,1>, <3,4,5,6>
+  3361071442U,	// <4,1,3,5>: Cost 4 vmrglw <0,u,4,3>, <0,4,1,5>
+  3772336796U,	// <4,1,3,6>: Cost 4 vsldoi8 <2,3,4,1>, <3,6,4,7>
+  3775654595U,	// <4,1,3,7>: Cost 5 vsldoi8 <2,u,4,1>, <3,7,0,1>
+  2707884856U,	// <4,1,3,u>: Cost 3 vsldoi8 <3,u,4,1>, <3,u,4,1>
+  2667782246U,	// <4,1,4,0>: Cost 3 vsldoi4 <u,4,1,4>, LHS
+  2241463092U,	// <4,1,4,1>: Cost 3 vmrghw <4,4,4,4>, <1,1,1,1>
+  2241553306U,	// <4,1,4,2>: Cost 3 vmrghw <4,4,5,6>, <1,2,3,4>
+  3827303484U,	// <4,1,4,3>: Cost 4 vsldoi12 <0,3,1,4>, <1,4,3,4>
+  2667785424U,	// <4,1,4,4>: Cost 3 vsldoi4 <u,4,1,4>, <4,4,4,4>
+  2689969462U,	// <4,1,4,5>: Cost 3 vsldoi8 <0,u,4,1>, RHS
+  3763711322U,	// <4,1,4,6>: Cost 4 vsldoi8 <0,u,4,1>, <4,6,1,7>
+  3867116636U,	// <4,1,4,7>: Cost 4 vsldoi12 <7,0,1,4>, <1,4,7,0>
+  2689969705U,	// <4,1,4,u>: Cost 3 vsldoi8 <0,u,4,1>, RHS
+  1546273106U,	// <4,1,5,0>: Cost 2 vsldoi4 <0,4,1,5>, <0,4,1,5>
+  1168556852U,	// <4,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+  1168556950U,	// <4,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+  2620016790U,	// <4,1,5,3>: Cost 3 vsldoi4 <0,4,1,5>, <3,0,1,2>
+  1546276150U,	// <4,1,5,4>: Cost 2 vsldoi4 <0,4,1,5>, RHS
+  2620018692U,	// <4,1,5,5>: Cost 3 vsldoi4 <0,4,1,5>, <5,5,5,5>
+  2242299087U,	// <4,1,5,6>: Cost 3 vmrghw RHS, <1,6,1,7>
+  2667795450U,	// <4,1,5,7>: Cost 3 vsldoi4 <u,4,1,5>, <7,0,1,2>
+  1546278702U,	// <4,1,5,u>: Cost 2 vsldoi4 <0,4,1,5>, LHS
+  3781628193U,	// <4,1,6,0>: Cost 4 vsldoi8 <3,u,4,1>, <6,0,1,2>
+  3832759503U,	// <4,1,6,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,1,7>
+  3316261786U,	// <4,1,6,2>: Cost 4 vmrghw <4,6,0,7>, <1,2,3,4>
+  3781628466U,	// <4,1,6,3>: Cost 4 vsldoi8 <3,u,4,1>, <6,3,4,5>
+  3827303658U,	// <4,1,6,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,6,4,7>
+  3361096018U,	// <4,1,6,5>: Cost 4 vmrglw <0,u,4,6>, <0,4,1,5>
+  3788264248U,	// <4,1,6,6>: Cost 4 vsldoi8 <5,0,4,1>, <6,6,6,6>
+  3788264270U,	// <4,1,6,7>: Cost 4 vsldoi8 <5,0,4,1>, <6,7,0,1>
+  3832759566U,	// <4,1,6,u>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,u,7>
+  2726466580U,	// <4,1,7,0>: Cost 3 vsldoi8 <7,0,4,1>, <7,0,4,1>
+  3377692682U,	// <4,1,7,1>: Cost 4 vmrglw <3,6,4,7>, <0,0,1,1>
+  3377694870U,	// <4,1,7,2>: Cost 4 vmrglw <3,6,4,7>, <3,0,1,2>
+  3802199303U,	// <4,1,7,3>: Cost 4 vsldoi8 <7,3,4,1>, <7,3,4,1>
+  2731775334U,	// <4,1,7,4>: Cost 3 vsldoi8 <7,u,4,1>, <7,4,5,6>
+  3377693010U,	// <4,1,7,5>: Cost 4 vmrglw <3,6,4,7>, <0,4,1,5>
+  3365749804U,	// <4,1,7,6>: Cost 5 vmrglw <1,6,4,7>, <1,4,1,6>
+  3788265068U,	// <4,1,7,7>: Cost 4 vsldoi8 <5,0,4,1>, <7,7,7,7>
+  2731775644U,	// <4,1,7,u>: Cost 3 vsldoi8 <7,u,4,1>, <7,u,4,1>
+  1546297685U,	// <4,1,u,0>: Cost 2 vsldoi4 <0,4,1,u>, <0,4,1,u>
+  1170547508U,	// <4,1,u,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+  1170547606U,	// <4,1,u,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+  1689257344U,	// <4,1,u,3>: Cost 2 vsldoi12 <1,u,3,4>, <1,u,3,4>
+  1546300726U,	// <4,1,u,4>: Cost 2 vsldoi4 <0,4,1,u>, RHS
+  2284716370U,	// <4,1,u,5>: Cost 3 vmrglw <0,4,4,u>, <0,4,1,5>
+  2244289743U,	// <4,1,u,6>: Cost 3 vmrghw RHS, <1,6,1,7>
+  2667820026U,	// <4,1,u,7>: Cost 3 vsldoi4 <u,4,1,u>, <7,0,1,2>
+  1546303278U,	// <4,1,u,u>: Cost 2 vsldoi4 <0,4,1,u>, LHS
+  3729621094U,	// <4,2,0,0>: Cost 4 vsldoi4 <6,4,2,0>, LHS
+  3763716198U,	// <4,2,0,1>: Cost 4 vsldoi8 <0,u,4,2>, LHS
+  2238858856U,	// <4,2,0,2>: Cost 3 vmrghw <4,0,5,1>, <2,2,2,2>
+  2295930982U,	// <4,2,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS
+  3763716434U,	// <4,2,0,4>: Cost 4 vsldoi8 <0,u,4,2>, <0,4,1,5>
+  2238859107U,	// <4,2,0,5>: Cost 3 vmrghw <4,0,5,1>, <2,5,3,1>
+  2238859194U,	// <4,2,0,6>: Cost 3 vmrghw <4,0,5,1>, <2,6,3,7>
+  3312601066U,	// <4,2,0,7>: Cost 4 vmrghw <4,0,5,1>, <2,7,0,1>
+  2295930987U,	// <4,2,0,u>: Cost 3 vmrglw <2,3,4,0>, LHS
+  3699769446U,	// <4,2,1,0>: Cost 4 vsldoi4 <1,4,2,1>, LHS
+  3313255971U,	// <4,2,1,1>: Cost 4 vmrghw <4,1,5,0>, <2,1,3,5>
+  3361056360U,	// <4,2,1,2>: Cost 4 vmrglw <0,u,4,1>, <2,2,2,2>
+  2287312998U,	// <4,2,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS
+  3788932148U,	// <4,2,1,4>: Cost 4 vsldoi8 <5,1,4,2>, <1,4,2,5>
+  3313256290U,	// <4,2,1,5>: Cost 4 vmrghw <4,1,5,0>, <2,5,3,0>
+  3838289469U,	// <4,2,1,6>: Cost 4 vsldoi12 <2,1,6,4>, <2,1,6,4>
+  3369682865U,	// <4,2,1,7>: Cost 5 vmrglw <2,3,4,1>, <2,6,2,7>
+  2287313003U,	// <4,2,1,u>: Cost 3 vmrglw <0,u,4,1>, LHS
+  3838658133U,	// <4,2,2,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,0,1>
+  3711722394U,	// <4,2,2,1>: Cost 4 vsldoi4 <3,4,2,2>, <1,2,3,4>
+  2759018088U,	// <4,2,2,2>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,2,2>
+  2759018098U,	// <4,2,2,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,3,3>
+  3838658168U,	// <4,2,2,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,4,0>
+  3369027341U,	// <4,2,2,5>: Cost 4 vmrglw <2,2,4,2>, <2,4,2,5>
+  2240227258U,	// <4,2,2,6>: Cost 3 vmrghw <4,2,5,6>, <2,6,3,7>
+  3735614791U,	// <4,2,2,7>: Cost 4 vsldoi4 <7,4,2,2>, <7,4,2,2>
+  2759018143U,	// <4,2,2,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,u,3>
+  2759018150U,	// <4,2,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,0,1>
+  3831948975U,	// <4,2,3,1>: Cost 4 vsldoi12 <1,1,1,4>, <2,3,1,1>
+  3832759993U,	// <4,2,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <2,3,2,2>
+  2759018180U,	// <4,2,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,3,4>
+  2759018185U,	// <4,2,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,4,0>
+  3839542998U,	// <4,2,3,5>: Cost 4 vsldoi12 <2,3,5,4>, <2,3,5,4>
+  3314640826U,	// <4,2,3,6>: Cost 4 vmrghw <4,3,5,7>, <2,6,3,7>
+  2765948648U,	// <4,2,3,7>: Cost 3 vsldoi12 <2,3,7,4>, <2,3,7,4>
+  2759018222U,	// <4,2,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,u,1>
+  3838658295U,	// <4,2,4,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,4,0,1>
+  3315205667U,	// <4,2,4,1>: Cost 4 vmrghw <4,4,4,4>, <2,1,3,5>
+  2241463912U,	// <4,2,4,2>: Cost 3 vmrghw <4,4,4,4>, <2,2,2,2>
+  1234829414U,	// <4,2,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS
+  2241464085U,	// <4,2,4,4>: Cost 3 vmrghw <4,4,4,4>, <2,4,3,4>
+  2241546087U,	// <4,2,4,5>: Cost 3 vmrghw <4,4,5,5>, <2,5,3,5>
+  2241464250U,	// <4,2,4,6>: Cost 3 vmrghw <4,4,4,4>, <2,6,3,7>
+  3741602873U,	// <4,2,4,7>: Cost 4 vsldoi4 <u,4,2,4>, <7,0,u,2>
+  1234829419U,	// <4,2,4,u>: Cost 2 vmrglw <4,4,4,4>, LHS
+  2626060390U,	// <4,2,5,0>: Cost 3 vsldoi4 <1,4,2,5>, LHS
+  2626061364U,	// <4,2,5,1>: Cost 3 vsldoi4 <1,4,2,5>, <1,4,2,5>
+  1168557672U,	// <4,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+  1222230118U,	// <4,2,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS
+  2626063670U,	// <4,2,5,4>: Cost 3 vsldoi4 <1,4,2,5>, RHS
+  2242299752U,	// <4,2,5,5>: Cost 3 vmrghw RHS, <2,5,3,6>
+  1168558010U,	// <4,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+  2242299882U,	// <4,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1>
+  1222230123U,	// <4,2,5,u>: Cost 2 vmrglw <2,3,4,5>, LHS
+  3711754342U,	// <4,2,6,0>: Cost 4 vsldoi4 <3,4,2,6>, LHS
+  3711755162U,	// <4,2,6,1>: Cost 4 vsldoi4 <3,4,2,6>, <1,2,3,4>
+  3838658481U,	// <4,2,6,2>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,2,7>
+  2759018426U,	// <4,2,6,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,3,7>
+  3838658499U,	// <4,2,6,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,4,7>
+  3735646310U,	// <4,2,6,5>: Cost 4 vsldoi4 <7,4,2,6>, <5,6,7,4>
+  3316590522U,	// <4,2,6,6>: Cost 4 vmrghw <4,6,5,2>, <2,6,3,7>
+  3798889331U,	// <4,2,6,7>: Cost 4 vsldoi8 <6,7,4,2>, <6,7,4,2>
+  2759018471U,	// <4,2,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,u,7>
+  3874564074U,	// <4,2,7,0>: Cost 4 vsldoi12 <u,2,3,4>, <2,7,0,1>
+  3800880230U,	// <4,2,7,1>: Cost 4 vsldoi8 <7,1,4,2>, <7,1,4,2>
+  3371722344U,	// <4,2,7,2>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,2>
+  2303950950U,	// <4,2,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS
+  3371722346U,	// <4,2,7,4>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,4>
+  3371722509U,	// <4,2,7,5>: Cost 5 vmrglw <2,6,4,7>, <2,4,2,5>
+  3317237690U,	// <4,2,7,6>: Cost 4 vmrghw <4,7,5,0>, <2,6,3,7>
+  3317237738U,	// <4,2,7,7>: Cost 4 vmrghw <4,7,5,0>, <2,7,0,1>
+  2303950955U,	// <4,2,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS
+  2759018555U,	// <4,2,u,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,0,1>
+  2626085943U,	// <4,2,u,1>: Cost 3 vsldoi4 <1,4,2,u>, <1,4,2,u>
+  1170548328U,	// <4,2,u,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+  1222254694U,	// <4,2,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS
+  2759018595U,	// <4,2,u,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,4,5>
+  2244290408U,	// <4,2,u,5>: Cost 3 vmrghw RHS, <2,5,3,6>
+  1170548666U,	// <4,2,u,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+  2769266813U,	// <4,2,u,7>: Cost 3 vsldoi12 <2,u,7,4>, <2,u,7,4>
+  1222254699U,	// <4,2,u,u>: Cost 2 vmrglw <2,3,4,u>, LHS
+  2238859414U,	// <4,3,0,0>: Cost 3 vmrghw <4,0,5,1>, <3,0,1,2>
+  2759018646U,	// <4,3,0,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,1,2>
+  3312314708U,	// <4,3,0,2>: Cost 4 vmrghw <4,0,1,2>, <3,2,4,3>
+  2238859676U,	// <4,3,0,3>: Cost 3 vmrghw <4,0,5,1>, <3,3,3,3>
+  2295931802U,	// <4,3,0,4>: Cost 3 vmrglw <2,3,4,0>, <1,2,3,4>
+  3735670886U,	// <4,3,0,5>: Cost 4 vsldoi4 <7,4,3,0>, <5,6,7,4>
+  3312315036U,	// <4,3,0,6>: Cost 4 vmrghw <4,0,1,2>, <3,6,4,7>
+  3369674682U,	// <4,3,0,7>: Cost 4 vmrglw <2,3,4,0>, <2,6,3,7>
+  2759018709U,	// <4,3,0,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,u,2>
+  3361055638U,	// <4,3,1,0>: Cost 4 vmrglw <0,u,4,1>, <1,2,3,0>
+  3831949542U,	// <4,3,1,1>: Cost 4 vsldoi12 <1,1,1,4>, <3,1,1,1>
+  2703917978U,	// <4,3,1,2>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4>
+  3361056370U,	// <4,3,1,3>: Cost 4 vmrglw <0,u,4,1>, <2,2,3,3>
+  2295939994U,	// <4,3,1,4>: Cost 3 vmrglw <2,3,4,1>, <1,2,3,4>
+  3361056291U,	// <4,3,1,5>: Cost 4 vmrglw <0,u,4,1>, <2,1,3,5>
+  3378972520U,	// <4,3,1,6>: Cost 4 vmrglw <3,u,4,1>, <2,5,3,6>
+  3361056698U,	// <4,3,1,7>: Cost 4 vmrglw <0,u,4,1>, <2,6,3,7>
+  2703917978U,	// <4,3,1,u>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4>
+  3832760624U,	// <4,3,2,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,0,3>
+  3711796122U,	// <4,3,2,1>: Cost 4 vsldoi4 <3,4,3,2>, <1,2,3,4>
+  3832760641U,	// <4,3,2,2>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,2,2>
+  2770962764U,	// <4,3,2,3>: Cost 3 vsldoi12 <3,2,3,4>, <3,2,3,4>
+  2759018836U,	// <4,3,2,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,2,4,3>
+  3827304802U,	// <4,3,2,5>: Cost 5 vsldoi12 <0,3,1,4>, <3,2,5,u>
+  3832760678U,	// <4,3,2,6>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,6,3>
+  3859597679U,	// <4,3,2,7>: Cost 4 vsldoi12 <5,6,7,4>, <3,2,7,3>
+  2771331449U,	// <4,3,2,u>: Cost 3 vsldoi12 <3,2,u,4>, <3,2,u,4>
+  2240841878U,	// <4,3,3,0>: Cost 3 vmrghw <4,3,5,0>, <3,0,1,2>
+  3776997635U,	// <4,3,3,1>: Cost 4 vsldoi8 <3,1,4,3>, <3,1,4,3>
+  2703919444U,	// <4,3,3,2>: Cost 3 vsldoi8 <3,2,4,3>, <3,2,4,3>
+  2759018908U,	// <4,3,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,3,3>
+  2759018918U,	// <4,3,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,4,4>
+  3386951446U,	// <4,3,3,5>: Cost 4 vmrglw <5,2,4,3>, <2,4,3,5>
+  3777661596U,	// <4,3,3,6>: Cost 4 vsldoi8 <3,2,4,3>, <3,6,4,7>
+  3375007674U,	// <4,3,3,7>: Cost 4 vmrglw <3,2,4,3>, <2,6,3,7>
+  2707901242U,	// <4,3,3,u>: Cost 3 vsldoi8 <3,u,4,3>, <3,u,4,3>
+  2759018960U,	// <4,3,4,0>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,0,1>
+  2759018970U,	// <4,3,4,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,1,2>
+  2632099605U,	// <4,3,4,2>: Cost 3 vsldoi4 <2,4,3,4>, <2,4,3,4>
+  2241464732U,	// <4,3,4,3>: Cost 3 vmrghw <4,4,4,4>, <3,3,3,3>
+  2759019000U,	// <4,3,4,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,4,5>
+  2753563138U,	// <4,3,4,5>: Cost 3 vsldoi12 <0,3,1,4>, <3,4,5,6>
+  3777662316U,	// <4,3,4,6>: Cost 4 vsldoi8 <3,2,4,3>, <4,6,3,7>
+  2308573114U,	// <4,3,4,7>: Cost 3 vmrglw <4,4,4,4>, <2,6,3,7>
+  2759019032U,	// <4,3,4,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,u,1>
+  1168558230U,	// <4,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+  2242300134U,	// <4,3,5,1>: Cost 3 vmrghw RHS, <3,1,1,1>
+  2632107798U,	// <4,3,5,2>: Cost 3 vsldoi4 <2,4,3,5>, <2,4,3,5>
+  1168558492U,	// <4,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+  1168558594U,	// <4,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+  2295973654U,	// <4,3,5,5>: Cost 3 vmrglw <2,3,4,5>, <2,4,3,5>
+  2242300536U,	// <4,3,5,6>: Cost 3 vmrghw RHS, <3,6,0,7>
+  2295973818U,	// <4,3,5,7>: Cost 3 vmrglw <2,3,4,5>, <2,6,3,7>
+  1168558878U,	// <4,3,5,u>: Cost 2 vmrghw RHS, <3,u,1,2>
+  3832760952U,	// <4,3,6,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,6,0,7>
+  3711828890U,	// <4,3,6,1>: Cost 4 vsldoi4 <3,4,3,6>, <1,2,3,4>
+  3316484436U,	// <4,3,6,2>: Cost 4 vmrghw <4,6,3,7>, <3,2,4,3>
+  3711830512U,	// <4,3,6,3>: Cost 4 vsldoi4 <3,4,3,6>, <3,4,3,6>
+  2759019164U,	// <4,3,6,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7>
+  3361097251U,	// <4,3,6,5>: Cost 5 vmrglw <0,u,4,6>, <2,1,3,5>
+  3316624045U,	// <4,3,6,6>: Cost 4 vmrghw <4,6,5,6>, <3,6,6,6>
+  2773912244U,	// <4,3,6,7>: Cost 3 vsldoi12 <3,6,7,4>, <3,6,7,4>
+  2759019164U,	// <4,3,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7>
+  3377693590U,	// <4,3,7,0>: Cost 4 vmrglw <3,6,4,7>, <1,2,3,0>
+  3365751680U,	// <4,3,7,1>: Cost 5 vmrglw <1,6,4,7>, <4,0,3,1>
+  2727810232U,	// <4,3,7,2>: Cost 3 vsldoi8 <7,2,4,3>, <7,2,4,3>
+  3377694322U,	// <4,3,7,3>: Cost 4 vmrglw <3,6,4,7>, <2,2,3,3>
+  2303951770U,	// <4,3,7,4>: Cost 3 vmrglw <3,6,4,7>, <1,2,3,4>
+  3741700198U,	// <4,3,7,5>: Cost 4 vsldoi4 <u,4,3,7>, <5,6,7,4>
+  3377695216U,	// <4,3,7,6>: Cost 4 vmrglw <3,6,4,7>, <3,4,3,6>
+  3375703994U,	// <4,3,7,7>: Cost 4 vmrglw <3,3,4,7>, <2,6,3,7>
+  2731792030U,	// <4,3,7,u>: Cost 3 vsldoi8 <7,u,4,3>, <7,u,4,3>
+  1170548886U,	// <4,3,u,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+  2759019294U,	// <4,3,u,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,1,2>
+  2632132377U,	// <4,3,u,2>: Cost 3 vsldoi4 <2,4,3,u>, <2,4,3,u>
+  1170549148U,	// <4,3,u,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+  1170549250U,	// <4,3,u,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+  2759019334U,	// <4,3,u,5>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,5,6>
+  2244291192U,	// <4,3,u,6>: Cost 3 vmrghw RHS, <3,6,0,7>
+  2295998394U,	// <4,3,u,7>: Cost 3 vmrglw <2,3,4,u>, <2,6,3,7>
+  1170549534U,	// <4,3,u,u>: Cost 2 vmrghw RHS, <3,u,1,2>
+  1165118354U,	// <4,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+  1637482598U,	// <4,4,0,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS
+  3711854285U,	// <4,4,0,2>: Cost 4 vsldoi4 <3,4,4,0>, <2,3,4,4>
+  3827305344U,	// <4,4,0,3>: Cost 4 vsldoi12 <0,3,1,4>, <4,0,3,1>
+  2711224658U,	// <4,4,0,4>: Cost 3 vsldoi8 <4,4,4,4>, <0,4,1,5>
+  1165118774U,	// <4,4,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS
+  3312602489U,	// <4,4,0,6>: Cost 4 vmrghw <4,0,5,1>, <4,6,5,2>
+  3369675420U,	// <4,4,0,7>: Cost 4 vmrglw <2,3,4,0>, <3,6,4,7>
+  1165119017U,	// <4,4,0,u>: Cost 2 vmrghw <4,0,5,1>, RHS
+  3369682633U,	// <4,4,1,0>: Cost 4 vmrglw <2,3,4,1>, <2,3,4,0>
+  2287313581U,	// <4,4,1,1>: Cost 3 vmrglw <0,u,4,1>, <0,u,4,1>
+  2759019466U,	// <4,4,1,2>: Cost 3 vsldoi12 <1,2,3,4>, <4,1,2,3>
+  3369683284U,	// <4,4,1,3>: Cost 4 vmrglw <2,3,4,1>, <3,2,4,3>
+  2311204048U,	// <4,4,1,4>: Cost 3 vmrglw <4,u,4,1>, <4,4,4,4>
+  2239319350U,	// <4,4,1,5>: Cost 3 vmrghw <4,1,2,3>, RHS
+  3784967411U,	// <4,4,1,6>: Cost 4 vsldoi8 <4,4,4,4>, <1,6,5,7>
+  3369683612U,	// <4,4,1,7>: Cost 4 vmrglw <2,3,4,1>, <3,6,4,7>
+  2763000832U,	// <4,4,1,u>: Cost 3 vsldoi12 <1,u,3,4>, <4,1,u,3>
+  3711869030U,	// <4,4,2,0>: Cost 4 vsldoi4 <3,4,4,2>, LHS
+  3711869850U,	// <4,4,2,1>: Cost 4 vsldoi4 <3,4,4,2>, <1,2,3,4>
+  2240203830U,	// <4,4,2,2>: Cost 3 vmrghw <4,2,5,3>, <4,2,5,3>
+  2698618573U,	// <4,4,2,3>: Cost 3 vsldoi8 <2,3,4,4>, <2,3,4,4>
+  2711226133U,	// <4,4,2,4>: Cost 3 vsldoi8 <4,4,4,4>, <2,4,3,4>
+  2240204086U,	// <4,4,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS
+  2711226298U,	// <4,4,2,6>: Cost 3 vsldoi8 <4,4,4,4>, <2,6,3,7>
+  3832761416U,	// <4,4,2,7>: Cost 4 vsldoi12 <1,2,3,4>, <4,2,7,3>
+  2701936738U,	// <4,4,2,u>: Cost 3 vsldoi8 <2,u,4,4>, <2,u,4,4>
+  2711226518U,	// <4,4,3,0>: Cost 3 vsldoi8 <4,4,4,4>, <3,0,1,2>
+  3777005828U,	// <4,4,3,1>: Cost 4 vsldoi8 <3,1,4,4>, <3,1,4,4>
+  3832761453U,	// <4,4,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,2,4>
+  2301266260U,	// <4,4,3,3>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3>
+  2705254903U,	// <4,4,3,4>: Cost 3 vsldoi8 <3,4,4,4>, <3,4,4,4>
+  2240843062U,	// <4,4,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS
+  3832761489U,	// <4,4,3,6>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,6,4>
+  3375008412U,	// <4,4,3,7>: Cost 4 vmrglw <3,2,4,3>, <3,6,4,7>
+  2301266260U,	// <4,4,3,u>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3>
+  1570373734U,	// <4,4,4,0>: Cost 2 vsldoi4 <4,4,4,4>, LHS
+  2308574089U,	// <4,4,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,4,1>
+  2644117096U,	// <4,4,4,2>: Cost 3 vsldoi4 <4,4,4,4>, <2,2,2,2>
+  2638146039U,	// <4,4,4,3>: Cost 3 vsldoi4 <3,4,4,4>, <3,4,4,4>
+  229035318U,	// <4,4,4,4>: Cost 1 vspltisw0 RHS
+  1167723830U,	// <4,4,4,5>: Cost 2 vmrghw <4,4,4,4>, RHS
+  2644120058U,	// <4,4,4,6>: Cost 3 vsldoi4 <4,4,4,4>, <6,2,7,3>
+  2662036827U,	// <4,4,4,7>: Cost 3 vsldoi4 <7,4,4,4>, <7,4,4,4>
+  229035318U,	// <4,4,4,u>: Cost 1 vspltisw0 RHS
+  1168558994U,	// <4,4,5,0>: Cost 2 vmrghw RHS, <4,0,5,1>
+  2638152602U,	// <4,4,5,1>: Cost 3 vsldoi4 <3,4,4,5>, <1,2,3,4>
+  2242300981U,	// <4,4,5,2>: Cost 3 vmrghw RHS, <4,2,5,2>
+  2638154232U,	// <4,4,5,3>: Cost 3 vsldoi4 <3,4,4,5>, <3,4,4,5>
+  1168559322U,	// <4,4,5,4>: Cost 2 vmrghw RHS, <4,4,5,5>
+  94817590U,	// <4,4,5,5>: Cost 1 vmrghw RHS, RHS
+  1685278006U,	// <4,4,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  2242309576U,	// <4,4,5,7>: Cost 3 vmrghw RHS, <4,7,5,0>
+  94817833U,	// <4,4,5,u>: Cost 1 vmrghw RHS, RHS
+  3316591506U,	// <4,4,6,0>: Cost 4 vmrghw <4,6,5,2>, <4,0,5,1>
+  3758428587U,	// <4,4,6,1>: Cost 4 vsldoi8 <0,0,4,4>, <6,1,7,5>
+  2711228922U,	// <4,4,6,2>: Cost 3 vsldoi8 <4,4,4,4>, <6,2,7,3>
+  3796251185U,	// <4,4,6,3>: Cost 4 vsldoi8 <6,3,4,4>, <6,3,4,4>
+  2711229085U,	// <4,4,6,4>: Cost 3 vsldoi8 <4,4,4,4>, <6,4,7,4>
+  2242850102U,	// <4,4,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS
+  2242850169U,	// <4,4,6,6>: Cost 3 vmrghw <4,6,5,2>, <4,6,5,2>
+  2725163893U,	// <4,4,6,7>: Cost 3 vsldoi8 <6,7,4,4>, <6,7,4,4>
+  2242850345U,	// <4,4,6,u>: Cost 3 vmrghw <4,6,5,2>, RHS
+  2711229434U,	// <4,4,7,0>: Cost 3 vsldoi8 <4,4,4,4>, <7,0,1,2>
+  3377694410U,	// <4,4,7,1>: Cost 4 vmrglw <3,6,4,7>, <2,3,4,1>
+  3868593584U,	// <4,4,7,2>: Cost 4 vsldoi12 <7,2,3,4>, <4,7,2,3>
+  3377695060U,	// <4,4,7,3>: Cost 4 vmrglw <3,6,4,7>, <3,2,4,3>
+  2729145691U,	// <4,4,7,4>: Cost 3 vsldoi8 <7,4,4,4>, <7,4,4,4>
+  2243497270U,	// <4,4,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS
+  3871542744U,	// <4,4,7,6>: Cost 4 vsldoi12 <7,6,7,4>, <4,7,6,7>
+  2303953564U,	// <4,4,7,7>: Cost 3 vmrglw <3,6,4,7>, <3,6,4,7>
+  2243497513U,	// <4,4,7,u>: Cost 3 vmrghw <4,7,5,0>, RHS
+  1170549650U,	// <4,4,u,0>: Cost 2 vmrghw RHS, <4,0,5,1>
+  1637488430U,	// <4,4,u,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS
+  2244291637U,	// <4,4,u,2>: Cost 3 vmrghw RHS, <4,2,5,2>
+  2638178811U,	// <4,4,u,3>: Cost 3 vsldoi4 <3,4,4,u>, <3,4,4,u>
+  229035318U,	// <4,4,u,4>: Cost 1 vspltisw0 RHS
+  96808246U,	// <4,4,u,5>: Cost 1 vmrghw RHS, RHS
+  1685278249U,	// <4,4,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  2244292040U,	// <4,4,u,7>: Cost 3 vmrghw RHS, <4,7,5,0>
+  96808489U,	// <4,4,u,u>: Cost 1 vmrghw RHS, RHS
+  2698625024U,	// <4,5,0,0>: Cost 3 vsldoi8 <2,3,4,5>, <0,0,0,0>
+  1624883302U,	// <4,5,0,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+  2638186190U,	// <4,5,0,2>: Cost 3 vsldoi4 <3,4,5,0>, <2,3,4,5>
+  2638187004U,	// <4,5,0,3>: Cost 3 vsldoi4 <3,4,5,0>, <3,4,5,0>
+  2687345005U,	// <4,5,0,4>: Cost 3 vsldoi8 <0,4,4,5>, <0,4,4,5>
+  2238861316U,	// <4,5,0,5>: Cost 3 vmrghw <4,0,5,1>, <5,5,5,5>
+  2662077302U,	// <4,5,0,6>: Cost 3 vsldoi4 <7,4,5,0>, <6,7,4,5>
+  2662077792U,	// <4,5,0,7>: Cost 3 vsldoi4 <7,4,5,0>, <7,4,5,0>
+  1624883869U,	// <4,5,0,u>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+  3361057762U,	// <4,5,1,0>: Cost 4 vmrglw <0,u,4,1>, <4,1,5,0>
+  2691326803U,	// <4,5,1,1>: Cost 3 vsldoi8 <1,1,4,5>, <1,1,4,5>
+  2698625942U,	// <4,5,1,2>: Cost 3 vsldoi8 <2,3,4,5>, <1,2,3,0>
+  3361055659U,	// <4,5,1,3>: Cost 4 vmrglw <0,u,4,1>, <1,2,5,3>
+  3761087567U,	// <4,5,1,4>: Cost 4 vsldoi8 <0,4,4,5>, <1,4,5,5>
+  2693981335U,	// <4,5,1,5>: Cost 3 vsldoi8 <1,5,4,5>, <1,5,4,5>
+  2305231362U,	// <4,5,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6>
+  3361055987U,	// <4,5,1,7>: Cost 4 vmrglw <0,u,4,1>, <1,6,5,7>
+  2695972234U,	// <4,5,1,u>: Cost 3 vsldoi8 <1,u,4,5>, <1,u,4,5>
+  2638200934U,	// <4,5,2,0>: Cost 3 vsldoi4 <3,4,5,2>, LHS
+  3761088035U,	// <4,5,2,1>: Cost 4 vsldoi8 <0,4,4,5>, <2,1,3,5>
+  2697963133U,	// <4,5,2,2>: Cost 3 vsldoi8 <2,2,4,5>, <2,2,4,5>
+  1624884942U,	// <4,5,2,3>: Cost 2 vsldoi8 <2,3,4,5>, <2,3,4,5>
+  2698626838U,	// <4,5,2,4>: Cost 3 vsldoi8 <2,3,4,5>, <2,4,3,5>
+  3772368744U,	// <4,5,2,5>: Cost 4 vsldoi8 <2,3,4,5>, <2,5,3,6>
+  2698627002U,	// <4,5,2,6>: Cost 3 vsldoi8 <2,3,4,5>, <2,6,3,7>
+  3775023122U,	// <4,5,2,7>: Cost 4 vsldoi8 <2,7,4,5>, <2,7,4,5>
+  1628203107U,	// <4,5,2,u>: Cost 2 vsldoi8 <2,u,4,5>, <2,u,4,5>
+  2698627222U,	// <4,5,3,0>: Cost 3 vsldoi8 <2,3,4,5>, <3,0,1,2>
+  3765070057U,	// <4,5,3,1>: Cost 4 vsldoi8 <1,1,4,5>, <3,1,1,4>
+  2698627404U,	// <4,5,3,2>: Cost 3 vsldoi8 <2,3,4,5>, <3,2,3,4>
+  2698627484U,	// <4,5,3,3>: Cost 3 vsldoi8 <2,3,4,5>, <3,3,3,3>
+  2698627580U,	// <4,5,3,4>: Cost 3 vsldoi8 <2,3,4,5>, <3,4,5,0>
+  3779668553U,	// <4,5,3,5>: Cost 4 vsldoi8 <3,5,4,5>, <3,5,4,5>
+  2725169844U,	// <4,5,3,6>: Cost 3 vsldoi8 <6,7,4,5>, <3,6,7,4>
+  2707253995U,	// <4,5,3,7>: Cost 3 vsldoi8 <3,7,4,5>, <3,7,4,5>
+  2698627870U,	// <4,5,3,u>: Cost 3 vsldoi8 <2,3,4,5>, <3,u,1,2>
+  2638217318U,	// <4,5,4,0>: Cost 3 vsldoi4 <3,4,5,4>, LHS
+  2308574098U,	// <4,5,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,5,1>
+  2698628150U,	// <4,5,4,2>: Cost 3 vsldoi8 <2,3,4,5>, <4,2,5,3>
+  2638219776U,	// <4,5,4,3>: Cost 3 vsldoi4 <3,4,5,4>, <3,4,5,4>
+  2698628314U,	// <4,5,4,4>: Cost 3 vsldoi8 <2,3,4,5>, <4,4,5,5>
+  1624886582U,	// <4,5,4,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+  2698628478U,	// <4,5,4,6>: Cost 3 vsldoi8 <2,3,4,5>, <4,6,5,7>
+  2662110564U,	// <4,5,4,7>: Cost 3 vsldoi4 <7,4,5,4>, <7,4,5,4>
+  1624886825U,	// <4,5,4,u>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+  1570455654U,	// <4,5,5,0>: Cost 2 vsldoi4 <4,4,5,5>, LHS
+  2312564250U,	// <4,5,5,1>: Cost 3 vmrglw <5,1,4,5>, <4,u,5,1>
+  2644199118U,	// <4,5,5,2>: Cost 3 vsldoi4 <4,4,5,5>, <2,3,4,5>
+  2295974966U,	// <4,5,5,3>: Cost 3 vmrglw <2,3,4,5>, <4,2,5,3>
+  1570458842U,	// <4,5,5,4>: Cost 2 vsldoi4 <4,4,5,5>, <4,4,5,5>
+  1168568324U,	// <4,5,5,5>: Cost 2 vmrghw RHS, <5,5,5,5>
+  1168568418U,	// <4,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+  2295975294U,	// <4,5,5,7>: Cost 3 vmrglw <2,3,4,5>, <4,6,5,7>
+  1168716036U,	// <4,5,5,u>: Cost 2 vmrghw RHS, <5,u,7,0>
+  1564491878U,	// <4,5,6,0>: Cost 2 vsldoi4 <3,4,5,6>, LHS
+  2626290768U,	// <4,5,6,1>: Cost 3 vsldoi4 <1,4,5,6>, <1,4,5,6>
+  2632263465U,	// <4,5,6,2>: Cost 3 vsldoi4 <2,4,5,6>, <2,4,5,6>
+  1564494338U,	// <4,5,6,3>: Cost 2 vsldoi4 <3,4,5,6>, <3,4,5,6>
+  1564495158U,	// <4,5,6,4>: Cost 2 vsldoi4 <3,4,5,6>, RHS
+  2638237464U,	// <4,5,6,5>: Cost 3 vsldoi4 <3,4,5,6>, <5,2,6,3>
+  2656154253U,	// <4,5,6,6>: Cost 3 vsldoi4 <6,4,5,6>, <6,4,5,6>
+  27705344U,	// <4,5,6,7>: Cost 0 copy RHS
+  27705344U,	// <4,5,6,u>: Cost 0 copy RHS
+  2725172218U,	// <4,5,7,0>: Cost 3 vsldoi8 <6,7,4,5>, <7,0,1,2>
+  3859599489U,	// <4,5,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <5,7,1,4>
+  2698630320U,	// <4,5,7,2>: Cost 3 vsldoi8 <2,3,4,5>, <7,2,3,4>
+  2728490251U,	// <4,5,7,3>: Cost 3 vsldoi8 <7,3,4,5>, <7,3,4,5>
+  2725172576U,	// <4,5,7,4>: Cost 3 vsldoi8 <6,7,4,5>, <7,4,5,0>
+  3317239812U,	// <4,5,7,5>: Cost 4 vmrghw <4,7,5,0>, <5,5,5,5>
+  2725172760U,	// <4,5,7,6>: Cost 3 vsldoi8 <6,7,4,5>, <7,6,7,4>
+  2725172844U,	// <4,5,7,7>: Cost 3 vsldoi8 <6,7,4,5>, <7,7,7,7>
+  2725172866U,	// <4,5,7,u>: Cost 3 vsldoi8 <6,7,4,5>, <7,u,1,2>
+  1564508262U,	// <4,5,u,0>: Cost 2 vsldoi4 <3,4,5,u>, LHS
+  1624889134U,	// <4,5,u,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+  2698631045U,	// <4,5,u,2>: Cost 3 vsldoi8 <2,3,4,5>, <u,2,3,0>
+  1564510724U,	// <4,5,u,3>: Cost 2 vsldoi4 <3,4,5,u>, <3,4,5,u>
+  1564511542U,	// <4,5,u,4>: Cost 2 vsldoi4 <3,4,5,u>, RHS
+  1624889498U,	// <4,5,u,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+  1170550882U,	// <4,5,u,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+  27705344U,	// <4,5,u,7>: Cost 0 copy RHS
+  27705344U,	// <4,5,u,u>: Cost 0 copy RHS
+  3312595285U,	// <4,6,0,0>: Cost 4 vmrghw <4,0,5,0>, <6,0,7,0>
+  3763748966U,	// <4,6,0,1>: Cost 4 vsldoi8 <0,u,4,6>, LHS
+  2238861818U,	// <4,6,0,2>: Cost 3 vmrghw <4,0,5,1>, <6,2,7,3>
+  3767730432U,	// <4,6,0,3>: Cost 4 vsldoi8 <1,5,4,6>, <0,3,1,4>
+  3763749202U,	// <4,6,0,4>: Cost 4 vsldoi8 <0,u,4,6>, <0,4,1,5>
+  2238862059U,	// <4,6,0,5>: Cost 3 vmrghw <4,0,5,1>, <6,5,7,1>
+  2238862136U,	// <4,6,0,6>: Cost 3 vmrghw <4,0,5,1>, <6,6,6,6>
+  2295934262U,	// <4,6,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS
+  2295934263U,	// <4,6,0,u>: Cost 3 vmrglw <2,3,4,0>, RHS
+  3378973999U,	// <4,6,1,0>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,0>
+  3378974648U,	// <4,6,1,1>: Cost 4 vmrglw <3,u,4,1>, <5,4,6,1>
+  3779675034U,	// <4,6,1,2>: Cost 4 vsldoi8 <3,5,4,6>, <1,2,3,4>
+  3378974002U,	// <4,6,1,3>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,3>
+  3378974003U,	// <4,6,1,4>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,4>
+  3767731352U,	// <4,6,1,5>: Cost 4 vsldoi8 <1,5,4,6>, <1,5,4,6>
+  3378974734U,	// <4,6,1,6>: Cost 4 vmrglw <3,u,4,1>, <5,5,6,6>
+  2287316278U,	// <4,6,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS
+  2287316279U,	// <4,6,1,u>: Cost 3 vmrglw <0,u,4,1>, RHS
+  3735904358U,	// <4,6,2,0>: Cost 4 vsldoi4 <7,4,6,2>, LHS
+  3763750435U,	// <4,6,2,1>: Cost 5 vsldoi8 <0,u,4,6>, <2,1,3,5>
+  3313938937U,	// <4,6,2,2>: Cost 4 vmrghw <4,2,5,2>, <6,2,7,2>
+  3772376782U,	// <4,6,2,3>: Cost 4 vsldoi8 <2,3,4,6>, <2,3,4,5>
+  3852890591U,	// <4,6,2,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,2,4,3>
+  3735908454U,	// <4,6,2,5>: Cost 4 vsldoi4 <7,4,6,2>, <5,6,7,4>
+  3801573306U,	// <4,6,2,6>: Cost 4 vsldoi8 <7,2,4,6>, <2,6,3,7>
+  2785858042U,	// <4,6,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,7,3>
+  2785858051U,	// <4,6,2,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,u,3>
+  3863065101U,	// <4,6,3,0>: Cost 4 vsldoi12 <6,3,0,4>, <6,3,0,4>
+  3314586024U,	// <4,6,3,1>: Cost 4 vmrghw <4,3,5,0>, <6,1,7,2>
+  3863212575U,	// <4,6,3,2>: Cost 4 vsldoi12 <6,3,2,4>, <6,3,2,4>
+  3863286312U,	// <4,6,3,3>: Cost 4 vsldoi12 <6,3,3,4>, <6,3,3,4>
+  3767732738U,	// <4,6,3,4>: Cost 4 vsldoi8 <1,5,4,6>, <3,4,5,6>
+  3779676746U,	// <4,6,3,5>: Cost 4 vsldoi8 <3,5,4,6>, <3,5,4,6>
+  3398898488U,	// <4,6,3,6>: Cost 4 vmrglw <7,2,4,3>, <6,6,6,6>
+  2301267254U,	// <4,6,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS
+  2301267255U,	// <4,6,3,u>: Cost 3 vmrglw <3,2,4,3>, RHS
+  3852890715U,	// <4,6,4,0>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,0,1>
+  3315208615U,	// <4,6,4,1>: Cost 4 vmrghw <4,4,4,4>, <6,1,7,1>
+  2241466874U,	// <4,6,4,2>: Cost 3 vmrghw <4,4,4,4>, <6,2,7,3>
+  3852890745U,	// <4,6,4,3>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,3,4>
+  2241467037U,	// <4,6,4,4>: Cost 3 vmrghw <4,4,4,4>, <6,4,7,4>
+  2241549039U,	// <4,6,4,5>: Cost 3 vmrghw <4,4,5,5>, <6,5,7,5>
+  2241467192U,	// <4,6,4,6>: Cost 3 vmrghw <4,4,4,4>, <6,6,6,6>
+  1234832694U,	// <4,6,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS
+  1234832695U,	// <4,6,4,u>: Cost 2 vmrglw <4,4,4,4>, RHS
+  2242302241U,	// <4,6,5,0>: Cost 3 vmrghw RHS, <6,0,1,2>
+  2242310567U,	// <4,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+  1168568826U,	// <4,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+  2242302514U,	// <4,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+  2242302605U,	// <4,6,5,4>: Cost 3 vmrghw RHS, <6,4,5,6>
+  2242310891U,	// <4,6,5,5>: Cost 3 vmrghw RHS, <6,5,7,1>
+  1168569144U,	// <4,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+  1222233398U,	// <4,6,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS
+  1222233399U,	// <4,6,5,u>: Cost 2 vmrglw <2,3,4,5>, RHS
+  3316576545U,	// <4,6,6,0>: Cost 4 vmrghw <4,6,5,0>, <6,0,1,2>
+  3316584871U,	// <4,6,6,1>: Cost 4 vmrghw <4,6,5,1>, <6,1,7,1>
+  2242851322U,	// <4,6,6,2>: Cost 3 vmrghw <4,6,5,2>, <6,2,7,3>
+  3316601394U,	// <4,6,6,3>: Cost 4 vmrghw <4,6,5,3>, <6,3,4,5>
+  3852890916U,	// <4,6,6,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,6,4,4>
+  3316617963U,	// <4,6,6,5>: Cost 4 vmrghw <4,6,5,5>, <6,5,7,1>
+  2242884408U,	// <4,6,6,6>: Cost 3 vmrghw <4,6,5,6>, <6,6,6,6>
+  2785858370U,	// <4,6,6,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,7,7>
+  2785858379U,	// <4,6,6,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,u,7>
+  2785858382U,	// <4,6,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,0,1>
+  3859600215U,	// <4,6,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <6,7,1,1>
+  3317240314U,	// <4,6,7,2>: Cost 4 vmrghw <4,7,5,0>, <6,2,7,3>
+  2792199020U,	// <4,6,7,3>: Cost 3 vsldoi12 <6,7,3,4>, <6,7,3,4>
+  2785858422U,	// <4,6,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,4,5>
+  3856651132U,	// <4,6,7,5>: Cost 4 vsldoi12 <5,2,3,4>, <6,7,5,2>
+  3317240632U,	// <4,6,7,6>: Cost 4 vmrghw <4,7,5,0>, <6,6,6,6>
+  2303954230U,	// <4,6,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS
+  2303954231U,	// <4,6,7,u>: Cost 3 vmrglw <3,6,4,7>, RHS
+  2244292897U,	// <4,6,u,0>: Cost 3 vmrghw RHS, <6,0,1,2>
+  2244293031U,	// <4,6,u,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+  1170551290U,	// <4,6,u,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+  2244293170U,	// <4,6,u,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+  2244293261U,	// <4,6,u,4>: Cost 3 vmrghw RHS, <6,4,5,6>
+  2244293355U,	// <4,6,u,5>: Cost 3 vmrghw RHS, <6,5,7,1>
+  1170551608U,	// <4,6,u,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+  1222257974U,	// <4,6,u,7>: Cost 2 vmrglw <2,3,4,u>, RHS
+  1222257975U,	// <4,6,u,u>: Cost 2 vmrglw <2,3,4,u>, RHS
+  2238862330U,	// <4,7,0,0>: Cost 3 vmrghw <4,0,5,1>, <7,0,1,2>
+  2706604134U,	// <4,7,0,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+  3312604308U,	// <4,7,0,2>: Cost 4 vmrghw <4,0,5,1>, <7,2,0,3>
+  3768402176U,	// <4,7,0,3>: Cost 4 vsldoi8 <1,6,4,7>, <0,3,1,4>
+  2238862648U,	// <4,7,0,4>: Cost 3 vmrghw <4,0,5,1>, <7,4,0,5>
+  3859600418U,	// <4,7,0,5>: Cost 4 vsldoi12 <5,6,7,4>, <7,0,5,6>
+  3729994393U,	// <4,7,0,6>: Cost 4 vsldoi4 <6,4,7,0>, <6,4,7,0>
+  2238862956U,	// <4,7,0,7>: Cost 3 vmrghw <4,0,5,1>, <7,7,7,7>
+  2706604701U,	// <4,7,0,u>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+  3385610338U,	// <4,7,1,0>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,0>
+  3780346676U,	// <4,7,1,1>: Cost 4 vsldoi8 <3,6,4,7>, <1,1,1,1>
+  2706604954U,	// <4,7,1,2>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4>
+  3385610746U,	// <4,7,1,3>: Cost 4 vmrglw <5,0,4,1>, <6,2,7,3>
+  3385610342U,	// <4,7,1,4>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,4>
+  3385610667U,	// <4,7,1,5>: Cost 4 vmrglw <5,0,4,1>, <6,1,7,5>
+  3768403178U,	// <4,7,1,6>: Cost 4 vsldoi8 <1,6,4,7>, <1,6,4,7>
+  3385611074U,	// <4,7,1,7>: Cost 4 vmrglw <5,0,4,1>, <6,6,7,7>
+  2706604954U,	// <4,7,1,u>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4>
+  3859600532U,	// <4,7,2,0>: Cost 4 vsldoi12 <5,6,7,4>, <7,2,0,3>
+  3712091034U,	// <4,7,2,1>: Cost 5 vsldoi4 <3,4,7,2>, <1,2,3,4>
+  3774375528U,	// <4,7,2,2>: Cost 4 vsldoi8 <2,6,4,7>, <2,2,2,2>
+  2794853552U,	// <4,7,2,3>: Cost 3 vsldoi12 <7,2,3,4>, <7,2,3,4>
+  2785858744U,	// <4,7,2,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,2,4,3>
+  3735982182U,	// <4,7,2,5>: Cost 4 vsldoi4 <7,4,7,2>, <5,6,7,4>
+  3774375875U,	// <4,7,2,6>: Cost 4 vsldoi8 <2,6,4,7>, <2,6,4,7>
+  3735983476U,	// <4,7,2,7>: Cost 4 vsldoi4 <7,4,7,2>, <7,4,7,2>
+  2795222237U,	// <4,7,2,u>: Cost 3 vsldoi12 <7,2,u,4>, <7,2,u,4>
+  3780348054U,	// <4,7,3,0>: Cost 4 vsldoi8 <3,6,4,7>, <3,0,1,2>
+  3730015130U,	// <4,7,3,1>: Cost 4 vsldoi4 <6,4,7,3>, <1,2,3,4>
+  3780348244U,	// <4,7,3,2>: Cost 4 vsldoi8 <3,6,4,7>, <3,2,4,3>
+  3778357673U,	// <4,7,3,3>: Cost 4 vsldoi8 <3,3,4,7>, <3,3,4,7>
+  2325155942U,	// <4,7,3,4>: Cost 3 vmrglw <7,2,4,3>, <5,6,7,4>
+  3779684939U,	// <4,7,3,5>: Cost 5 vsldoi8 <3,5,4,7>, <3,5,4,7>
+  2706606748U,	// <4,7,3,6>: Cost 3 vsldoi8 <3,6,4,7>, <3,6,4,7>
+  3398898498U,	// <4,7,3,7>: Cost 4 vmrglw <7,2,4,3>, <6,6,7,7>
+  2707934014U,	// <4,7,3,u>: Cost 3 vsldoi8 <3,u,4,7>, <3,u,4,7>
+  2785858868U,	// <4,7,4,0>: Cost 3 vsldoi12 <5,6,7,4>, <7,4,0,1>
+  3780348874U,	// <4,7,4,1>: Cost 4 vsldoi8 <3,6,4,7>, <4,1,2,3>
+  3780349000U,	// <4,7,4,2>: Cost 4 vsldoi8 <3,6,4,7>, <4,2,7,3>
+  2308575738U,	// <4,7,4,3>: Cost 3 vmrglw <4,4,4,4>, <6,2,7,3>
+  2656283856U,	// <4,7,4,4>: Cost 3 vsldoi4 <6,4,7,4>, <4,4,4,4>
+  2706607414U,	// <4,7,4,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+  2656285341U,	// <4,7,4,6>: Cost 3 vsldoi4 <6,4,7,4>, <6,4,7,4>
+  2241468012U,	// <4,7,4,7>: Cost 3 vmrghw <4,4,4,4>, <7,7,7,7>
+  2706607657U,	// <4,7,4,u>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+  1168569338U,	// <4,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+  2242311242U,	// <4,7,5,1>: Cost 3 vmrghw RHS, <7,1,1,1>
+  2242303178U,	// <4,7,5,2>: Cost 3 vmrghw RHS, <7,2,6,3>
+  2242311395U,	// <4,7,5,3>: Cost 3 vmrghw RHS, <7,3,0,1>
+  1168569702U,	// <4,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+  2242311606U,	// <4,7,5,5>: Cost 3 vmrghw RHS, <7,5,5,5>
+  2242311662U,	// <4,7,5,6>: Cost 3 vmrghw RHS, <7,6,2,7>
+  1168569964U,	// <4,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+  1168569986U,	// <4,7,5,u>: Cost 2 vmrghw RHS, <7,u,1,2>
+  3316593658U,	// <4,7,6,0>: Cost 4 vmrghw <4,6,5,2>, <7,0,1,2>
+  3316593738U,	// <4,7,6,1>: Cost 5 vmrghw <4,6,5,2>, <7,1,1,1>
+  3316634800U,	// <4,7,6,2>: Cost 4 vmrghw <4,6,5,7>, <7,2,3,4>
+  3386978810U,	// <4,7,6,3>: Cost 4 vmrglw <5,2,4,6>, <6,2,7,3>
+  2785859072U,	// <4,7,6,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,6,4,7>
+  3736014950U,	// <4,7,6,5>: Cost 4 vsldoi4 <7,4,7,6>, <5,6,7,4>
+  3316594158U,	// <4,7,6,6>: Cost 4 vmrghw <4,6,5,2>, <7,6,2,7>
+  2797803032U,	// <4,7,6,7>: Cost 3 vsldoi12 <7,6,7,4>, <7,6,7,4>
+  2797876769U,	// <4,7,6,u>: Cost 3 vsldoi12 <7,6,u,4>, <7,6,u,4>
+  2243499002U,	// <4,7,7,0>: Cost 3 vmrghw <4,7,5,0>, <7,0,1,2>
+  3718103962U,	// <4,7,7,1>: Cost 4 vsldoi4 <4,4,7,7>, <1,2,3,4>
+  3317257418U,	// <4,7,7,2>: Cost 4 vmrghw <4,7,5,2>, <7,2,6,3>
+  3377695816U,	// <4,7,7,3>: Cost 4 vmrglw <3,6,4,7>, <4,2,7,3>
+  2243532134U,	// <4,7,7,4>: Cost 3 vmrghw <4,7,5,4>, <7,4,5,6>
+  3317282230U,	// <4,7,7,5>: Cost 4 vmrghw <4,7,5,5>, <7,5,5,5>
+  2730497536U,	// <4,7,7,6>: Cost 3 vsldoi8 <7,6,4,7>, <7,6,4,7>
+  2243556972U,	// <4,7,7,7>: Cost 3 vmrghw <4,7,5,7>, <7,7,7,7>
+  2243565186U,	// <4,7,7,u>: Cost 3 vmrghw <4,7,5,u>, <7,u,1,2>
+  1170551802U,	// <4,7,u,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+  2706609966U,	// <4,7,u,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+  2244293797U,	// <4,7,u,2>: Cost 3 vmrghw RHS, <7,2,2,2>
+  2244293859U,	// <4,7,u,3>: Cost 3 vmrghw RHS, <7,3,0,1>
+  1170552166U,	// <4,7,u,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+  2706610330U,	// <4,7,u,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+  2244294126U,	// <4,7,u,6>: Cost 3 vmrghw RHS, <7,6,2,7>
+  1170552428U,	// <4,7,u,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+  1170552450U,	// <4,7,u,u>: Cost 2 vmrghw RHS, <7,u,1,2>
+  1165118354U,	// <4,u,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+  1624907878U,	// <4,u,0,1>: Cost 2 vsldoi8 <2,3,4,u>, LHS
+  2638407377U,	// <4,u,0,2>: Cost 3 vsldoi4 <3,4,u,0>, <2,3,4,u>
+  2295931036U,	// <4,u,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS
+  2687369584U,	// <4,u,0,4>: Cost 3 vsldoi8 <0,4,4,u>, <0,4,4,u>
+  1165121690U,	// <4,u,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS
+  2662298489U,	// <4,u,0,6>: Cost 3 vsldoi4 <7,4,u,0>, <6,7,4,u>
+  2295934280U,	// <4,u,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS
+  1624908445U,	// <4,u,0,u>: Cost 2 vsldoi8 <2,3,4,u>, LHS
+  2638413926U,	// <4,u,1,0>: Cost 3 vsldoi4 <3,4,u,1>, LHS
+  2691351382U,	// <4,u,1,1>: Cost 3 vsldoi8 <1,1,4,u>, <1,1,4,u>
+  1685280558U,	// <4,u,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2287313052U,	// <4,u,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS
+  2299257799U,	// <4,u,1,4>: Cost 3 vmrglw <2,u,4,1>, <1,2,u,4>
+  2694005914U,	// <4,u,1,5>: Cost 3 vsldoi8 <1,5,4,u>, <1,5,4,u>
+  2305231362U,	// <4,u,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6>
+  2287316296U,	// <4,u,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS
+  1685280612U,	// <4,u,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2638422118U,	// <4,u,2,0>: Cost 3 vsldoi4 <3,4,u,2>, LHS
+  2240206638U,	// <4,u,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS
+  2697987712U,	// <4,u,2,2>: Cost 3 vsldoi8 <2,2,4,u>, <2,2,4,u>
+  1624909521U,	// <4,u,2,3>: Cost 2 vsldoi8 <2,3,4,u>, <2,3,4,u>
+  2759391121U,	// <4,u,2,4>: Cost 3 vsldoi12 <1,2,u,4>, <u,2,4,3>
+  2240207002U,	// <4,u,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS
+  2698651578U,	// <4,u,2,6>: Cost 3 vsldoi8 <2,3,4,u>, <2,6,3,7>
+  2785859500U,	// <4,u,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <u,2,7,3>
+  1628227686U,	// <4,u,2,u>: Cost 2 vsldoi8 <2,u,4,u>, <2,u,4,u>
+  2759022524U,	// <4,u,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,0,1>
+  2801342408U,	// <4,u,3,1>: Cost 3 vsldoi12 <u,3,1,4>, <u,3,1,4>
+  2703960409U,	// <4,u,3,2>: Cost 3 vsldoi8 <3,2,4,u>, <3,2,4,u>
+  2759022554U,	// <4,u,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,3,4>
+  2759022564U,	// <4,u,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,4,5>
+  2240845978U,	// <4,u,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS
+  2706614941U,	// <4,u,3,6>: Cost 3 vsldoi8 <3,6,4,u>, <3,6,4,u>
+  2301267272U,	// <4,u,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS
+  2759022596U,	// <4,u,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,u,1>
+  1570668646U,	// <4,u,4,0>: Cost 2 vsldoi4 <4,4,u,4>, LHS
+  1167726382U,	// <4,u,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS
+  2698652753U,	// <4,u,4,2>: Cost 3 vsldoi8 <2,3,4,u>, <4,2,u,3>
+  1234829468U,	// <4,u,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS
+  229035318U,	// <4,u,4,4>: Cost 1 vspltisw0 RHS
+  1624911158U,	// <4,u,4,5>: Cost 2 vsldoi8 <2,3,4,u>, RHS
+  2698653081U,	// <4,u,4,6>: Cost 3 vsldoi8 <2,3,4,u>, <4,6,u,7>
+  1234832712U,	// <4,u,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS
+  229035318U,	// <4,u,4,u>: Cost 1 vspltisw0 RHS
+  1168561875U,	// <4,u,5,0>: Cost 2 vmrghw RHS, <u,0,1,2>
+  94820142U,	// <4,u,5,1>: Cost 1 vmrghw RHS, LHS
+  1168562053U,	// <4,u,5,2>: Cost 2 vmrghw RHS, <u,2,3,0>
+  1222230172U,	// <4,u,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS
+  1168562239U,	// <4,u,5,4>: Cost 2 vmrghw RHS, <u,4,5,6>
+  94820506U,	// <4,u,5,5>: Cost 1 vmrghw RHS, RHS
+  1685280922U,	// <4,u,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  1222233416U,	// <4,u,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS
+  94820709U,	// <4,u,5,u>: Cost 1 vmrghw RHS, LHS
+  1564713062U,	// <4,u,6,0>: Cost 2 vsldoi4 <3,4,u,6>, LHS
+  2626511979U,	// <4,u,6,1>: Cost 3 vsldoi4 <1,4,u,6>, <1,4,u,6>
+  2632484676U,	// <4,u,6,2>: Cost 3 vsldoi4 <2,4,u,6>, <2,4,u,6>
+  1564715549U,	// <4,u,6,3>: Cost 2 vsldoi4 <3,4,u,6>, <3,4,u,6>
+  1564716342U,	// <4,u,6,4>: Cost 2 vsldoi4 <3,4,u,6>, RHS
+  2242853018U,	// <4,u,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS
+  2656375464U,	// <4,u,6,6>: Cost 3 vsldoi4 <6,4,u,6>, <6,4,u,6>
+  27705344U,	// <4,u,6,7>: Cost 0 copy RHS
+  27705344U,	// <4,u,6,u>: Cost 0 copy RHS
+  2785859840U,	// <4,u,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,0,1>
+  2243499822U,	// <4,u,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS
+  2727851197U,	// <4,u,7,2>: Cost 3 vsldoi8 <7,2,4,u>, <7,2,4,u>
+  2303951004U,	// <4,u,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS
+  2785859880U,	// <4,u,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,4,5>
+  2243500186U,	// <4,u,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS
+  2730505729U,	// <4,u,7,6>: Cost 3 vsldoi8 <7,6,4,u>, <7,6,4,u>
+  2303954248U,	// <4,u,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS
+  2303951009U,	// <4,u,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS
+  1564729446U,	// <4,u,u,0>: Cost 2 vsldoi4 <3,4,u,u>, LHS
+  96810798U,	// <4,u,u,1>: Cost 1 vmrghw RHS, LHS
+  1685281125U,	// <4,u,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  1222254748U,	// <4,u,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS
+  229035318U,	// <4,u,u,4>: Cost 1 vspltisw0 RHS
+  96811162U,	// <4,u,u,5>: Cost 1 vmrghw RHS, RHS
+  1685281165U,	// <4,u,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  27705344U,	// <4,u,u,7>: Cost 0 copy RHS
+  27705344U,	// <4,u,u,u>: Cost 0 copy RHS
+  2754232320U,	// <5,0,0,0>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,0,0>
+  2754232330U,	// <5,0,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,1,1>
+  3718194894U,	// <5,0,0,2>: Cost 4 vsldoi4 <4,5,0,0>, <2,3,4,5>
+  3376385762U,	// <5,0,0,3>: Cost 4 vmrglw <3,4,5,0>, <5,2,0,3>
+  2754232357U,	// <5,0,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,4,1>
+  3845816370U,	// <5,0,0,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,0,5,5>
+  3782353389U,	// <5,0,0,6>: Cost 4 vsldoi8 <4,0,5,0>, <0,6,0,7>
+  3376386090U,	// <5,0,0,7>: Cost 4 vmrglw <3,4,5,0>, <5,6,0,7>
+  2757402697U,	// <5,0,0,u>: Cost 3 vsldoi12 <0,u,u,5>, <0,0,u,1>
+  2626543718U,	// <5,0,1,0>: Cost 3 vsldoi4 <1,5,0,1>, LHS
+  2626544751U,	// <5,0,1,1>: Cost 3 vsldoi4 <1,5,0,1>, <1,5,0,1>
+  1680490598U,	// <5,0,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  3766428665U,	// <5,0,1,3>: Cost 4 vsldoi8 <1,3,5,0>, <1,3,5,0>
+  2626546998U,	// <5,0,1,4>: Cost 3 vsldoi4 <1,5,0,1>, RHS
+  2650435539U,	// <5,0,1,5>: Cost 3 vsldoi4 <5,5,0,1>, <5,5,0,1>
+  3783017715U,	// <5,0,1,6>: Cost 4 vsldoi8 <4,1,5,0>, <1,6,5,7>
+  3385019000U,	// <5,0,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,0,7>
+  1680490652U,	// <5,0,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  3376398336U,	// <5,0,2,0>: Cost 4 vmrglw <3,4,5,2>, <0,0,0,0>
+  2245877862U,	// <5,0,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS
+  3773064808U,	// <5,0,2,2>: Cost 4 vsldoi8 <2,4,5,0>, <2,2,2,2>
+  2705295054U,	// <5,0,2,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5>
+  3827974343U,	// <5,0,2,4>: Cost 4 vsldoi12 <0,4,1,5>, <0,2,4,1>
+  3845816530U,	// <5,0,2,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,2,5,3>
+  3779037114U,	// <5,0,2,6>: Cost 4 vsldoi8 <3,4,5,0>, <2,6,3,7>
+  3810887658U,	// <5,0,2,7>: Cost 4 vsldoi8 <u,7,5,0>, <2,7,0,1>
+  2245878429U,	// <5,0,2,u>: Cost 3 vmrghw <5,2,1,3>, LHS
+  2710603926U,	// <5,0,3,0>: Cost 3 vsldoi8 <4,3,5,0>, <3,0,1,2>
+  3827974396U,	// <5,0,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <0,3,1,0>
+  3779037516U,	// <5,0,3,2>: Cost 4 vsldoi8 <3,4,5,0>, <3,2,3,4>
+  3779037596U,	// <5,0,3,3>: Cost 4 vsldoi8 <3,4,5,0>, <3,3,3,3>
+  2705295868U,	// <5,0,3,4>: Cost 3 vsldoi8 <3,4,5,0>, <3,4,5,0>
+  3379726804U,	// <5,0,3,5>: Cost 4 vmrglw <4,0,5,3>, <3,4,0,5>
+  3802925748U,	// <5,0,3,6>: Cost 4 vsldoi8 <7,4,5,0>, <3,6,7,4>
+  3363138168U,	// <5,0,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,0,7>
+  2707950400U,	// <5,0,3,u>: Cost 3 vsldoi8 <3,u,5,0>, <3,u,5,0>
+  2626568294U,	// <5,0,4,0>: Cost 3 vsldoi4 <1,5,0,4>, LHS
+  1680490834U,	// <5,0,4,1>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5>
+  3828048219U,	// <5,0,4,2>: Cost 4 vsldoi12 <0,4,2,5>, <0,4,2,5>
+  2710604932U,	// <5,0,4,3>: Cost 3 vsldoi8 <4,3,5,0>, <4,3,5,0>
+  2754232685U,	// <5,0,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,4,4,5>
+  2705296694U,	// <5,0,4,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS
+  3779038590U,	// <5,0,4,6>: Cost 4 vsldoi8 <3,4,5,0>, <4,6,5,7>
+  2713259464U,	// <5,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0>
+  1680490834U,	// <5,0,4,u>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5>
+  2311307264U,	// <5,0,5,0>: Cost 3 vmrglw <4,u,5,5>, <0,0,0,0>
+  1174437990U,	// <5,0,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS
+  3779038946U,	// <5,0,5,2>: Cost 4 vsldoi8 <3,4,5,0>, <5,2,0,3>
+  3845816752U,	// <5,0,5,3>: Cost 4 vsldoi12 <3,4,0,5>, <0,5,3,0>
+  2248180050U,	// <5,0,5,4>: Cost 3 vmrghw <5,5,5,5>, <0,4,1,5>
+  2248180194U,	// <5,0,5,5>: Cost 3 vmrghw <5,5,5,5>, <0,5,u,5>
+  3779039274U,	// <5,0,5,6>: Cost 4 vsldoi8 <3,4,5,0>, <5,6,0,7>
+  3385051768U,	// <5,0,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,0,7>
+  1174438557U,	// <5,0,5,u>: Cost 2 vmrghw <5,5,5,5>, LHS
+  2302689280U,	// <5,0,6,0>: Cost 3 vmrglw <3,4,5,6>, <0,0,0,0>
+  1175208038U,	// <5,0,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS
+  3787002362U,	// <5,0,6,2>: Cost 4 vsldoi8 <4,7,5,0>, <6,2,7,3>
+  3376432160U,	// <5,0,6,3>: Cost 4 vmrglw <3,4,5,6>, <1,4,0,3>
+  2248950098U,	// <5,0,6,4>: Cost 3 vmrghw <5,6,7,0>, <0,4,1,5>
+  2248950180U,	// <5,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6>
+  3376433702U,	// <5,0,6,6>: Cost 4 vmrglw <3,4,5,6>, <3,5,0,6>
+  2729186166U,	// <5,0,6,7>: Cost 3 vsldoi8 <7,4,5,0>, <6,7,4,5>
+  1175208605U,	// <5,0,6,u>: Cost 2 vmrghw <5,6,7,0>, LHS
+  2713261050U,	// <5,0,7,0>: Cost 3 vsldoi8 <4,7,5,0>, <7,0,1,2>
+  3365823599U,	// <5,0,7,1>: Cost 4 vmrglw <1,6,5,7>, <1,5,0,1>
+  3808900317U,	// <5,0,7,2>: Cost 4 vsldoi8 <u,4,5,0>, <7,2,u,4>
+  3784348899U,	// <5,0,7,3>: Cost 4 vsldoi8 <4,3,5,0>, <7,3,0,1>
+  2729186656U,	// <5,0,7,4>: Cost 3 vsldoi8 <7,4,5,0>, <7,4,5,0>
+  3787003268U,	// <5,0,7,5>: Cost 4 vsldoi8 <4,7,5,0>, <7,5,0,0>
+  3802928664U,	// <5,0,7,6>: Cost 4 vsldoi8 <7,4,5,0>, <7,6,7,4>
+  3787003431U,	// <5,0,7,7>: Cost 4 vsldoi8 <4,7,5,0>, <7,7,0,1>
+  2731841188U,	// <5,0,7,u>: Cost 3 vsldoi8 <7,u,5,0>, <7,u,5,0>
+  2626601062U,	// <5,0,u,0>: Cost 3 vsldoi4 <1,5,0,u>, LHS
+  1683145366U,	// <5,0,u,1>: Cost 2 vsldoi12 <0,u,1,5>, <0,u,1,5>
+  1680491165U,	// <5,0,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  2705295054U,	// <5,0,u,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5>
+  2754233005U,	// <5,0,u,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,u,4,1>
+  2705299610U,	// <5,0,u,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS
+  3779041488U,	// <5,0,u,6>: Cost 4 vsldoi8 <3,4,5,0>, <u,6,3,7>
+  2737150252U,	// <5,0,u,7>: Cost 3 vsldoi8 <u,7,5,0>, <u,7,5,0>
+  1680491219U,	// <5,0,u,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  2713927680U,	// <5,1,0,0>: Cost 3 vsldoi8 <4,u,5,1>, <0,0,0,0>
+  1640185958U,	// <5,1,0,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  2310607866U,	// <5,1,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2>
+  3787669756U,	// <5,1,0,3>: Cost 4 vsldoi8 <4,u,5,1>, <0,3,1,0>
+  2713928018U,	// <5,1,0,4>: Cost 3 vsldoi8 <4,u,5,1>, <0,4,1,5>
+  2306621778U,	// <5,1,0,5>: Cost 3 vmrglw <4,1,5,0>, <0,4,1,5>
+  3787670006U,	// <5,1,0,6>: Cost 4 vsldoi8 <4,u,5,1>, <0,6,1,7>
+  3736188301U,	// <5,1,0,7>: Cost 4 vsldoi4 <7,5,1,0>, <7,5,1,0>
+  1640186525U,	// <5,1,0,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  2650505318U,	// <5,1,1,0>: Cost 3 vsldoi4 <5,5,1,1>, LHS
+  2754233140U,	// <5,1,1,1>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,1,1>
+  2311276694U,	// <5,1,1,2>: Cost 3 vmrglw <4,u,5,1>, <3,0,1,2>
+  2311278315U,	// <5,1,1,3>: Cost 3 vmrglw <4,u,5,1>, <5,2,1,3>
+  2758435667U,	// <5,1,1,4>: Cost 3 vsldoi12 <1,1,4,5>, <1,1,4,5>
+  2754233180U,	// <5,1,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,5,5>
+  3385016497U,	// <5,1,1,6>: Cost 4 vmrglw <4,u,5,1>, <0,2,1,6>
+  2311278643U,	// <5,1,1,7>: Cost 3 vmrglw <4,u,5,1>, <5,6,1,7>
+  2758730615U,	// <5,1,1,u>: Cost 3 vsldoi12 <1,1,u,5>, <1,1,u,5>
+  3700367462U,	// <5,1,2,0>: Cost 4 vsldoi4 <1,5,1,2>, LHS
+  3830629255U,	// <5,1,2,1>: Cost 4 vsldoi12 <0,u,1,5>, <1,2,1,3>
+  2713929320U,	// <5,1,2,2>: Cost 3 vsldoi8 <4,u,5,1>, <2,2,2,2>
+  2754233238U,	// <5,1,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,3,0>
+  2759099300U,	// <5,1,2,4>: Cost 3 vsldoi12 <1,2,4,5>, <1,2,4,5>
+  2754233259U,	// <5,1,2,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,5,3>
+  2713929658U,	// <5,1,2,6>: Cost 3 vsldoi8 <4,u,5,1>, <2,6,3,7>
+  3872359354U,	// <5,1,2,7>: Cost 4 vsldoi12 <7,u,0,5>, <1,2,7,0>
+  2754233283U,	// <5,1,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,u,0>
+  2713929878U,	// <5,1,3,0>: Cost 3 vsldoi8 <4,u,5,1>, <3,0,1,2>
+  3363135498U,	// <5,1,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,1,1>
+  3363137686U,	// <5,1,3,2>: Cost 4 vmrglw <1,2,5,3>, <3,0,1,2>
+  2713930140U,	// <5,1,3,3>: Cost 3 vsldoi8 <4,u,5,1>, <3,3,3,3>
+  2713930242U,	// <5,1,3,4>: Cost 3 vsldoi8 <4,u,5,1>, <3,4,5,6>
+  2289394002U,	// <5,1,3,5>: Cost 3 vmrglw <1,2,5,3>, <0,4,1,5>
+  3787672184U,	// <5,1,3,6>: Cost 4 vsldoi8 <4,u,5,1>, <3,6,0,7>
+  3787672259U,	// <5,1,3,7>: Cost 4 vsldoi8 <4,u,5,1>, <3,7,0,1>
+  2713930526U,	// <5,1,3,u>: Cost 3 vsldoi8 <4,u,5,1>, <3,u,1,2>
+  1634880402U,	// <5,1,4,0>: Cost 2 vsldoi8 <4,0,5,1>, <4,0,5,1>
+  2760205355U,	// <5,1,4,1>: Cost 3 vsldoi12 <1,4,1,5>, <1,4,1,5>
+  2760279092U,	// <5,1,4,2>: Cost 3 vsldoi12 <1,4,2,5>, <1,4,2,5>
+  3787672708U,	// <5,1,4,3>: Cost 4 vsldoi8 <4,u,5,1>, <4,3,5,0>
+  2713930960U,	// <5,1,4,4>: Cost 3 vsldoi8 <4,u,5,1>, <4,4,4,4>
+  1640189238U,	// <5,1,4,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS
+  3786345848U,	// <5,1,4,6>: Cost 4 vsldoi8 <4,6,5,1>, <4,6,5,1>
+  3787009481U,	// <5,1,4,7>: Cost 4 vsldoi8 <4,7,5,1>, <4,7,5,1>
+  1640189466U,	// <5,1,4,u>: Cost 2 vsldoi8 <4,u,5,1>, <4,u,5,1>
+  2754233455U,	// <5,1,5,0>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,0,1>
+  2713931407U,	// <5,1,5,1>: Cost 3 vsldoi8 <4,u,5,1>, <5,1,0,1>
+  2713931499U,	// <5,1,5,2>: Cost 3 vsldoi8 <4,u,5,1>, <5,2,1,3>
+  3827975305U,	// <5,1,5,3>: Cost 4 vsldoi12 <0,4,1,5>, <1,5,3,0>
+  2754233495U,	// <5,1,5,4>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,4,5>
+  2288746834U,	// <5,1,5,5>: Cost 3 vmrglw <1,1,5,5>, <0,4,1,5>
+  2713931827U,	// <5,1,5,6>: Cost 3 vsldoi8 <4,u,5,1>, <5,6,1,7>
+  3787673725U,	// <5,1,5,7>: Cost 4 vsldoi8 <4,u,5,1>, <5,7,1,0>
+  2754233527U,	// <5,1,5,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,u,1>
+  2668462182U,	// <5,1,6,0>: Cost 3 vsldoi4 <u,5,1,6>, LHS
+  2290746002U,	// <5,1,6,1>: Cost 3 vmrglw <1,4,5,6>, <0,u,1,1>
+  2302691478U,	// <5,1,6,2>: Cost 3 vmrglw <3,4,5,6>, <3,0,1,2>
+  3364488071U,	// <5,1,6,3>: Cost 4 vmrglw <1,4,5,6>, <1,2,1,3>
+  2302689536U,	// <5,1,6,4>: Cost 3 vmrglw <3,4,5,6>, <0,3,1,4>
+  2754233587U,	// <5,1,6,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,6,5,7>
+  2713932600U,	// <5,1,6,6>: Cost 3 vsldoi8 <4,u,5,1>, <6,6,6,6>
+  2713932622U,	// <5,1,6,7>: Cost 3 vsldoi8 <4,u,5,1>, <6,7,0,1>
+  2302689297U,	// <5,1,6,u>: Cost 3 vmrglw <3,4,5,6>, <0,0,1,u>
+  2713932794U,	// <5,1,7,0>: Cost 3 vsldoi8 <4,u,5,1>, <7,0,1,2>
+  3365822474U,	// <5,1,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,1,1>
+  3365824662U,	// <5,1,7,2>: Cost 4 vmrglw <1,6,5,7>, <3,0,1,2>
+  3787674851U,	// <5,1,7,3>: Cost 4 vsldoi8 <4,u,5,1>, <7,3,0,1>
+  2713933158U,	// <5,1,7,4>: Cost 3 vsldoi8 <4,u,5,1>, <7,4,5,6>
+  2292080978U,	// <5,1,7,5>: Cost 3 vmrglw <1,6,5,7>, <0,4,1,5>
+  3365823613U,	// <5,1,7,6>: Cost 4 vmrglw <1,6,5,7>, <1,5,1,6>
+  2713933420U,	// <5,1,7,7>: Cost 3 vsldoi8 <4,u,5,1>, <7,7,7,7>
+  2713933442U,	// <5,1,7,u>: Cost 3 vsldoi8 <4,u,5,1>, <7,u,1,2>
+  1658771190U,	// <5,1,u,0>: Cost 2 vsldoi8 <u,0,5,1>, <u,0,5,1>
+  1640191790U,	// <5,1,u,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  2762933624U,	// <5,1,u,2>: Cost 3 vsldoi12 <1,u,2,5>, <1,u,2,5>
+  2754233724U,	// <5,1,u,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,u,3,0>
+  2763081098U,	// <5,1,u,4>: Cost 3 vsldoi12 <1,u,4,5>, <1,u,4,5>
+  1640192154U,	// <5,1,u,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS
+  2713934032U,	// <5,1,u,6>: Cost 3 vsldoi8 <4,u,5,1>, <u,6,3,7>
+  2713934080U,	// <5,1,u,7>: Cost 3 vsldoi8 <4,u,5,1>, <u,7,0,1>
+  1640192357U,	// <5,1,u,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  3779051520U,	// <5,2,0,0>: Cost 4 vsldoi8 <3,4,5,2>, <0,0,0,0>
+  2705309798U,	// <5,2,0,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS
+  3838813637U,	// <5,2,0,2>: Cost 4 vsldoi12 <2,2,4,5>, <2,0,2,1>
+  2302640230U,	// <5,2,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS
+  3765117266U,	// <5,2,0,4>: Cost 4 vsldoi8 <1,1,5,2>, <0,4,1,5>
+  3381027892U,	// <5,2,0,5>: Cost 4 vmrglw <4,2,5,0>, <1,4,2,5>
+  3842794985U,	// <5,2,0,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,0,6,1>
+  3408232554U,	// <5,2,0,7>: Cost 4 vmrglw <u,7,5,0>, <0,1,2,7>
+  2302640235U,	// <5,2,0,u>: Cost 3 vmrglw <3,4,5,0>, LHS
+  3700432998U,	// <5,2,1,0>: Cost 4 vsldoi4 <1,5,2,1>, LHS
+  3765117785U,	// <5,2,1,1>: Cost 4 vsldoi8 <1,1,5,2>, <1,1,5,2>
+  2311276136U,	// <5,2,1,2>: Cost 3 vmrglw <4,u,5,1>, <2,2,2,2>
+  1237532774U,	// <5,2,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS
+  3700436278U,	// <5,2,1,4>: Cost 4 vsldoi4 <1,5,2,1>, RHS
+  3381036084U,	// <5,2,1,5>: Cost 4 vmrglw <4,2,5,1>, <1,4,2,5>
+  3385018045U,	// <5,2,1,6>: Cost 4 vmrglw <4,u,5,1>, <2,3,2,6>
+  3385017560U,	// <5,2,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,2,7>
+  1237532779U,	// <5,2,1,u>: Cost 2 vmrglw <4,u,5,1>, LHS
+  3700441190U,	// <5,2,2,0>: Cost 4 vsldoi4 <1,5,2,2>, LHS
+  3700442242U,	// <5,2,2,1>: Cost 4 vsldoi4 <1,5,2,2>, <1,5,2,2>
+  2754233960U,	// <5,2,2,2>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,2,2>
+  2754233970U,	// <5,2,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,3,3>
+  2765071997U,	// <5,2,2,4>: Cost 3 vsldoi12 <2,2,4,5>, <2,2,4,5>
+  3834021508U,	// <5,2,2,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,2,5,3>
+  3842795152U,	// <5,2,2,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,2,6,6>
+  3376402492U,	// <5,2,2,7>: Cost 4 vmrglw <3,4,5,2>, <5,6,2,7>
+  2754234015U,	// <5,2,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,u,3>
+  2754234022U,	// <5,2,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,3,0,1>
+  3827975855U,	// <5,2,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <2,3,1,1>
+  2644625102U,	// <5,2,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5>
+  2289393766U,	// <5,2,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS
+  1691993806U,	// <5,2,3,4>: Cost 2 vsldoi12 <2,3,4,5>, <2,3,4,5>
+  2785052375U,	// <5,2,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <2,3,5,5>
+  3854812897U,	// <5,2,3,6>: Cost 4 vsldoi12 <4,u,5,5>, <2,3,6,6>
+  3802942187U,	// <5,2,3,7>: Cost 4 vsldoi8 <7,4,5,2>, <3,7,4,5>
+  1692288754U,	// <5,2,3,u>: Cost 2 vsldoi12 <2,3,u,5>, <2,3,u,5>
+  3839846139U,	// <5,2,4,0>: Cost 4 vsldoi12 <2,4,0,5>, <2,4,0,5>
+  2709294052U,	// <5,2,4,1>: Cost 3 vsldoi8 <4,1,5,2>, <4,1,5,2>
+  2766251789U,	// <5,2,4,2>: Cost 3 vsldoi12 <2,4,2,5>, <2,4,2,5>
+  2765735702U,	// <5,2,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,3,5>
+  3840141087U,	// <5,2,4,4>: Cost 4 vsldoi12 <2,4,4,5>, <2,4,4,5>
+  2705313078U,	// <5,2,4,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS
+  2712612217U,	// <5,2,4,6>: Cost 3 vsldoi8 <4,6,5,2>, <4,6,5,2>
+  3787017674U,	// <5,2,4,7>: Cost 4 vsldoi8 <4,7,5,2>, <4,7,5,2>
+  2765735747U,	// <5,2,4,u>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,u,5>
+  3834021704U,	// <5,2,5,0>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,0,1>
+  3834021714U,	// <5,2,5,1>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,1,2>
+  2311308904U,	// <5,2,5,2>: Cost 3 vmrglw <4,u,5,5>, <2,2,2,2>
+  1237565542U,	// <5,2,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS
+  3834021744U,	// <5,2,5,4>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,4,5>
+  3369124916U,	// <5,2,5,5>: Cost 4 vmrglw <2,2,5,5>, <1,4,2,5>
+  2248181690U,	// <5,2,5,6>: Cost 3 vmrghw <5,5,5,5>, <2,6,3,7>
+  3786354825U,	// <5,2,5,7>: Cost 4 vsldoi8 <4,6,5,2>, <5,7,2,3>
+  1237565547U,	// <5,2,5,u>: Cost 2 vmrglw <4,u,5,5>, LHS
+  3700473958U,	// <5,2,6,0>: Cost 4 vsldoi4 <1,5,2,6>, LHS
+  3700475014U,	// <5,2,6,1>: Cost 4 vsldoi4 <1,5,2,6>, <1,5,2,6>
+  2296718952U,	// <5,2,6,2>: Cost 3 vmrglw <2,4,5,6>, <2,2,2,2>
+  1228947558U,	// <5,2,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS
+  3700477238U,	// <5,2,6,4>: Cost 4 vsldoi4 <1,5,2,6>, RHS
+  3834021836U,	// <5,2,6,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,6,5,7>
+  2248951738U,	// <5,2,6,6>: Cost 3 vmrghw <5,6,7,0>, <2,6,3,7>
+  3370461105U,	// <5,2,6,7>: Cost 4 vmrglw <2,4,5,6>, <2,6,2,7>
+  1228947563U,	// <5,2,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS
+  3786355706U,	// <5,2,7,0>: Cost 4 vsldoi8 <4,6,5,2>, <7,0,1,2>
+  3783038037U,	// <5,2,7,1>: Cost 4 vsldoi8 <4,1,5,2>, <7,1,2,3>
+  3365824104U,	// <5,2,7,2>: Cost 4 vmrglw <1,6,5,7>, <2,2,2,2>
+  2292080742U,	// <5,2,7,3>: Cost 3 vmrglw <1,6,5,7>, LHS
+  3842131986U,	// <5,2,7,4>: Cost 4 vsldoi12 <2,7,4,5>, <2,7,4,5>
+  3371795508U,	// <5,2,7,5>: Cost 4 vmrglw <2,6,5,7>, <1,4,2,5>
+  3786356206U,	// <5,2,7,6>: Cost 4 vsldoi8 <4,6,5,2>, <7,6,2,7>
+  3786356332U,	// <5,2,7,7>: Cost 4 vsldoi8 <4,6,5,2>, <7,7,7,7>
+  2292080747U,	// <5,2,7,u>: Cost 3 vmrglw <1,6,5,7>, LHS
+  2754234427U,	// <5,2,u,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,u,0,1>
+  2705315630U,	// <5,2,u,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS
+  2296735336U,	// <5,2,u,2>: Cost 3 vmrglw <2,4,5,u>, <2,2,2,2>
+  1228963942U,	// <5,2,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS
+  1695311971U,	// <5,2,u,4>: Cost 2 vsldoi12 <2,u,4,5>, <2,u,4,5>
+  2705315994U,	// <5,2,u,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS
+  2769201269U,	// <5,2,u,6>: Cost 3 vsldoi12 <2,u,6,5>, <2,u,6,5>
+  3370477489U,	// <5,2,u,7>: Cost 4 vmrglw <2,4,5,u>, <2,6,2,7>
+  1695606919U,	// <5,2,u,u>: Cost 2 vsldoi12 <2,u,u,5>, <2,u,u,5>
+  3827976331U,	// <5,3,0,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,0,0,0>
+  2754234518U,	// <5,3,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,1,2>
+  3706472290U,	// <5,3,0,2>: Cost 4 vsldoi4 <2,5,3,0>, <2,5,3,0>
+  3700500630U,	// <5,3,0,3>: Cost 4 vsldoi4 <1,5,3,0>, <3,0,1,2>
+  2754234544U,	// <5,3,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,4,1>
+  3376383766U,	// <5,3,0,5>: Cost 4 vmrglw <3,4,5,0>, <2,4,3,5>
+  3769770513U,	// <5,3,0,6>: Cost 5 vsldoi8 <1,u,5,3>, <0,6,4,7>
+  3376383930U,	// <5,3,0,7>: Cost 4 vmrglw <3,4,5,0>, <2,6,3,7>
+  2754234581U,	// <5,3,0,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,u,2>
+  2311275414U,	// <5,3,1,0>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,0>
+  2305967971U,	// <5,3,1,1>: Cost 3 vmrglw <4,0,5,1>, <2,5,3,1>
+  2692047787U,	// <5,3,1,2>: Cost 3 vsldoi8 <1,2,5,3>, <1,2,5,3>
+  2311276146U,	// <5,3,1,3>: Cost 3 vmrglw <4,u,5,1>, <2,2,3,3>
+  2311275418U,	// <5,3,1,4>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,4>
+  3765789807U,	// <5,3,1,5>: Cost 4 vsldoi8 <1,2,5,3>, <1,5,0,1>
+  3765789939U,	// <5,3,1,6>: Cost 4 vsldoi8 <1,2,5,3>, <1,6,5,7>
+  2311276474U,	// <5,3,1,7>: Cost 3 vmrglw <4,u,5,1>, <2,6,3,7>
+  2696029585U,	// <5,3,1,u>: Cost 3 vsldoi8 <1,u,5,3>, <1,u,5,3>
+  2311288709U,	// <5,3,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0>
+  3765790243U,	// <5,3,2,1>: Cost 4 vsldoi8 <1,2,5,3>, <2,1,3,5>
+  3827976513U,	// <5,3,2,2>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,2,2>
+  2765736268U,	// <5,3,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <3,2,3,4>
+  2246248962U,	// <5,3,2,4>: Cost 3 vmrghw <5,2,6,3>, <3,4,5,6>
+  3765790563U,	// <5,3,2,5>: Cost 4 vsldoi8 <1,2,5,3>, <2,5,3,1>
+  3827976550U,	// <5,3,2,6>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,6,3>
+  3842795887U,	// <5,3,2,7>: Cost 4 vsldoi12 <2,u,4,5>, <3,2,7,3>
+  2769054073U,	// <5,3,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <3,2,u,4>
+  3827976575U,	// <5,3,3,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,3,0,1>
+  3765790963U,	// <5,3,3,1>: Cost 4 vsldoi8 <1,2,5,3>, <3,1,2,5>
+  3839478162U,	// <5,3,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <3,3,2,2>
+  2754234780U,	// <5,3,3,3>: Cost 3 vsldoi12 <0,4,1,5>, <3,3,3,3>
+  2771708327U,	// <5,3,3,4>: Cost 3 vsldoi12 <3,3,4,5>, <3,3,4,5>
+  3363137059U,	// <5,3,3,5>: Cost 4 vmrglw <1,2,5,3>, <2,1,3,5>
+  3375081320U,	// <5,3,3,6>: Cost 4 vmrglw <3,2,5,3>, <2,5,3,6>
+  3363137466U,	// <5,3,3,7>: Cost 4 vmrglw <1,2,5,3>, <2,6,3,7>
+  2772003275U,	// <5,3,3,u>: Cost 3 vsldoi12 <3,3,u,5>, <3,3,u,5>
+  2772077012U,	// <5,3,4,0>: Cost 3 vsldoi12 <3,4,0,5>, <3,4,0,5>
+  3765791714U,	// <5,3,4,1>: Cost 4 vsldoi8 <1,2,5,3>, <4,1,5,0>
+  2709965878U,	// <5,3,4,2>: Cost 3 vsldoi8 <4,2,5,3>, <4,2,5,3>
+  2772298223U,	// <5,3,4,3>: Cost 3 vsldoi12 <3,4,3,5>, <3,4,3,5>
+  2772371960U,	// <5,3,4,4>: Cost 3 vsldoi12 <3,4,4,5>, <3,4,4,5>
+  2754234882U,	// <5,3,4,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,5,6>
+  3839478282U,	// <5,3,4,6>: Cost 4 vsldoi12 <2,3,4,5>, <3,4,6,5>
+  3376416698U,	// <5,3,4,7>: Cost 4 vmrglw <3,4,5,4>, <2,6,3,7>
+  2754234909U,	// <5,3,4,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,u,6>
+  2311308182U,	// <5,3,5,0>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,0>
+  3765792421U,	// <5,3,5,1>: Cost 4 vsldoi8 <1,2,5,3>, <5,1,2,5>
+  2715938575U,	// <5,3,5,2>: Cost 3 vsldoi8 <5,2,5,3>, <5,2,5,3>
+  2311308914U,	// <5,3,5,3>: Cost 3 vmrglw <4,u,5,5>, <2,2,3,3>
+  2311308186U,	// <5,3,5,4>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,4>
+  2248182354U,	// <5,3,5,5>: Cost 3 vmrghw <5,5,5,5>, <3,5,5,5>
+  3765792837U,	// <5,3,5,6>: Cost 4 vsldoi8 <1,2,5,3>, <5,6,3,7>
+  2311309242U,	// <5,3,5,7>: Cost 3 vmrglw <4,u,5,5>, <2,6,3,7>
+  2311308190U,	// <5,3,5,u>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,u>
+  2632777830U,	// <5,3,6,0>: Cost 3 vsldoi4 <2,5,3,6>, LHS
+  3706520372U,	// <5,3,6,1>: Cost 4 vsldoi4 <2,5,3,6>, <1,1,1,1>
+  2632779624U,	// <5,3,6,2>: Cost 3 vsldoi4 <2,5,3,6>, <2,5,3,6>
+  2632780290U,	// <5,3,6,3>: Cost 3 vsldoi4 <2,5,3,6>, <3,4,5,6>
+  2632781110U,	// <5,3,6,4>: Cost 3 vsldoi4 <2,5,3,6>, RHS
+  2248952413U,	// <5,3,6,5>: Cost 3 vmrghw <5,6,7,0>, <3,5,6,7>
+  2302691176U,	// <5,3,6,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6>
+  2302691258U,	// <5,3,6,7>: Cost 3 vmrglw <3,4,5,6>, <2,6,3,7>
+  2632783662U,	// <5,3,6,u>: Cost 3 vsldoi4 <2,5,3,6>, LHS
+  3365823382U,	// <5,3,7,0>: Cost 4 vmrglw <1,6,5,7>, <1,2,3,0>
+  3706529011U,	// <5,3,7,1>: Cost 4 vsldoi4 <2,5,3,7>, <1,6,5,7>
+  3706529641U,	// <5,3,7,2>: Cost 4 vsldoi4 <2,5,3,7>, <2,5,3,7>
+  3365824114U,	// <5,3,7,3>: Cost 4 vmrglw <1,6,5,7>, <2,2,3,3>
+  2774362859U,	// <5,3,7,4>: Cost 3 vsldoi12 <3,7,4,5>, <3,7,4,5>
+  3365824035U,	// <5,3,7,5>: Cost 4 vmrglw <1,6,5,7>, <2,1,3,5>
+  3383740183U,	// <5,3,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,4,3,6>
+  3363833786U,	// <5,3,7,7>: Cost 4 vmrglw <1,3,5,7>, <2,6,3,7>
+  2774657807U,	// <5,3,7,u>: Cost 3 vsldoi12 <3,7,u,5>, <3,7,u,5>
+  2632794214U,	// <5,3,u,0>: Cost 3 vsldoi4 <2,5,3,u>, LHS
+  2754235166U,	// <5,3,u,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,1,2>
+  2632796010U,	// <5,3,u,2>: Cost 3 vsldoi4 <2,5,3,u>, <2,5,3,u>
+  2632796676U,	// <5,3,u,3>: Cost 3 vsldoi4 <2,5,3,u>, <3,4,5,u>
+  2632797494U,	// <5,3,u,4>: Cost 3 vsldoi4 <2,5,3,u>, RHS
+  2754235206U,	// <5,3,u,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,5,6>
+  2302691176U,	// <5,3,u,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6>
+  2302707642U,	// <5,3,u,7>: Cost 3 vmrglw <3,4,5,u>, <2,6,3,7>
+  2754235229U,	// <5,3,u,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,u,2>
+  3765133325U,	// <5,4,0,0>: Cost 4 vsldoi8 <1,1,5,4>, <0,0,1,4>
+  2705326182U,	// <5,4,0,1>: Cost 3 vsldoi8 <3,4,5,4>, LHS
+  3718489806U,	// <5,4,0,2>: Cost 4 vsldoi4 <4,5,4,0>, <2,3,4,5>
+  3718490624U,	// <5,4,0,3>: Cost 4 vsldoi4 <4,5,4,0>, <3,4,5,4>
+  2709307730U,	// <5,4,0,4>: Cost 3 vsldoi8 <4,1,5,4>, <0,4,1,5>
+  2302641870U,	// <5,4,0,5>: Cost 3 vmrglw <3,4,5,0>, <2,3,4,5>
+  3376383695U,	// <5,4,0,6>: Cost 5 vmrglw <3,4,5,0>, <2,3,4,6>
+  3384351018U,	// <5,4,0,7>: Cost 4 vmrglw <4,7,5,0>, <u,7,4,7>
+  2705326749U,	// <5,4,0,u>: Cost 3 vsldoi8 <3,4,5,4>, LHS
+  2305971057U,	// <5,4,1,0>: Cost 3 vmrglw <4,0,5,1>, <6,7,4,0>
+  3765134171U,	// <5,4,1,1>: Cost 4 vsldoi8 <1,1,5,4>, <1,1,5,4>
+  3766461338U,	// <5,4,1,2>: Cost 4 vsldoi8 <1,3,5,4>, <1,2,3,4>
+  3766461437U,	// <5,4,1,3>: Cost 4 vsldoi8 <1,3,5,4>, <1,3,5,4>
+  2311277776U,	// <5,4,1,4>: Cost 3 vmrglw <4,u,5,1>, <4,4,4,4>
+  2754235362U,	// <5,4,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <4,1,5,0>
+  3783050483U,	// <5,4,1,6>: Cost 4 vsldoi8 <4,1,5,4>, <1,6,5,7>
+  3385019036U,	// <5,4,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,4,7>
+  2311276241U,	// <5,4,1,u>: Cost 3 vmrglw <4,u,5,1>, <2,3,4,u>
+  3718504550U,	// <5,4,2,0>: Cost 4 vsldoi4 <4,5,4,2>, LHS
+  3783050787U,	// <5,4,2,1>: Cost 4 vsldoi8 <4,1,5,4>, <2,1,3,5>
+  3773097576U,	// <5,4,2,2>: Cost 4 vsldoi8 <2,4,5,4>, <2,2,2,2>
+  2705327822U,	// <5,4,2,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5>
+  3773097767U,	// <5,4,2,4>: Cost 4 vsldoi8 <2,4,5,4>, <2,4,5,4>
+  2765737014U,	// <5,4,2,5>: Cost 3 vsldoi12 <2,3,4,5>, <4,2,5,3>
+  3779069882U,	// <5,4,2,6>: Cost 4 vsldoi8 <3,4,5,4>, <2,6,3,7>
+  3376401052U,	// <5,4,2,7>: Cost 5 vmrglw <3,4,5,2>, <3,6,4,7>
+  2245881370U,	// <5,4,2,u>: Cost 3 vmrghw <5,2,1,3>, <4,u,5,1>
+  3779070102U,	// <5,4,3,0>: Cost 4 vsldoi8 <3,4,5,4>, <3,0,1,2>
+  3363135525U,	// <5,4,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,4,1>
+  3779070284U,	// <5,4,3,2>: Cost 4 vsldoi8 <3,4,5,4>, <3,2,3,4>
+  3779070364U,	// <5,4,3,3>: Cost 4 vsldoi8 <3,4,5,4>, <3,3,3,3>
+  2705328640U,	// <5,4,3,4>: Cost 3 vsldoi8 <3,4,5,4>, <3,4,5,4>
+  2307311310U,	// <5,4,3,5>: Cost 3 vmrglw <4,2,5,3>, <2,3,4,5>
+  3866021012U,	// <5,4,3,6>: Cost 4 vsldoi12 <6,7,4,5>, <4,3,6,7>
+  3363138204U,	// <5,4,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,4,7>
+  2707983172U,	// <5,4,3,u>: Cost 3 vsldoi8 <3,u,5,4>, <3,u,5,4>
+  2708646805U,	// <5,4,4,0>: Cost 3 vsldoi8 <4,0,5,4>, <4,0,5,4>
+  2709310438U,	// <5,4,4,1>: Cost 3 vsldoi8 <4,1,5,4>, <4,1,5,4>
+  3779071030U,	// <5,4,4,2>: Cost 4 vsldoi8 <3,4,5,4>, <4,2,5,3>
+  2710637704U,	// <5,4,4,3>: Cost 3 vsldoi8 <4,3,5,4>, <4,3,5,4>
+  2754235600U,	// <5,4,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <4,4,4,4>
+  1704676570U,	// <5,4,4,5>: Cost 2 vsldoi12 <4,4,5,5>, <4,4,5,5>
+  3779071358U,	// <5,4,4,6>: Cost 4 vsldoi8 <3,4,5,4>, <4,6,5,7>
+  2713292236U,	// <5,4,4,7>: Cost 3 vsldoi8 <4,7,5,4>, <4,7,5,4>
+  1704897781U,	// <5,4,4,u>: Cost 2 vsldoi12 <4,4,u,5>, <4,4,u,5>
+  2626871398U,	// <5,4,5,0>: Cost 3 vsldoi4 <1,5,4,5>, LHS
+  2626872471U,	// <5,4,5,1>: Cost 3 vsldoi4 <1,5,4,5>, <1,5,4,5>
+  2765737230U,	// <5,4,5,2>: Cost 3 vsldoi12 <2,3,4,5>, <4,5,2,3>
+  3700615318U,	// <5,4,5,3>: Cost 4 vsldoi4 <1,5,4,5>, <3,0,1,2>
+  2626874678U,	// <5,4,5,4>: Cost 3 vsldoi4 <1,5,4,5>, RHS
+  1174441270U,	// <5,4,5,5>: Cost 2 vmrghw <5,5,5,5>, RHS
+  1680493878U,	// <5,4,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  3385051804U,	// <5,4,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,4,7>
+  1680493896U,	// <5,4,5,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  2248952722U,	// <5,4,6,0>: Cost 3 vmrghw <5,6,7,0>, <4,0,5,1>
+  2302692152U,	// <5,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1>
+  3382406107U,	// <5,4,6,2>: Cost 4 vmrglw <4,4,5,6>, <4,1,4,2>
+  3700623874U,	// <5,4,6,3>: Cost 4 vsldoi4 <1,5,4,6>, <3,4,5,6>
+  2248953040U,	// <5,4,6,4>: Cost 3 vmrghw <5,6,7,0>, <4,4,4,4>
+  1175211318U,	// <5,4,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS
+  3376432280U,	// <5,4,6,6>: Cost 4 vmrglw <3,4,5,6>, <1,5,4,6>
+  2729218934U,	// <5,4,6,7>: Cost 3 vsldoi8 <7,4,5,4>, <6,7,4,5>
+  1175211561U,	// <5,4,6,u>: Cost 2 vmrghw <5,6,7,0>, RHS
+  3787035642U,	// <5,4,7,0>: Cost 4 vsldoi8 <4,7,5,4>, <7,0,1,2>
+  3365822501U,	// <5,4,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,4,1>
+  3808933085U,	// <5,4,7,2>: Cost 4 vsldoi8 <u,4,5,4>, <7,2,u,4>
+  3784381707U,	// <5,4,7,3>: Cost 4 vsldoi8 <4,3,5,4>, <7,3,4,5>
+  2713294182U,	// <5,4,7,4>: Cost 3 vsldoi8 <4,7,5,4>, <7,4,5,6>
+  2309998286U,	// <5,4,7,5>: Cost 3 vmrglw <4,6,5,7>, <2,3,4,5>
+  3383740111U,	// <5,4,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,3,4,6>
+  3787036239U,	// <5,4,7,7>: Cost 4 vsldoi8 <4,7,5,4>, <7,7,4,5>
+  2731873960U,	// <5,4,7,u>: Cost 3 vsldoi8 <7,u,5,4>, <7,u,5,4>
+  2626895974U,	// <5,4,u,0>: Cost 3 vsldoi4 <1,5,4,u>, LHS
+  2626897050U,	// <5,4,u,1>: Cost 3 vsldoi4 <1,5,4,u>, <1,5,4,u>
+  2644813518U,	// <5,4,u,2>: Cost 3 vsldoi4 <4,5,4,u>, <2,3,4,5>
+  2705327822U,	// <5,4,u,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5>
+  2626899254U,	// <5,4,u,4>: Cost 3 vsldoi4 <1,5,4,u>, RHS
+  1707331102U,	// <5,4,u,5>: Cost 2 vsldoi12 <4,u,5,5>, <4,u,5,5>
+  1680494121U,	// <5,4,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  2737183024U,	// <5,4,u,7>: Cost 3 vsldoi8 <u,7,5,4>, <u,7,5,4>
+  1680494139U,	// <5,4,u,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  2302642684U,	// <5,5,0,0>: Cost 3 vmrglw <3,4,5,0>, <3,4,5,0>
+  1640218726U,	// <5,5,0,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+  3376384510U,	// <5,5,0,2>: Cost 4 vmrglw <3,4,5,0>, <3,4,5,2>
+  3376385078U,	// <5,5,0,3>: Cost 4 vmrglw <3,4,5,0>, <4,2,5,3>
+  2754236002U,	// <5,5,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <5,0,4,1>
+  2717942242U,	// <5,5,0,5>: Cost 3 vsldoi8 <5,5,5,5>, <0,5,u,5>
+  2244907106U,	// <5,5,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0>
+  3376385406U,	// <5,5,0,7>: Cost 4 vmrglw <3,4,5,0>, <4,6,5,7>
+  1640219293U,	// <5,5,0,u>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+  2305969365U,	// <5,5,1,0>: Cost 3 vmrglw <4,0,5,1>, <4,4,5,0>
+  1237536282U,	// <5,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  2713961366U,	// <5,5,1,2>: Cost 3 vsldoi8 <4,u,5,5>, <1,2,3,0>
+  3766469630U,	// <5,5,1,3>: Cost 4 vsldoi8 <1,3,5,5>, <1,3,5,5>
+  2782326455U,	// <5,5,1,4>: Cost 3 vsldoi12 <5,1,4,5>, <5,1,4,5>
+  2311277786U,	// <5,5,1,5>: Cost 3 vmrglw <4,u,5,1>, <4,4,5,5>
+  2311277058U,	// <5,5,1,6>: Cost 3 vmrglw <4,u,5,1>, <3,4,5,6>
+  3385017587U,	// <5,5,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,5,7>
+  1237536282U,	// <5,5,1,u>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  3376400892U,	// <5,5,2,0>: Cost 4 vmrglw <3,4,5,2>, <3,4,5,0>
+  3827977963U,	// <5,5,2,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,2,1,3>
+  2302659070U,	// <5,5,2,2>: Cost 3 vmrglw <3,4,5,2>, <3,4,5,2>
+  2765737726U,	// <5,5,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <5,2,3,4>
+  3839479558U,	// <5,5,2,4>: Cost 4 vsldoi12 <2,3,4,5>, <5,2,4,3>
+  2781073167U,	// <5,5,2,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,2,5,3>
+  2713962426U,	// <5,5,2,6>: Cost 3 vsldoi8 <4,u,5,5>, <2,6,3,7>
+  3376401790U,	// <5,5,2,7>: Cost 4 vmrglw <3,4,5,2>, <4,6,5,7>
+  2769055531U,	// <5,5,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <5,2,u,4>
+  2713962646U,	// <5,5,3,0>: Cost 3 vsldoi8 <4,u,5,5>, <3,0,1,2>
+  3765143786U,	// <5,5,3,1>: Cost 4 vsldoi8 <1,1,5,5>, <3,1,1,5>
+  3839479621U,	// <5,5,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,3,2,3>
+  2289394603U,	// <5,5,3,3>: Cost 3 vmrglw <1,2,5,3>, <1,2,5,3>
+  2713963010U,	// <5,5,3,4>: Cost 3 vsldoi8 <4,u,5,5>, <3,4,5,6>
+  2313285150U,	// <5,5,3,5>: Cost 3 vmrglw <5,2,5,3>, <4,u,5,5>
+  3363138050U,	// <5,5,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,5,6>
+  3363136755U,	// <5,5,3,7>: Cost 4 vmrglw <1,2,5,3>, <1,6,5,7>
+  2713963294U,	// <5,5,3,u>: Cost 3 vsldoi8 <4,u,5,5>, <3,u,1,2>
+  2713963410U,	// <5,5,4,0>: Cost 3 vsldoi8 <4,u,5,5>, <4,0,5,1>
+  3827978127U,	// <5,5,4,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,4,1,5>
+  3839479704U,	// <5,5,4,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,4,2,5>
+  3376417846U,	// <5,5,4,3>: Cost 4 vmrglw <3,4,5,4>, <4,2,5,3>
+  1637567706U,	// <5,5,4,4>: Cost 2 vsldoi8 <4,4,5,5>, <4,4,5,5>
+  1640222006U,	// <5,5,4,5>: Cost 2 vsldoi8 <4,u,5,5>, RHS
+  2310640998U,	// <5,5,4,6>: Cost 3 vmrglw <4,7,5,4>, <7,4,5,6>
+  3376418174U,	// <5,5,4,7>: Cost 4 vmrglw <3,4,5,4>, <4,6,5,7>
+  1640222238U,	// <5,5,4,u>: Cost 2 vsldoi8 <4,u,5,5>, <4,u,5,5>
+  1577091174U,	// <5,5,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+  2311310226U,	// <5,5,5,1>: Cost 3 vmrglw <4,u,5,5>, <4,0,5,1>
+  2713964303U,	// <5,5,5,2>: Cost 3 vsldoi8 <4,u,5,5>, <5,2,5,3>
+  2311311119U,	// <5,5,5,3>: Cost 3 vmrglw <4,u,5,5>, <5,2,5,3>
+  1577094454U,	// <5,5,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+  296144182U,	// <5,5,5,5>: Cost 1 vspltisw1 RHS
+  2311309826U,	// <5,5,5,6>: Cost 3 vmrglw <4,u,5,5>, <3,4,5,6>
+  2311311447U,	// <5,5,5,7>: Cost 3 vmrglw <4,u,5,5>, <5,6,5,7>
+  296144182U,	// <5,5,5,u>: Cost 1 vspltisw1 RHS
+  2248953460U,	// <5,5,6,0>: Cost 3 vmrghw <5,6,7,0>, <5,0,6,1>
+  2326580114U,	// <5,5,6,1>: Cost 3 vmrglw <7,4,5,6>, <4,0,5,1>
+  2713965050U,	// <5,5,6,2>: Cost 3 vsldoi8 <4,u,5,5>, <6,2,7,3>
+  3700697602U,	// <5,5,6,3>: Cost 4 vsldoi4 <1,5,5,6>, <3,4,5,6>
+  2785644620U,	// <5,5,6,4>: Cost 3 vsldoi12 <5,6,4,5>, <5,6,4,5>
+  2781073495U,	// <5,5,6,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,6,5,7>
+  1228950018U,	// <5,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  2713965390U,	// <5,5,6,7>: Cost 3 vsldoi8 <4,u,5,5>, <6,7,0,1>
+  1228950018U,	// <5,5,6,u>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  2713965562U,	// <5,5,7,0>: Cost 3 vsldoi8 <4,u,5,5>, <7,0,1,2>
+  3383741330U,	// <5,5,7,1>: Cost 4 vmrglw <4,6,5,7>, <4,0,5,1>
+  3718620878U,	// <5,5,7,2>: Cost 4 vsldoi4 <4,5,5,7>, <2,3,4,5>
+  3365823403U,	// <5,5,7,3>: Cost 4 vmrglw <1,6,5,7>, <1,2,5,3>
+  2713965926U,	// <5,5,7,4>: Cost 3 vsldoi8 <4,u,5,5>, <7,4,5,6>
+  2717947318U,	// <5,5,7,5>: Cost 3 vsldoi8 <5,5,5,5>, <7,5,5,5>
+  3365825026U,	// <5,5,7,6>: Cost 4 vmrglw <1,6,5,7>, <3,4,5,6>
+  2292081907U,	// <5,5,7,7>: Cost 3 vmrglw <1,6,5,7>, <1,6,5,7>
+  2713966210U,	// <5,5,7,u>: Cost 3 vsldoi8 <4,u,5,5>, <7,u,1,2>
+  1577091174U,	// <5,5,u,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+  1640224558U,	// <5,5,u,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+  2713966469U,	// <5,5,u,2>: Cost 3 vsldoi8 <4,u,5,5>, <u,2,3,0>
+  2713966524U,	// <5,5,u,3>: Cost 3 vsldoi8 <4,u,5,5>, <u,3,0,1>
+  1577094454U,	// <5,5,u,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+  296144182U,	// <5,5,u,5>: Cost 1 vspltisw1 RHS
+  1228950018U,	// <5,5,u,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  2713966848U,	// <5,5,u,7>: Cost 3 vsldoi8 <4,u,5,5>, <u,7,0,1>
+  296144182U,	// <5,5,u,u>: Cost 1 vspltisw1 RHS
+  2705342464U,	// <5,6,0,0>: Cost 3 vsldoi8 <3,4,5,6>, <0,0,0,0>
+  1631600742U,	// <5,6,0,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+  3773112493U,	// <5,6,0,2>: Cost 4 vsldoi8 <2,4,5,6>, <0,2,1,2>
+  2705342720U,	// <5,6,0,3>: Cost 3 vsldoi8 <3,4,5,6>, <0,3,1,4>
+  2705342802U,	// <5,6,0,4>: Cost 3 vsldoi8 <3,4,5,6>, <0,4,1,5>
+  3779084708U,	// <5,6,0,5>: Cost 4 vsldoi8 <3,4,5,6>, <0,5,1,6>
+  3779084790U,	// <5,6,0,6>: Cost 4 vsldoi8 <3,4,5,6>, <0,6,1,7>
+  2302643510U,	// <5,6,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS
+  1631601309U,	// <5,6,0,u>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+  3767141092U,	// <5,6,1,0>: Cost 4 vsldoi8 <1,4,5,6>, <1,0,1,2>
+  2705343284U,	// <5,6,1,1>: Cost 3 vsldoi8 <3,4,5,6>, <1,1,1,1>
+  2705343382U,	// <5,6,1,2>: Cost 3 vsldoi8 <3,4,5,6>, <1,2,3,0>
+  3779085282U,	// <5,6,1,3>: Cost 4 vsldoi8 <3,4,5,6>, <1,3,2,4>
+  2693399632U,	// <5,6,1,4>: Cost 3 vsldoi8 <1,4,5,6>, <1,4,5,6>
+  3767805089U,	// <5,6,1,5>: Cost 4 vsldoi8 <1,5,5,6>, <1,5,5,6>
+  2311279416U,	// <5,6,1,6>: Cost 3 vmrglw <4,u,5,1>, <6,6,6,6>
+  1237536054U,	// <5,6,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS
+  1237536055U,	// <5,6,1,u>: Cost 2 vmrglw <4,u,5,1>, RHS
+  3773113789U,	// <5,6,2,0>: Cost 4 vsldoi8 <2,4,5,6>, <2,0,1,2>
+  3779085855U,	// <5,6,2,1>: Cost 4 vsldoi8 <3,4,5,6>, <2,1,3,1>
+  2699372136U,	// <5,6,2,2>: Cost 3 vsldoi8 <2,4,5,6>, <2,2,2,2>
+  2705344166U,	// <5,6,2,3>: Cost 3 vsldoi8 <3,4,5,6>, <2,3,0,1>
+  2699372329U,	// <5,6,2,4>: Cost 3 vsldoi8 <2,4,5,6>, <2,4,5,6>
+  2705344360U,	// <5,6,2,5>: Cost 3 vsldoi8 <3,4,5,6>, <2,5,3,6>
+  2705344442U,	// <5,6,2,6>: Cost 3 vsldoi8 <3,4,5,6>, <2,6,3,7>
+  2302659894U,	// <5,6,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS
+  2702026861U,	// <5,6,2,u>: Cost 3 vsldoi8 <2,u,5,6>, <2,u,5,6>
+  2705344662U,	// <5,6,3,0>: Cost 3 vsldoi8 <3,4,5,6>, <3,0,1,2>
+  3767142661U,	// <5,6,3,1>: Cost 4 vsldoi8 <1,4,5,6>, <3,1,4,5>
+  3773114689U,	// <5,6,3,2>: Cost 4 vsldoi8 <2,4,5,6>, <3,2,2,2>
+  2705344924U,	// <5,6,3,3>: Cost 3 vsldoi8 <3,4,5,6>, <3,3,3,3>
+  1631603202U,	// <5,6,3,4>: Cost 2 vsldoi8 <3,4,5,6>, <3,4,5,6>
+  3842945597U,	// <5,6,3,5>: Cost 4 vsldoi12 <2,u,6,5>, <6,3,5,7>
+  3779086962U,	// <5,6,3,6>: Cost 4 vsldoi8 <3,4,5,6>, <3,6,0,1>
+  2289397046U,	// <5,6,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS
+  1634257734U,	// <5,6,3,u>: Cost 2 vsldoi8 <3,u,5,6>, <3,u,5,6>
+  2644926566U,	// <5,6,4,0>: Cost 3 vsldoi4 <4,5,6,4>, LHS
+  3779087306U,	// <5,6,4,1>: Cost 4 vsldoi8 <3,4,5,6>, <4,1,2,3>
+  2790142577U,	// <5,6,4,2>: Cost 3 vsldoi12 <6,4,2,5>, <6,4,2,5>
+  2644929026U,	// <5,6,4,3>: Cost 3 vsldoi4 <4,5,6,4>, <3,4,5,6>
+  2711317723U,	// <5,6,4,4>: Cost 3 vsldoi8 <4,4,5,6>, <4,4,5,6>
+  1631604022U,	// <5,6,4,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+  2712644989U,	// <5,6,4,6>: Cost 3 vsldoi8 <4,6,5,6>, <4,6,5,6>
+  2302676278U,	// <5,6,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS
+  1631604265U,	// <5,6,4,u>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+  3842945708U,	// <5,6,5,0>: Cost 4 vsldoi12 <2,u,6,5>, <6,5,0,1>
+  3767144133U,	// <5,6,5,1>: Cost 4 vsldoi8 <1,4,5,6>, <5,1,6,1>
+  2705346328U,	// <5,6,5,2>: Cost 3 vsldoi8 <3,4,5,6>, <5,2,6,3>
+  3779088207U,	// <5,6,5,3>: Cost 4 vsldoi8 <3,4,5,6>, <5,3,3,4>
+  2717290420U,	// <5,6,5,4>: Cost 3 vsldoi8 <5,4,5,6>, <5,4,5,6>
+  2705346574U,	// <5,6,5,5>: Cost 3 vsldoi8 <3,4,5,6>, <5,5,6,6>
+  2705346596U,	// <5,6,5,6>: Cost 3 vsldoi8 <3,4,5,6>, <5,6,0,1>
+  1237568822U,	// <5,6,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS
+  1237568823U,	// <5,6,5,u>: Cost 2 vmrglw <4,u,5,5>, RHS
+  2650914918U,	// <5,6,6,0>: Cost 3 vsldoi4 <5,5,6,6>, LHS
+  3364490949U,	// <5,6,6,1>: Cost 4 vmrglw <1,4,5,6>, <5,1,6,1>
+  2248954362U,	// <5,6,6,2>: Cost 3 vmrghw <5,6,7,0>, <6,2,7,3>
+  2302693144U,	// <5,6,6,3>: Cost 3 vmrglw <3,4,5,6>, <5,2,6,3>
+  2650918198U,	// <5,6,6,4>: Cost 3 vsldoi4 <5,5,6,6>, RHS
+  2650918926U,	// <5,6,6,5>: Cost 3 vsldoi4 <5,5,6,6>, <5,5,6,6>
+  2302693390U,	// <5,6,6,6>: Cost 3 vmrglw <3,4,5,6>, <5,5,6,6>
+  1228950838U,	// <5,6,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS
+  1228950839U,	// <5,6,6,u>: Cost 2 vmrglw <3,4,5,6>, RHS
+  497467494U,	// <5,6,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1571210036U,	// <5,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+  1571210856U,	// <5,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1571211414U,	// <5,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  497470774U,	// <5,6,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1571213316U,	// <5,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+  1571213818U,	// <5,6,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+  1571214956U,	// <5,6,7,7>: Cost 2 vsldoi4 RHS, <7,7,7,7>
+  497473326U,	// <5,6,7,u>: Cost 1 vsldoi4 RHS, LHS
+  497475686U,	// <5,6,u,0>: Cost 1 vsldoi4 RHS, LHS
+  1631606574U,	// <5,6,u,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+  1571219048U,	// <5,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1571219606U,	// <5,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  497478967U,	// <5,6,u,4>: Cost 1 vsldoi4 RHS, RHS
+  1631606938U,	// <5,6,u,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+  1571222010U,	// <5,6,u,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+  1228967222U,	// <5,6,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS
+  497481518U,	// <5,6,u,u>: Cost 1 vsldoi4 RHS, LHS
+  3768475648U,	// <5,7,0,0>: Cost 4 vsldoi8 <1,6,5,7>, <0,0,0,0>
+  2694733926U,	// <5,7,0,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  3718711395U,	// <5,7,0,2>: Cost 4 vsldoi4 <4,5,7,0>, <2,u,4,5>
+  3384349178U,	// <5,7,0,3>: Cost 4 vmrglw <4,7,5,0>, <6,2,7,3>
+  2694734162U,	// <5,7,0,4>: Cost 3 vsldoi8 <1,6,5,7>, <0,4,1,5>
+  3384347884U,	// <5,7,0,5>: Cost 4 vmrglw <4,7,5,0>, <4,4,7,5>
+  3730658026U,	// <5,7,0,6>: Cost 4 vsldoi4 <6,5,7,0>, <6,5,7,0>
+  3718714362U,	// <5,7,0,7>: Cost 4 vsldoi4 <4,5,7,0>, <7,0,1,2>
+  2694734493U,	// <5,7,0,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  2311278690U,	// <5,7,1,0>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,0>
+  2305970923U,	// <5,7,1,1>: Cost 3 vmrglw <4,0,5,1>, <6,5,7,1>
+  3768476566U,	// <5,7,1,2>: Cost 4 vsldoi8 <1,6,5,7>, <1,2,3,0>
+  2311279098U,	// <5,7,1,3>: Cost 3 vmrglw <4,u,5,1>, <6,2,7,3>
+  2311278694U,	// <5,7,1,4>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,4>
+  3768476783U,	// <5,7,1,5>: Cost 4 vsldoi8 <1,6,5,7>, <1,5,0,1>
+  2694735091U,	// <5,7,1,6>: Cost 3 vsldoi8 <1,6,5,7>, <1,6,5,7>
+  2311279426U,	// <5,7,1,7>: Cost 3 vmrglw <4,u,5,1>, <6,6,7,7>
+  2696062357U,	// <5,7,1,u>: Cost 3 vsldoi8 <1,u,5,7>, <1,u,5,7>
+  3383701602U,	// <5,7,2,0>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,0>
+  3768477219U,	// <5,7,2,1>: Cost 4 vsldoi8 <1,6,5,7>, <2,1,3,5>
+  3768477288U,	// <5,7,2,2>: Cost 4 vsldoi8 <1,6,5,7>, <2,2,2,2>
+  2309960186U,	// <5,7,2,3>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3>
+  3383701606U,	// <5,7,2,4>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,4>
+  3768477545U,	// <5,7,2,5>: Cost 4 vsldoi8 <1,6,5,7>, <2,5,3,7>
+  3766486970U,	// <5,7,2,6>: Cost 4 vsldoi8 <1,3,5,7>, <2,6,3,7>
+  3383702338U,	// <5,7,2,7>: Cost 4 vmrglw <4,6,5,2>, <6,6,7,7>
+  2309960186U,	// <5,7,2,u>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3>
+  3768477846U,	// <5,7,3,0>: Cost 4 vsldoi8 <1,6,5,7>, <3,0,1,2>
+  3768477975U,	// <5,7,3,1>: Cost 4 vsldoi8 <1,6,5,7>, <3,1,6,5>
+  3786393932U,	// <5,7,3,2>: Cost 4 vsldoi8 <4,6,5,7>, <3,2,3,4>
+  3768478108U,	// <5,7,3,3>: Cost 4 vsldoi8 <1,6,5,7>, <3,3,3,3>
+  2795599115U,	// <5,7,3,4>: Cost 3 vsldoi12 <7,3,4,5>, <7,3,4,5>
+  3385037470U,	// <5,7,3,5>: Cost 4 vmrglw <4,u,5,3>, <6,4,7,5>
+  3780422309U,	// <5,7,3,6>: Cost 4 vsldoi8 <3,6,5,7>, <3,6,5,7>
+  3848107301U,	// <5,7,3,7>: Cost 4 vsldoi12 <3,7,4,5>, <7,3,7,4>
+  2795894063U,	// <5,7,3,u>: Cost 3 vsldoi12 <7,3,u,5>, <7,3,u,5>
+  2795967800U,	// <5,7,4,0>: Cost 3 vsldoi12 <7,4,0,5>, <7,4,0,5>
+  3768478690U,	// <5,7,4,1>: Cost 4 vsldoi8 <1,6,5,7>, <4,1,5,0>
+  3718744163U,	// <5,7,4,2>: Cost 4 vsldoi4 <4,5,7,4>, <2,u,4,5>
+  3784404107U,	// <5,7,4,3>: Cost 4 vsldoi8 <4,3,5,7>, <4,3,5,7>
+  2796262748U,	// <5,7,4,4>: Cost 3 vsldoi12 <7,4,4,5>, <7,4,4,5>
+  2694737206U,	// <5,7,4,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+  2712653182U,	// <5,7,4,6>: Cost 3 vsldoi8 <4,6,5,7>, <4,6,5,7>
+  2713316815U,	// <5,7,4,7>: Cost 3 vsldoi8 <4,7,5,7>, <4,7,5,7>
+  2694737449U,	// <5,7,4,u>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+  2311311458U,	// <5,7,5,0>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,0>
+  3768479433U,	// <5,7,5,1>: Cost 4 vsldoi8 <1,6,5,7>, <5,1,6,5>
+  3768479521U,	// <5,7,5,2>: Cost 4 vsldoi8 <1,6,5,7>, <5,2,7,3>
+  2311311866U,	// <5,7,5,3>: Cost 3 vmrglw <4,u,5,5>, <6,2,7,3>
+  2311311462U,	// <5,7,5,4>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,4>
+  2248185270U,	// <5,7,5,5>: Cost 3 vmrghw <5,5,5,5>, <7,5,5,5>
+  2718625879U,	// <5,7,5,6>: Cost 3 vsldoi8 <5,6,5,7>, <5,6,5,7>
+  2311312194U,	// <5,7,5,7>: Cost 3 vmrglw <4,u,5,5>, <6,6,7,7>
+  2311311466U,	// <5,7,5,u>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,u>
+  2248954874U,	// <5,7,6,0>: Cost 3 vmrghw <5,6,7,0>, <7,0,1,2>
+  3322696778U,	// <5,7,6,1>: Cost 4 vmrghw <5,6,7,0>, <7,1,1,1>
+  2248955028U,	// <5,7,6,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3>
+  2656963074U,	// <5,7,6,3>: Cost 3 vsldoi4 <6,5,7,6>, <3,4,5,6>
+  2248955238U,	// <5,7,6,4>: Cost 3 vmrghw <5,6,7,0>, <7,4,5,6>
+  2248955329U,	// <5,7,6,5>: Cost 3 vmrghw <5,6,7,0>, <7,5,6,7>
+  2656965360U,	// <5,7,6,6>: Cost 3 vsldoi4 <6,5,7,6>, <6,5,7,6>
+  2248955500U,	// <5,7,6,7>: Cost 3 vmrghw <5,6,7,0>, <7,7,7,7>
+  2248955522U,	// <5,7,6,u>: Cost 3 vmrghw <5,6,7,0>, <7,u,1,2>
+  3718766694U,	// <5,7,7,0>: Cost 4 vsldoi4 <4,5,7,7>, LHS
+  3724739827U,	// <5,7,7,1>: Cost 4 vsldoi4 <5,5,7,7>, <1,6,5,7>
+  3718768739U,	// <5,7,7,2>: Cost 4 vsldoi4 <4,5,7,7>, <2,u,4,5>
+  3365826337U,	// <5,7,7,3>: Cost 4 vmrglw <1,6,5,7>, <5,2,7,3>
+  2798253647U,	// <5,7,7,4>: Cost 3 vsldoi12 <7,7,4,5>, <7,7,4,5>
+  3365826258U,	// <5,7,7,5>: Cost 4 vmrglw <1,6,5,7>, <5,1,7,5>
+  3730715377U,	// <5,7,7,6>: Cost 4 vsldoi4 <6,5,7,7>, <6,5,7,7>
+  2310665836U,	// <5,7,7,7>: Cost 3 vmrglw <4,7,5,7>, <7,7,7,7>
+  2798548595U,	// <5,7,7,u>: Cost 3 vsldoi12 <7,7,u,5>, <7,7,u,5>
+  2311336034U,	// <5,7,u,0>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,0>
+  2694739758U,	// <5,7,u,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  2248955028U,	// <5,7,u,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3>
+  2311336442U,	// <5,7,u,3>: Cost 3 vmrglw <4,u,5,u>, <6,2,7,3>
+  2311336038U,	// <5,7,u,4>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,4>
+  2694740122U,	// <5,7,u,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+  2656981746U,	// <5,7,u,6>: Cost 3 vsldoi4 <6,5,7,u>, <6,5,7,u>
+  2311336770U,	// <5,7,u,7>: Cost 3 vmrglw <4,u,5,u>, <6,6,7,7>
+  2694740325U,	// <5,7,u,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  2705358848U,	// <5,u,0,0>: Cost 3 vsldoi8 <3,4,5,u>, <0,0,0,0>
+  1631617126U,	// <5,u,0,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+  2310607866U,	// <5,u,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2>
+  2302640284U,	// <5,u,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS
+  2754238189U,	// <5,u,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <u,0,4,1>
+  2305296114U,	// <5,u,0,5>: Cost 3 vmrglw <3,u,5,0>, <2,3,u,5>
+  2244907106U,	// <5,u,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0>
+  2302643528U,	// <5,u,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS
+  1631617693U,	// <5,u,0,u>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+  2627133542U,	// <5,u,1,0>: Cost 3 vsldoi4 <1,5,u,1>, LHS
+  1237536282U,	// <5,u,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  1680496430U,	// <5,u,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  1237532828U,	// <5,u,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS
+  2693416018U,	// <5,u,1,4>: Cost 3 vsldoi8 <1,4,5,u>, <1,4,5,u>
+  2756892486U,	// <5,u,1,5>: Cost 3 vsldoi12 <0,u,1,5>, <u,1,5,0>
+  2694743284U,	// <5,u,1,6>: Cost 3 vsldoi8 <1,6,5,u>, <1,6,5,u>
+  1237536072U,	// <5,u,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS
+  1680496484U,	// <5,u,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  2311288709U,	// <5,u,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0>
+  2245883694U,	// <5,u,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS
+  2699388520U,	// <5,u,2,2>: Cost 3 vsldoi8 <2,4,5,u>, <2,2,2,2>
+  2754238344U,	// <5,u,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,3,3>
+  2699388715U,	// <5,u,2,4>: Cost 3 vsldoi8 <2,4,5,u>, <2,4,5,u>
+  2757408666U,	// <5,u,2,5>: Cost 3 vsldoi12 <0,u,u,5>, <u,2,5,3>
+  2705360826U,	// <5,u,2,6>: Cost 3 vsldoi8 <3,4,5,u>, <2,6,3,7>
+  2302659912U,	// <5,u,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS
+  2754238389U,	// <5,u,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,u,3>
+  2754238396U,	// <5,u,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <u,3,0,1>
+  3827980229U,	// <5,u,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <u,3,1,1>
+  2644625102U,	// <5,u,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5>
+  2289393820U,	// <5,u,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS
+  1631619588U,	// <5,u,3,4>: Cost 2 vsldoi8 <3,4,5,u>, <3,4,5,u>
+  2785056749U,	// <5,u,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <u,3,5,5>
+  3363138077U,	// <5,u,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,u,6>
+  2289397064U,	// <5,u,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS
+  1634274120U,	// <5,u,3,u>: Cost 2 vsldoi8 <3,u,5,u>, <3,u,5,u>
+  1634937753U,	// <5,u,4,0>: Cost 2 vsldoi8 <4,0,5,u>, <4,0,5,u>
+  1728272410U,	// <5,u,4,1>: Cost 2 vsldoi12 <u,4,1,5>, <u,4,1,5>
+  2710006843U,	// <5,u,4,2>: Cost 3 vsldoi8 <4,2,5,u>, <4,2,5,u>
+  2765740076U,	// <5,u,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <u,4,3,5>
+  1637592285U,	// <5,u,4,4>: Cost 2 vsldoi8 <4,4,5,u>, <4,4,5,u>
+  1631620406U,	// <5,u,4,5>: Cost 2 vsldoi8 <3,4,5,u>, RHS
+  2712661375U,	// <5,u,4,6>: Cost 3 vsldoi8 <4,6,5,u>, <4,6,5,u>
+  2302676296U,	// <5,u,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS
+  1631620649U,	// <5,u,4,u>: Cost 2 vsldoi8 <3,4,5,u>, RHS
+  1577091174U,	// <5,u,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+  1174443822U,	// <5,u,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS
+  2766035058U,	// <5,u,5,2>: Cost 3 vsldoi12 <2,3,u,5>, <u,5,2,3>
+  1237565596U,	// <5,u,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS
+  1577094454U,	// <5,u,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+  296144182U,	// <5,u,5,5>: Cost 1 vspltisw1 RHS
+  1680496794U,	// <5,u,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  1237568840U,	// <5,u,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS
+  296144182U,	// <5,u,5,u>: Cost 1 vspltisw1 RHS
+  2633146470U,	// <5,u,6,0>: Cost 3 vsldoi4 <2,5,u,6>, LHS
+  1175213870U,	// <5,u,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS
+  2633148309U,	// <5,u,6,2>: Cost 3 vsldoi4 <2,5,u,6>, <2,5,u,6>
+  1228947612U,	// <5,u,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS
+  2633149750U,	// <5,u,6,4>: Cost 3 vsldoi4 <2,5,u,6>, RHS
+  1175214234U,	// <5,u,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS
+  1228950018U,	// <5,u,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  1228950856U,	// <5,u,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS
+  1228947617U,	// <5,u,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS
+  497614950U,	// <5,u,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1571357492U,	// <5,u,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+  1571358312U,	// <5,u,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1571358870U,	// <5,u,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  497618248U,	// <5,u,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1571360772U,	// <5,u,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+  1571361274U,	// <5,u,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+  1571361786U,	// <5,u,7,7>: Cost 2 vsldoi4 RHS, <7,0,1,2>
+  497620782U,	// <5,u,7,u>: Cost 1 vsldoi4 RHS, LHS
+  497623142U,	// <5,u,u,0>: Cost 1 vsldoi4 RHS, LHS
+  1631622958U,	// <5,u,u,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+  1680496997U,	// <5,u,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  1228963996U,	// <5,u,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS
+  497626441U,	// <5,u,u,4>: Cost 1 vsldoi4 RHS, RHS
+  296144182U,	// <5,u,u,5>: Cost 1 vspltisw1 RHS
+  1680497037U,	// <5,u,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  1228967240U,	// <5,u,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS
+  497628974U,	// <5,u,u,u>: Cost 1 vsldoi4 RHS, LHS
+  2772451328U,	// <6,0,0,0>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,0,0>
+  2772451338U,	// <6,0,0,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,1,1>
+  3771146417U,	// <6,0,0,2>: Cost 4 vsldoi8 <2,1,6,0>, <0,2,1,6>
+  3383095739U,	// <6,0,0,3>: Cost 4 vmrglw <4,5,6,0>, <6,2,0,3>
+  3846193189U,	// <6,0,0,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,0,4,1>
+  3724832803U,	// <6,0,0,5>: Cost 4 vsldoi4 <5,6,0,0>, <5,6,0,0>
+  3383095985U,	// <6,0,0,6>: Cost 4 vmrglw <4,5,6,0>, <6,5,0,6>
+  3383096067U,	// <6,0,0,7>: Cost 4 vmrglw <4,5,6,0>, <6,6,0,7>
+  2772451401U,	// <6,0,0,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,u,1>
+  2651095142U,	// <6,0,1,0>: Cost 3 vsldoi4 <5,6,0,1>, LHS
+  2251612262U,	// <6,0,1,1>: Cost 3 vmrghw <6,1,7,1>, LHS
+  1698709606U,	// <6,0,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2651097602U,	// <6,0,1,3>: Cost 3 vsldoi4 <5,6,0,1>, <3,4,5,6>
+  2651098422U,	// <6,0,1,4>: Cost 3 vsldoi4 <5,6,0,1>, RHS
+  2651099172U,	// <6,0,1,5>: Cost 3 vsldoi4 <5,6,0,1>, <5,6,0,1>
+  2657071869U,	// <6,0,1,6>: Cost 3 vsldoi4 <6,6,0,1>, <6,6,0,1>
+  3724841978U,	// <6,0,1,7>: Cost 4 vsldoi4 <5,6,0,1>, <7,0,1,2>
+  1698709660U,	// <6,0,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2252292096U,	// <6,0,2,0>: Cost 3 vmrghw <6,2,7,3>, <0,0,0,0>
+  1178550374U,	// <6,0,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS
+  3826655418U,	// <6,0,2,2>: Cost 4 vsldoi12 <0,2,1,6>, <0,2,2,6>
+  3777783485U,	// <6,0,2,3>: Cost 4 vsldoi8 <3,2,6,0>, <2,3,2,6>
+  2252292434U,	// <6,0,2,4>: Cost 3 vmrghw <6,2,7,3>, <0,4,1,5>
+  3785746280U,	// <6,0,2,5>: Cost 4 vsldoi8 <4,5,6,0>, <2,5,3,6>
+  2252292593U,	// <6,0,2,6>: Cost 3 vmrghw <6,2,7,3>, <0,6,1,2>
+  3736794583U,	// <6,0,2,7>: Cost 4 vsldoi4 <7,6,0,2>, <7,6,0,2>
+  1178550941U,	// <6,0,2,u>: Cost 2 vmrghw <6,2,7,3>, LHS
+  3375153152U,	// <6,0,3,0>: Cost 4 vmrglw <3,2,6,3>, <0,0,0,0>
+  2772451584U,	// <6,0,3,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,1,4>
+  3777784163U,	// <6,0,3,2>: Cost 4 vsldoi8 <3,2,6,0>, <3,2,6,0>
+  3846193426U,	// <6,0,3,3>: Cost 4 vsldoi12 <3,4,5,6>, <0,3,3,4>
+  2712005122U,	// <6,0,3,4>: Cost 3 vsldoi8 <4,5,6,0>, <3,4,5,6>
+  3724857382U,	// <6,0,3,5>: Cost 4 vsldoi4 <5,6,0,3>, <5,6,0,3>
+  3802335864U,	// <6,0,3,6>: Cost 4 vsldoi8 <7,3,6,0>, <3,6,0,7>
+  3801672410U,	// <6,0,3,7>: Cost 4 vsldoi8 <7,2,6,0>, <3,7,2,6>
+  2772451647U,	// <6,0,3,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,u,4>
+  3383123968U,	// <6,0,4,0>: Cost 4 vmrglw <4,5,6,4>, <0,0,0,0>
+  2772451666U,	// <6,0,4,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,1,5>
+  3773803577U,	// <6,0,4,2>: Cost 4 vsldoi8 <2,5,6,0>, <4,2,5,6>
+  3724864002U,	// <6,0,4,3>: Cost 4 vsldoi4 <5,6,0,4>, <3,4,5,6>
+  3846193517U,	// <6,0,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,4,4,5>
+  2712005935U,	// <6,0,4,5>: Cost 3 vsldoi8 <4,5,6,0>, <4,5,6,0>
+  3327009265U,	// <6,0,4,6>: Cost 4 vmrghw <6,4,2,5>, <0,6,1,2>
+  3383126648U,	// <6,0,4,7>: Cost 5 vmrglw <4,5,6,4>, <3,6,0,7>
+  2772451729U,	// <6,0,4,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,u,5>
+  3373178880U,	// <6,0,5,0>: Cost 4 vmrglw <2,u,6,5>, <0,0,0,0>
+  2254266470U,	// <6,0,5,1>: Cost 3 vmrghw <6,5,7,1>, LHS
+  3785748248U,	// <6,0,5,2>: Cost 4 vsldoi8 <4,5,6,0>, <5,2,6,3>
+  3790393190U,	// <6,0,5,3>: Cost 4 vsldoi8 <5,3,6,0>, <5,3,6,0>
+  3328000338U,	// <6,0,5,4>: Cost 4 vmrghw <6,5,7,0>, <0,4,1,5>
+  3785748494U,	// <6,0,5,5>: Cost 4 vsldoi8 <4,5,6,0>, <5,5,6,6>
+  3785748516U,	// <6,0,5,6>: Cost 4 vsldoi8 <4,5,6,0>, <5,6,0,1>
+  3379153528U,	// <6,0,5,7>: Cost 4 vmrglw <3,u,6,5>, <3,6,0,7>
+  2254267037U,	// <6,0,5,u>: Cost 3 vmrghw <6,5,7,1>, LHS
+  2254897152U,	// <6,0,6,0>: Cost 3 vmrghw <6,6,6,6>, <0,0,0,0>
+  1181155430U,	// <6,0,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS
+  3785748923U,	// <6,0,6,2>: Cost 4 vsldoi8 <4,5,6,0>, <6,2,0,3>
+  3785749042U,	// <6,0,6,3>: Cost 4 vsldoi8 <4,5,6,0>, <6,3,4,5>
+  2254897490U,	// <6,0,6,4>: Cost 3 vmrghw <6,6,6,6>, <0,4,1,5>
+  3785749169U,	// <6,0,6,5>: Cost 4 vsldoi8 <4,5,6,0>, <6,5,0,6>
+  2724614962U,	// <6,0,6,6>: Cost 3 vsldoi8 <6,6,6,0>, <6,6,6,0>
+  3787739982U,	// <6,0,6,7>: Cost 4 vsldoi8 <4,u,6,0>, <6,7,0,1>
+  1181155997U,	// <6,0,6,u>: Cost 2 vmrghw <6,6,6,6>, LHS
+  1235664896U,	// <6,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+  1235666598U,	// <6,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+  3712943720U,	// <6,0,7,2>: Cost 4 vsldoi4 <3,6,0,7>, <2,2,2,2>
+  2639202936U,	// <6,0,7,3>: Cost 3 vsldoi4 <3,6,0,7>, <3,6,0,7>
+  2639203638U,	// <6,0,7,4>: Cost 3 vsldoi4 <3,6,0,7>, RHS
+  2309409236U,	// <6,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5>
+  3712946517U,	// <6,0,7,6>: Cost 4 vsldoi4 <3,6,0,7>, <6,0,7,0>
+  2309409400U,	// <6,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  1235666605U,	// <6,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u>
+  1235673088U,	// <6,0,u,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+  1235674790U,	// <6,0,u,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+  1698710173U,	// <6,0,u,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2639211129U,	// <6,0,u,3>: Cost 3 vsldoi4 <3,6,0,u>, <3,6,0,u>
+  2639211830U,	// <6,0,u,4>: Cost 3 vsldoi4 <3,6,0,u>, RHS
+  2712008858U,	// <6,0,u,5>: Cost 3 vsldoi8 <4,5,6,0>, RHS
+  2657129220U,	// <6,0,u,6>: Cost 3 vsldoi4 <6,6,0,u>, <6,6,0,u>
+  2309417592U,	// <6,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  1698710227U,	// <6,0,u,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  3775799296U,	// <6,1,0,0>: Cost 4 vsldoi8 <2,u,6,1>, <0,0,0,0>
+  2702057574U,	// <6,1,0,1>: Cost 3 vsldoi8 <2,u,6,1>, LHS
+  3373143763U,	// <6,1,0,2>: Cost 4 vmrglw <2,u,6,0>, <u,0,1,2>
+  3695045122U,	// <6,1,0,3>: Cost 4 vsldoi4 <0,6,1,0>, <3,4,5,6>
+  3775799634U,	// <6,1,0,4>: Cost 4 vsldoi8 <2,u,6,1>, <0,4,1,5>
+  3383091538U,	// <6,1,0,5>: Cost 4 vmrglw <4,5,6,0>, <0,4,1,5>
+  3368493233U,	// <6,1,0,6>: Cost 4 vmrglw <2,1,6,0>, <0,2,1,6>
+  3362522319U,	// <6,1,0,7>: Cost 5 vmrglw <1,1,6,0>, <1,6,1,7>
+  2702058141U,	// <6,1,0,u>: Cost 3 vsldoi8 <2,u,6,1>, LHS
+  3834250027U,	// <6,1,1,0>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,0,1>
+  2772452148U,	// <6,1,1,1>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1>
+  3832038210U,	// <6,1,1,2>: Cost 4 vsldoi12 <1,1,2,6>, <1,1,2,6>
+  3373150660U,	// <6,1,1,3>: Cost 4 vmrglw <2,u,6,1>, <6,2,1,3>
+  3834250067U,	// <6,1,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,4,5>
+  3373146450U,	// <6,1,1,5>: Cost 4 vmrglw <2,u,6,1>, <0,4,1,5>
+  3826656102U,	// <6,1,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,1,6,6>
+  3362530511U,	// <6,1,1,7>: Cost 4 vmrglw <1,1,6,1>, <1,6,1,7>
+  2772452148U,	// <6,1,1,u>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1>
+  2669092966U,	// <6,1,2,0>: Cost 3 vsldoi4 <u,6,1,2>, LHS
+  2252292916U,	// <6,1,2,1>: Cost 3 vmrghw <6,2,7,3>, <1,1,1,1>
+  2252293014U,	// <6,1,2,2>: Cost 3 vmrghw <6,2,7,3>, <1,2,3,0>
+  2772452246U,	// <6,1,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,2,3,0>
+  2669096246U,	// <6,1,2,4>: Cost 3 vsldoi4 <u,6,1,2>, RHS
+  3846194091U,	// <6,1,2,5>: Cost 4 vsldoi12 <3,4,5,6>, <1,2,5,3>
+  2702059450U,	// <6,1,2,6>: Cost 3 vsldoi8 <2,u,6,1>, <2,6,3,7>
+  3870081978U,	// <6,1,2,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,2,7,0>
+  2702059633U,	// <6,1,2,u>: Cost 3 vsldoi8 <2,u,6,1>, <2,u,6,1>
+  3775801494U,	// <6,1,3,0>: Cost 4 vsldoi8 <2,u,6,1>, <3,0,1,2>
+  3777128723U,	// <6,1,3,1>: Cost 4 vsldoi8 <3,1,6,1>, <3,1,6,1>
+  3775801702U,	// <6,1,3,2>: Cost 4 vsldoi8 <2,u,6,1>, <3,2,6,3>
+  3775801756U,	// <6,1,3,3>: Cost 4 vsldoi8 <2,u,6,1>, <3,3,3,3>
+  3775801858U,	// <6,1,3,4>: Cost 4 vsldoi8 <2,u,6,1>, <3,4,5,6>
+  3375153490U,	// <6,1,3,5>: Cost 4 vmrglw <3,2,6,3>, <0,4,1,5>
+  3826656265U,	// <6,1,3,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,3,6,7>
+  3775802051U,	// <6,1,3,7>: Cost 4 vsldoi8 <2,u,6,1>, <3,7,0,1>
+  3775802142U,	// <6,1,3,u>: Cost 4 vsldoi8 <2,u,6,1>, <3,u,1,2>
+  3846194206U,	// <6,1,4,0>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,0,1>
+  3846194219U,	// <6,1,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,1,5>
+  3846194228U,	// <6,1,4,2>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,2,5>
+  3846194236U,	// <6,1,4,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,3,4>
+  3846194246U,	// <6,1,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,4,5>
+  2760508496U,	// <6,1,4,5>: Cost 3 vsldoi12 <1,4,5,6>, <1,4,5,6>
+  3368526001U,	// <6,1,4,6>: Cost 4 vmrglw <2,1,6,4>, <0,2,1,6>
+  3870082144U,	// <6,1,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,4,7,4>
+  2760729707U,	// <6,1,4,u>: Cost 3 vsldoi12 <1,4,u,6>, <1,4,u,6>
+  2714668660U,	// <6,1,5,0>: Cost 3 vsldoi8 <5,0,6,1>, <5,0,6,1>
+  3834619005U,	// <6,1,5,1>: Cost 4 vsldoi12 <1,5,1,6>, <1,5,1,6>
+  3834692742U,	// <6,1,5,2>: Cost 4 vsldoi12 <1,5,2,6>, <1,5,2,6>
+  3846194317U,	// <6,1,5,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,5,3,4>
+  3834840216U,	// <6,1,5,4>: Cost 4 vsldoi12 <1,5,4,6>, <1,5,4,6>
+  3834913953U,	// <6,1,5,5>: Cost 4 vsldoi12 <1,5,5,6>, <1,5,5,6>
+  2719977570U,	// <6,1,5,6>: Cost 3 vsldoi8 <5,u,6,1>, <5,6,7,0>
+  3367208143U,	// <6,1,5,7>: Cost 4 vmrglw <1,u,6,5>, <1,6,1,7>
+  2719977724U,	// <6,1,5,u>: Cost 3 vsldoi8 <5,u,6,1>, <5,u,6,1>
+  2669125734U,	// <6,1,6,0>: Cost 3 vsldoi4 <u,6,1,6>, LHS
+  2254897972U,	// <6,1,6,1>: Cost 3 vmrghw <6,6,6,6>, <1,1,1,1>
+  2254898070U,	// <6,1,6,2>: Cost 3 vmrghw <6,6,6,6>, <1,2,3,0>
+  3775803929U,	// <6,1,6,3>: Cost 4 vsldoi8 <2,u,6,1>, <6,3,1,7>
+  2669129014U,	// <6,1,6,4>: Cost 3 vsldoi4 <u,6,1,6>, RHS
+  2322006354U,	// <6,1,6,5>: Cost 3 vmrglw <6,6,6,6>, <0,4,1,5>
+  2725950264U,	// <6,1,6,6>: Cost 3 vsldoi8 <6,u,6,1>, <6,6,6,6>
+  3793720142U,	// <6,1,6,7>: Cost 4 vsldoi8 <5,u,6,1>, <6,7,0,1>
+  2254898556U,	// <6,1,6,u>: Cost 3 vmrghw <6,6,6,6>, <1,u,3,0>
+  2627330150U,	// <6,1,7,0>: Cost 3 vsldoi4 <1,6,1,7>, LHS
+  1235664906U,	// <6,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+  1235667094U,	// <6,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+  2309406894U,	// <6,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3>
+  2627333430U,	// <6,1,7,4>: Cost 3 vsldoi4 <1,6,1,7>, RHS
+  1235665234U,	// <6,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+  2309406897U,	// <6,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+  2309407222U,	// <6,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+  1235664913U,	// <6,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+  2627338342U,	// <6,1,u,0>: Cost 3 vsldoi4 <1,6,1,u>, LHS
+  1235673098U,	// <6,1,u,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+  1235675286U,	// <6,1,u,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+  2772452732U,	// <6,1,u,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,u,3,0>
+  2627341622U,	// <6,1,u,4>: Cost 3 vsldoi4 <1,6,1,u>, RHS
+  1235673426U,	// <6,1,u,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+  2309415089U,	// <6,1,u,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+  2309415414U,	// <6,1,u,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+  1235673105U,	// <6,1,u,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+  3324683725U,	// <6,2,0,0>: Cost 4 vmrghw <6,0,7,0>, <2,0,3,0>
+  2725290086U,	// <6,2,0,1>: Cost 3 vsldoi8 <6,7,6,2>, LHS
+  3771162801U,	// <6,2,0,2>: Cost 4 vsldoi8 <2,1,6,2>, <0,2,1,6>
+  2309349478U,	// <6,2,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS
+  3730951478U,	// <6,2,0,4>: Cost 4 vsldoi4 <6,6,2,0>, RHS
+  3840738784U,	// <6,2,0,5>: Cost 4 vsldoi12 <2,5,3,6>, <2,0,5,1>
+  3842655721U,	// <6,2,0,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,0,6,1>
+  3736925671U,	// <6,2,0,7>: Cost 4 vsldoi4 <7,6,2,0>, <7,6,2,0>
+  2309349483U,	// <6,2,0,u>: Cost 3 vmrglw <4,5,6,0>, LHS
+  3367840468U,	// <6,2,1,0>: Cost 4 vmrglw <2,0,6,1>, <3,7,2,0>
+  3325355551U,	// <6,2,1,1>: Cost 4 vmrghw <6,1,7,1>, <2,1,3,1>
+  3373147752U,	// <6,2,1,2>: Cost 4 vmrglw <2,u,6,1>, <2,2,2,2>
+  2299404390U,	// <6,2,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS
+  3701099830U,	// <6,2,1,4>: Cost 5 vsldoi4 <1,6,2,1>, RHS
+  3767846054U,	// <6,2,1,5>: Cost 4 vsldoi8 <1,5,6,2>, <1,5,6,2>
+  3826656825U,	// <6,2,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <2,1,6,0>
+  3373147838U,	// <6,2,1,7>: Cost 5 vmrglw <2,u,6,1>, <2,3,2,7>
+  2299404395U,	// <6,2,1,u>: Cost 3 vmrglw <2,u,6,1>, LHS
+  2657222758U,	// <6,2,2,0>: Cost 3 vsldoi4 <6,6,2,2>, LHS
+  3771164219U,	// <6,2,2,1>: Cost 4 vsldoi8 <2,1,6,2>, <2,1,6,2>
+  2766481000U,	// <6,2,2,2>: Cost 3 vsldoi12 <2,4,5,6>, <2,2,2,2>
+  2772452978U,	// <6,2,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,3,3>
+  2657226038U,	// <6,2,2,4>: Cost 3 vsldoi4 <6,6,2,2>, RHS
+  3790407528U,	// <6,2,2,5>: Cost 4 vsldoi8 <5,3,6,2>, <2,5,3,6>
+  2252294074U,	// <6,2,2,6>: Cost 3 vmrghw <6,2,7,3>, <2,6,3,7>
+  2252294148U,	// <6,2,2,7>: Cost 3 vmrghw <6,2,7,3>, <2,7,3,0>
+  2772453023U,	// <6,2,2,u>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,u,3>
+  2772453030U,	// <6,2,3,0>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,0,1>
+  3834250930U,	// <6,2,3,1>: Cost 4 vsldoi12 <1,4,5,6>, <2,3,1,4>
+  2765596349U,	// <6,2,3,2>: Cost 3 vsldoi12 <2,3,2,6>, <2,3,2,6>
+  2301411430U,	// <6,2,3,3>: Cost 3 vmrglw <3,2,6,3>, LHS
+  2772453070U,	// <6,2,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,4,5>
+  2765817560U,	// <6,2,3,5>: Cost 3 vsldoi12 <2,3,5,6>, <2,3,5,6>
+  2252933050U,	// <6,2,3,6>: Cost 3 vmrghw <6,3,7,0>, <2,6,3,7>
+  2796340968U,	// <6,2,3,7>: Cost 3 vsldoi12 <7,4,5,6>, <2,3,7,4>
+  2766038771U,	// <6,2,3,u>: Cost 3 vsldoi12 <2,3,u,6>, <2,3,u,6>
+  3725008998U,	// <6,2,4,0>: Cost 4 vsldoi4 <5,6,2,4>, LHS
+  3368530217U,	// <6,2,4,1>: Cost 5 vmrglw <2,1,6,4>, <6,0,2,1>
+  3840222989U,	// <6,2,4,2>: Cost 4 vsldoi12 <2,4,5,6>, <2,4,2,5>
+  2309382246U,	// <6,2,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS
+  3725012278U,	// <6,2,4,4>: Cost 4 vsldoi4 <5,6,2,4>, RHS
+  2766481193U,	// <6,2,4,5>: Cost 3 vsldoi12 <2,4,5,6>, <2,4,5,6>
+  3842656049U,	// <6,2,4,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,4,6,5>
+  3327010820U,	// <6,2,4,7>: Cost 4 vmrghw <6,4,2,5>, <2,7,3,0>
+  2766702404U,	// <6,2,4,u>: Cost 3 vsldoi12 <2,4,u,6>, <2,4,u,6>
+  3713073254U,	// <6,2,5,0>: Cost 4 vsldoi4 <3,6,2,5>, LHS
+  3789082310U,	// <6,2,5,1>: Cost 4 vsldoi8 <5,1,6,2>, <5,1,6,2>
+  3840665439U,	// <6,2,5,2>: Cost 4 vsldoi12 <2,5,2,6>, <2,5,2,6>
+  2766997352U,	// <6,2,5,3>: Cost 3 vsldoi12 <2,5,3,6>, <2,5,3,6>
+  3713076534U,	// <6,2,5,4>: Cost 4 vsldoi4 <3,6,2,5>, RHS
+  3791736842U,	// <6,2,5,5>: Cost 4 vsldoi8 <5,5,6,2>, <5,5,6,2>
+  3373180605U,	// <6,2,5,6>: Cost 4 vmrglw <2,u,6,5>, <2,3,2,6>
+  3793064108U,	// <6,2,5,7>: Cost 4 vsldoi8 <5,7,6,2>, <5,7,6,2>
+  2767366037U,	// <6,2,5,u>: Cost 3 vsldoi12 <2,5,u,6>, <2,5,u,6>
+  3701137510U,	// <6,2,6,0>: Cost 4 vsldoi4 <1,6,2,6>, LHS
+  3701138647U,	// <6,2,6,1>: Cost 4 vsldoi4 <1,6,2,6>, <1,6,2,6>
+  2254898792U,	// <6,2,6,2>: Cost 3 vmrghw <6,6,6,6>, <2,2,2,2>
+  1248264294U,	// <6,2,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS
+  3701140790U,	// <6,2,6,4>: Cost 4 vsldoi4 <1,6,2,6>, RHS
+  3725029435U,	// <6,2,6,5>: Cost 4 vsldoi4 <5,6,2,6>, <5,6,2,6>
+  2254899130U,	// <6,2,6,6>: Cost 3 vmrghw <6,6,6,6>, <2,6,3,7>
+  2725294981U,	// <6,2,6,7>: Cost 3 vsldoi8 <6,7,6,2>, <6,7,6,2>
+  1248264299U,	// <6,2,6,u>: Cost 2 vmrglw <6,6,6,6>, LHS
+  2633375846U,	// <6,2,7,0>: Cost 3 vsldoi4 <2,6,2,7>, LHS
+  2309407468U,	// <6,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+  1235666536U,	// <6,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+  161923174U,	// <6,2,7,3>: Cost 1 vmrglw RHS, LHS
+  2633379126U,	// <6,2,7,4>: Cost 3 vsldoi4 <2,6,2,7>, RHS
+  2309407796U,	// <6,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5>
+  2309408445U,	// <6,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+  2309407960U,	// <6,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+  161923179U,	// <6,2,7,u>: Cost 1 vmrglw RHS, LHS
+  2633384038U,	// <6,2,u,0>: Cost 3 vsldoi4 <2,6,2,u>, LHS
+  2309415660U,	// <6,2,u,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+  1235674728U,	// <6,2,u,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+  161931366U,	// <6,2,u,3>: Cost 1 vmrglw RHS, LHS
+  2633387318U,	// <6,2,u,4>: Cost 3 vsldoi4 <2,6,2,u>, RHS
+  2769135725U,	// <6,2,u,5>: Cost 3 vsldoi12 <2,u,5,6>, <2,u,5,6>
+  2309416637U,	// <6,2,u,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+  2309416152U,	// <6,2,u,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+  161931371U,	// <6,2,u,u>: Cost 1 vmrglw RHS, LHS
+  3777806336U,	// <6,3,0,0>: Cost 4 vsldoi8 <3,2,6,3>, <0,0,0,0>
+  2704064614U,	// <6,3,0,1>: Cost 3 vsldoi8 <3,2,6,3>, LHS
+  3765862577U,	// <6,3,0,2>: Cost 4 vsldoi8 <1,2,6,3>, <0,2,1,6>
+  3843393708U,	// <6,3,0,3>: Cost 4 vsldoi12 <3,0,3,6>, <3,0,3,6>
+  2250516994U,	// <6,3,0,4>: Cost 3 vmrghw <6,0,1,2>, <3,4,5,6>
+  3725054014U,	// <6,3,0,5>: Cost 4 vsldoi4 <5,6,3,0>, <5,6,3,0>
+  3383093096U,	// <6,3,0,6>: Cost 4 vmrglw <4,5,6,0>, <2,5,3,6>
+  3368495034U,	// <6,3,0,7>: Cost 4 vmrglw <2,1,6,0>, <2,6,3,7>
+  2704065181U,	// <6,3,0,u>: Cost 3 vsldoi8 <3,2,6,3>, LHS
+  2251622550U,	// <6,3,1,0>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2>
+  3777807156U,	// <6,3,1,1>: Cost 4 vsldoi8 <3,2,6,3>, <1,1,1,1>
+  3765863348U,	// <6,3,1,2>: Cost 4 vsldoi8 <1,2,6,3>, <1,2,6,3>
+  3373147762U,	// <6,3,1,3>: Cost 4 vmrglw <2,u,6,1>, <2,2,3,3>
+  3834251525U,	// <6,3,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <3,1,4,5>
+  3373147683U,	// <6,3,1,5>: Cost 5 vmrglw <2,u,6,1>, <2,1,3,5>
+  3391727545U,	// <6,3,1,6>: Cost 4 vmrglw <6,0,6,1>, <2,6,3,6>
+  2299406266U,	// <6,3,1,7>: Cost 3 vmrglw <2,u,6,1>, <2,6,3,7>
+  2251622550U,	// <6,3,1,u>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2>
+  2252294294U,	// <6,3,2,0>: Cost 3 vmrghw <6,2,7,3>, <3,0,1,2>
+  3326036198U,	// <6,3,2,1>: Cost 4 vmrghw <6,2,7,3>, <3,1,1,1>
+  3771836045U,	// <6,3,2,2>: Cost 4 vsldoi8 <2,2,6,3>, <2,2,6,3>
+  2252294556U,	// <6,3,2,3>: Cost 3 vmrghw <6,2,7,3>, <3,3,3,3>
+  2252294658U,	// <6,3,2,4>: Cost 3 vmrghw <6,2,7,3>, <3,4,5,6>
+  3840739677U,	// <6,3,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <3,2,5,3>
+  2704066490U,	// <6,3,2,6>: Cost 3 vsldoi8 <3,2,6,3>, <2,6,3,7>
+  3368511418U,	// <6,3,2,7>: Cost 4 vmrglw <2,1,6,2>, <2,6,3,7>
+  2252294942U,	// <6,3,2,u>: Cost 3 vmrghw <6,2,7,3>, <3,u,1,2>
+  3707158630U,	// <6,3,3,0>: Cost 4 vsldoi4 <2,6,3,3>, LHS
+  3765864692U,	// <6,3,3,1>: Cost 5 vsldoi8 <1,2,6,3>, <3,1,2,6>
+  2704066918U,	// <6,3,3,2>: Cost 3 vsldoi8 <3,2,6,3>, <3,2,6,3>
+  2772453788U,	// <6,3,3,3>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,3,3>
+  2772453799U,	// <6,3,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,4,5>
+  3789752888U,	// <6,3,3,5>: Cost 4 vsldoi8 <5,2,6,3>, <3,5,2,6>
+  3840739770U,	// <6,3,3,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,3,6,6>
+  2301413306U,	// <6,3,3,7>: Cost 3 vmrglw <3,2,6,3>, <2,6,3,7>
+  2775108043U,	// <6,3,3,u>: Cost 3 vsldoi12 <3,u,5,6>, <3,3,u,5>
+  2651340902U,	// <6,3,4,0>: Cost 3 vsldoi4 <5,6,3,4>, LHS
+  3846195674U,	// <6,3,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <3,4,1,2>
+  3845974503U,	// <6,3,4,2>: Cost 4 vsldoi12 <3,4,2,6>, <3,4,2,6>
+  2651343362U,	// <6,3,4,3>: Cost 3 vsldoi4 <5,6,3,4>, <3,4,5,6>
+  2651344182U,	// <6,3,4,4>: Cost 3 vsldoi4 <5,6,3,4>, RHS
+  1698712066U,	// <6,3,4,5>: Cost 2 vsldoi12 <3,4,5,6>, <3,4,5,6>
+  3383125864U,	// <6,3,4,6>: Cost 4 vmrglw <4,5,6,4>, <2,5,3,6>
+  3368527802U,	// <6,3,4,7>: Cost 4 vmrglw <2,1,6,4>, <2,6,3,7>
+  1698933277U,	// <6,3,4,u>: Cost 2 vsldoi12 <3,4,u,6>, <3,4,u,6>
+  3373179798U,	// <6,3,5,0>: Cost 4 vmrglw <2,u,6,5>, <1,2,3,0>
+  3707176179U,	// <6,3,5,1>: Cost 5 vsldoi4 <2,6,3,5>, <1,6,5,7>
+  2716012312U,	// <6,3,5,2>: Cost 3 vsldoi8 <5,2,6,3>, <5,2,6,3>
+  3373180530U,	// <6,3,5,3>: Cost 4 vmrglw <2,u,6,5>, <2,2,3,3>
+  2254309890U,	// <6,3,5,4>: Cost 3 vmrghw <6,5,7,6>, <3,4,5,6>
+  3785773070U,	// <6,3,5,5>: Cost 4 vsldoi8 <4,5,6,3>, <5,5,6,6>
+  3840739932U,	// <6,3,5,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,5,6,6>
+  2299439034U,	// <6,3,5,7>: Cost 3 vmrglw <2,u,6,5>, <2,6,3,7>
+  2719994110U,	// <6,3,5,u>: Cost 3 vsldoi8 <5,u,6,3>, <5,u,6,3>
+  2254899350U,	// <6,3,6,0>: Cost 3 vmrghw <6,6,6,6>, <3,0,1,2>
+  3328641254U,	// <6,3,6,1>: Cost 4 vmrghw <6,6,6,6>, <3,1,1,1>
+  2633443257U,	// <6,3,6,2>: Cost 3 vsldoi4 <2,6,3,6>, <2,6,3,6>
+  2254899612U,	// <6,3,6,3>: Cost 3 vmrghw <6,6,6,6>, <3,3,3,3>
+  2254899714U,	// <6,3,6,4>: Cost 3 vmrghw <6,6,6,6>, <3,4,5,6>
+  3785773772U,	// <6,3,6,5>: Cost 4 vsldoi8 <4,5,6,3>, <6,5,3,6>
+  2725966648U,	// <6,3,6,6>: Cost 3 vsldoi8 <6,u,6,3>, <6,6,6,6>
+  2322007994U,	// <6,3,6,7>: Cost 3 vmrglw <6,6,6,6>, <2,6,3,7>
+  2254899998U,	// <6,3,6,u>: Cost 3 vmrghw <6,6,6,6>, <3,u,1,2>
+  1559707750U,	// <6,3,7,0>: Cost 2 vsldoi4 <2,6,3,7>, LHS
+  2633450292U,	// <6,3,7,1>: Cost 3 vsldoi4 <2,6,3,7>, <1,1,1,1>
+  1559709626U,	// <6,3,7,2>: Cost 2 vsldoi4 <2,6,3,7>, <2,6,3,7>
+  1235666546U,	// <6,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+  1559711030U,	// <6,3,7,4>: Cost 2 vsldoi4 <2,6,3,7>, RHS
+  2309408291U,	// <6,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5>
+  2633454152U,	// <6,3,7,6>: Cost 3 vsldoi4 <2,6,3,7>, <6,3,7,0>
+  1235666874U,	// <6,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+  1559713582U,	// <6,3,7,u>: Cost 2 vsldoi4 <2,6,3,7>, LHS
+  1559715942U,	// <6,3,u,0>: Cost 2 vsldoi4 <2,6,3,u>, LHS
+  2633458484U,	// <6,3,u,1>: Cost 3 vsldoi4 <2,6,3,u>, <1,1,1,1>
+  1559717819U,	// <6,3,u,2>: Cost 2 vsldoi4 <2,6,3,u>, <2,6,3,u>
+  1235674738U,	// <6,3,u,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+  1559719222U,	// <6,3,u,4>: Cost 2 vsldoi4 <2,6,3,u>, RHS
+  1701366598U,	// <6,3,u,5>: Cost 2 vsldoi12 <3,u,5,6>, <3,u,5,6>
+  2633462353U,	// <6,3,u,6>: Cost 3 vsldoi4 <2,6,3,u>, <6,3,u,0>
+  1235675066U,	// <6,3,u,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+  1559721774U,	// <6,3,u,u>: Cost 2 vsldoi4 <2,6,3,u>, LHS
+  3785777152U,	// <6,4,0,0>: Cost 4 vsldoi8 <4,5,6,4>, <0,0,0,0>
+  2712035430U,	// <6,4,0,1>: Cost 3 vsldoi8 <4,5,6,4>, LHS
+  3771179185U,	// <6,4,0,2>: Cost 4 vsldoi8 <2,1,6,4>, <0,2,1,6>
+  3846196096U,	// <6,4,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <4,0,3,1>
+  3785777490U,	// <6,4,0,4>: Cost 4 vsldoi8 <4,5,6,4>, <0,4,1,5>
+  2250517814U,	// <6,4,0,5>: Cost 3 vmrghw <6,0,1,2>, RHS
+  3324259703U,	// <6,4,0,6>: Cost 4 vmrghw <6,0,1,2>, <4,6,5,0>
+  3383092458U,	// <6,4,0,7>: Cost 5 vmrglw <4,5,6,0>, <1,6,4,7>
+  2712035997U,	// <6,4,0,u>: Cost 3 vsldoi8 <4,5,6,4>, LHS
+  3325356946U,	// <6,4,1,0>: Cost 4 vmrghw <6,1,7,1>, <4,0,5,1>
+  3785777972U,	// <6,4,1,1>: Cost 4 vsldoi8 <4,5,6,4>, <1,1,1,1>
+  3846196170U,	// <6,4,1,2>: Cost 4 vsldoi12 <3,4,5,6>, <4,1,2,3>
+  3325365380U,	// <6,4,1,3>: Cost 4 vmrghw <6,1,7,2>, <4,3,5,0>
+  3852168155U,	// <6,4,1,4>: Cost 4 vsldoi12 <4,4,5,6>, <4,1,4,2>
+  2251615542U,	// <6,4,1,5>: Cost 3 vmrghw <6,1,7,1>, RHS
+  3325357432U,	// <6,4,1,6>: Cost 4 vmrghw <6,1,7,1>, <4,6,5,1>
+  3870084088U,	// <6,4,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <4,1,7,4>
+  2251615785U,	// <6,4,1,u>: Cost 3 vmrghw <6,1,7,1>, RHS
+  2252295058U,	// <6,4,2,0>: Cost 3 vmrghw <6,2,7,3>, <4,0,5,1>
+  3771180605U,	// <6,4,2,1>: Cost 4 vsldoi8 <2,1,6,4>, <2,1,6,4>
+  3785778792U,	// <6,4,2,2>: Cost 4 vsldoi8 <4,5,6,4>, <2,2,2,2>
+  3777816253U,	// <6,4,2,3>: Cost 4 vsldoi8 <3,2,6,4>, <2,3,2,6>
+  2252295376U,	// <6,4,2,4>: Cost 3 vmrghw <6,2,7,3>, <4,4,4,4>
+  1178553654U,	// <6,4,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS
+  2252295545U,	// <6,4,2,6>: Cost 3 vmrghw <6,2,7,3>, <4,6,5,2>
+  3326037448U,	// <6,4,2,7>: Cost 4 vmrghw <6,2,7,3>, <4,7,5,0>
+  1178553897U,	// <6,4,2,u>: Cost 2 vmrghw <6,2,7,3>, RHS
+  3785779350U,	// <6,4,3,0>: Cost 4 vsldoi8 <4,5,6,4>, <3,0,1,2>
+  3383118648U,	// <6,4,3,1>: Cost 4 vmrglw <4,5,6,3>, <3,u,4,1>
+  3777816935U,	// <6,4,3,2>: Cost 4 vsldoi8 <3,2,6,4>, <3,2,6,4>
+  3785779612U,	// <6,4,3,3>: Cost 4 vsldoi8 <4,5,6,4>, <3,3,3,3>
+  2712037890U,	// <6,4,3,4>: Cost 3 vsldoi8 <4,5,6,4>, <3,4,5,6>
+  2252754230U,	// <6,4,3,5>: Cost 3 vmrghw <6,3,4,5>, RHS
+  3784452764U,	// <6,4,3,6>: Cost 4 vsldoi8 <4,3,6,4>, <3,6,4,7>
+  3801705178U,	// <6,4,3,7>: Cost 4 vsldoi8 <7,2,6,4>, <3,7,2,6>
+  2252754473U,	// <6,4,3,u>: Cost 3 vmrghw <6,3,4,5>, RHS
+  3787770770U,	// <6,4,4,0>: Cost 4 vsldoi8 <4,u,6,4>, <4,0,5,1>
+  3383126840U,	// <6,4,4,1>: Cost 4 vmrglw <4,5,6,4>, <3,u,4,1>
+  3327380534U,	// <6,4,4,2>: Cost 4 vmrghw <6,4,7,5>, <4,2,5,3>
+  3784453265U,	// <6,4,4,3>: Cost 4 vsldoi8 <4,3,6,4>, <4,3,6,4>
+  2253630672U,	// <6,4,4,4>: Cost 3 vmrghw <6,4,7,4>, <4,4,4,4>
+  2778426587U,	// <6,4,4,5>: Cost 3 vsldoi12 <4,4,5,6>, <4,4,5,6>
+  3383128789U,	// <6,4,4,6>: Cost 4 vmrglw <4,5,6,4>, <6,5,4,6>
+  3381799580U,	// <6,4,4,7>: Cost 4 vmrglw <4,3,6,4>, <3,6,4,7>
+  2778647798U,	// <6,4,4,u>: Cost 3 vsldoi12 <4,4,u,6>, <4,4,u,6>
+  2651422822U,	// <6,4,5,0>: Cost 3 vsldoi4 <5,6,4,5>, LHS
+  3701277928U,	// <6,4,5,1>: Cost 4 vsldoi4 <1,6,4,5>, <1,6,4,5>
+  3701278650U,	// <6,4,5,2>: Cost 4 vsldoi4 <1,6,4,5>, <2,6,3,7>
+  2651425282U,	// <6,4,5,3>: Cost 3 vsldoi4 <5,6,4,5>, <3,4,5,6>
+  2651426102U,	// <6,4,5,4>: Cost 3 vsldoi4 <5,6,4,5>, RHS
+  2651426892U,	// <6,4,5,5>: Cost 3 vsldoi4 <5,6,4,5>, <5,6,4,5>
+  1698712886U,	// <6,4,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  3725169658U,	// <6,4,5,7>: Cost 4 vsldoi4 <5,6,4,5>, <7,0,1,2>
+  1698712904U,	// <6,4,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  2254900114U,	// <6,4,6,0>: Cost 3 vmrghw <6,6,6,6>, <4,0,5,1>
+  3389115192U,	// <6,4,6,1>: Cost 4 vmrglw <5,5,6,6>, <3,u,4,1>
+  3785781727U,	// <6,4,6,2>: Cost 4 vsldoi8 <4,5,6,4>, <6,2,4,3>
+  3785781810U,	// <6,4,6,3>: Cost 4 vsldoi8 <4,5,6,4>, <6,3,4,5>
+  2254900432U,	// <6,4,6,4>: Cost 3 vmrghw <6,6,6,6>, <4,4,4,4>
+  1181158710U,	// <6,4,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS
+  2254900605U,	// <6,4,6,6>: Cost 3 vmrghw <6,6,6,6>, <4,6,5,6>
+  3787772750U,	// <6,4,6,7>: Cost 4 vsldoi8 <4,u,6,4>, <6,7,0,1>
+  1181158953U,	// <6,4,6,u>: Cost 2 vmrghw <6,6,6,6>, RHS
+  2639495270U,	// <6,4,7,0>: Cost 3 vsldoi4 <3,6,4,7>, LHS
+  2639496090U,	// <6,4,7,1>: Cost 3 vsldoi4 <3,6,4,7>, <1,2,3,4>
+  3707267011U,	// <6,4,7,2>: Cost 4 vsldoi4 <2,6,4,7>, <2,6,4,7>
+  2639497884U,	// <6,4,7,3>: Cost 3 vsldoi4 <3,6,4,7>, <3,6,4,7>
+  1237658832U,	// <6,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+  1235666638U,	// <6,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+  3713241753U,	// <6,4,7,6>: Cost 4 vsldoi4 <3,6,4,7>, <6,4,7,0>
+  2309409436U,	// <6,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  1235666641U,	// <6,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u>
+  2639503462U,	// <6,4,u,0>: Cost 3 vsldoi4 <3,6,4,u>, LHS
+  2639504282U,	// <6,4,u,1>: Cost 3 vsldoi4 <3,6,4,u>, <1,2,3,4>
+  3701303226U,	// <6,4,u,2>: Cost 4 vsldoi4 <1,6,4,u>, <2,6,3,7>
+  2639506077U,	// <6,4,u,3>: Cost 3 vsldoi4 <3,6,4,u>, <3,6,4,u>
+  1235676368U,	// <6,4,u,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+  1235674830U,	// <6,4,u,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+  1698713129U,	// <6,4,u,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  2309417628U,	// <6,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  1698713147U,	// <6,4,u,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  3775832064U,	// <6,5,0,0>: Cost 4 vsldoi8 <2,u,6,5>, <0,0,0,0>
+  2702090342U,	// <6,5,0,1>: Cost 3 vsldoi8 <2,u,6,5>, LHS
+  3775832241U,	// <6,5,0,2>: Cost 4 vsldoi8 <2,u,6,5>, <0,2,1,6>
+  3719227906U,	// <6,5,0,3>: Cost 4 vsldoi4 <4,6,5,0>, <3,4,5,6>
+  3775832402U,	// <6,5,0,4>: Cost 4 vsldoi8 <2,u,6,5>, <0,4,1,5>
+  3385085146U,	// <6,5,0,5>: Cost 4 vmrglw <4,u,6,0>, <4,4,5,5>
+  2309351938U,	// <6,5,0,6>: Cost 3 vmrglw <4,5,6,0>, <3,4,5,6>
+  3376459134U,	// <6,5,0,7>: Cost 5 vmrglw <3,4,6,0>, <4,6,5,7>
+  2702090909U,	// <6,5,0,u>: Cost 3 vsldoi8 <2,u,6,5>, LHS
+  3719233546U,	// <6,5,1,0>: Cost 4 vsldoi4 <4,6,5,1>, <0,0,1,1>
+  3775832884U,	// <6,5,1,1>: Cost 4 vsldoi8 <2,u,6,5>, <1,1,1,1>
+  3775832982U,	// <6,5,1,2>: Cost 4 vsldoi8 <2,u,6,5>, <1,2,3,0>
+  3846196909U,	// <6,5,1,3>: Cost 4 vsldoi12 <3,4,5,6>, <5,1,3,4>
+  3719236984U,	// <6,5,1,4>: Cost 4 vsldoi4 <4,6,5,1>, <4,6,5,1>
+  3856150209U,	// <6,5,1,5>: Cost 4 vsldoi12 <5,1,5,6>, <5,1,5,6>
+  3834252997U,	// <6,5,1,6>: Cost 4 vsldoi12 <1,4,5,6>, <5,1,6,1>
+  3870084817U,	// <6,5,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,1,7,4>
+  3769861532U,	// <6,5,1,u>: Cost 4 vsldoi8 <1,u,6,5>, <1,u,6,5>
+  2645500006U,	// <6,5,2,0>: Cost 3 vsldoi4 <4,6,5,2>, LHS
+  3719242548U,	// <6,5,2,1>: Cost 4 vsldoi4 <4,6,5,2>, <1,1,1,1>
+  3775833704U,	// <6,5,2,2>: Cost 4 vsldoi8 <2,u,6,5>, <2,2,2,2>
+  3775833766U,	// <6,5,2,3>: Cost 4 vsldoi8 <2,u,6,5>, <2,3,0,1>
+  2645503353U,	// <6,5,2,4>: Cost 3 vsldoi4 <4,6,5,2>, <4,6,5,2>
+  2252296196U,	// <6,5,2,5>: Cost 3 vmrghw <6,2,7,3>, <5,5,5,5>
+  2702092218U,	// <6,5,2,6>: Cost 3 vsldoi8 <2,u,6,5>, <2,6,3,7>
+  3719246842U,	// <6,5,2,7>: Cost 4 vsldoi4 <4,6,5,2>, <7,0,1,2>
+  2702092405U,	// <6,5,2,u>: Cost 3 vsldoi8 <2,u,6,5>, <2,u,6,5>
+  3775834262U,	// <6,5,3,0>: Cost 4 vsldoi8 <2,u,6,5>, <3,0,1,2>
+  3777161495U,	// <6,5,3,1>: Cost 4 vsldoi8 <3,1,6,5>, <3,1,6,5>
+  3775834470U,	// <6,5,3,2>: Cost 4 vsldoi8 <2,u,6,5>, <3,2,6,3>
+  3775834524U,	// <6,5,3,3>: Cost 4 vsldoi8 <2,u,6,5>, <3,3,3,3>
+  3775834626U,	// <6,5,3,4>: Cost 4 vsldoi8 <2,u,6,5>, <3,4,5,6>
+  3385109722U,	// <6,5,3,5>: Cost 4 vmrglw <4,u,6,3>, <4,4,5,5>
+  2309376514U,	// <6,5,3,6>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6>
+  3775834819U,	// <6,5,3,7>: Cost 4 vsldoi8 <2,u,6,5>, <3,7,0,1>
+  2309376514U,	// <6,5,3,u>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6>
+  3719258214U,	// <6,5,4,0>: Cost 4 vsldoi4 <4,6,5,4>, LHS
+  3385117586U,	// <6,5,4,1>: Cost 4 vmrglw <4,u,6,4>, <4,0,5,1>
+  3327242008U,	// <6,5,4,2>: Cost 4 vmrghw <6,4,5,6>, <5,2,6,3>
+  3719260674U,	// <6,5,4,3>: Cost 4 vsldoi4 <4,6,5,4>, <3,4,5,6>
+  3719261563U,	// <6,5,4,4>: Cost 4 vsldoi4 <4,6,5,4>, <4,6,5,4>
+  2702093622U,	// <6,5,4,5>: Cost 3 vsldoi8 <2,u,6,5>, RHS
+  2309384706U,	// <6,5,4,6>: Cost 3 vmrglw <4,5,6,4>, <3,4,5,6>
+  3870085060U,	// <6,5,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,4,7,4>
+  2702093865U,	// <6,5,4,u>: Cost 3 vsldoi8 <2,u,6,5>, RHS
+  3719266406U,	// <6,5,5,0>: Cost 4 vsldoi4 <4,6,5,5>, LHS
+  3789106889U,	// <6,5,5,1>: Cost 4 vsldoi8 <5,1,6,5>, <5,1,6,5>
+  3785789208U,	// <6,5,5,2>: Cost 4 vsldoi8 <4,5,6,5>, <5,2,6,3>
+  3373183950U,	// <6,5,5,3>: Cost 4 vmrglw <2,u,6,5>, <6,u,5,3>
+  2717355964U,	// <6,5,5,4>: Cost 3 vsldoi8 <5,4,6,5>, <5,4,6,5>
+  2791772164U,	// <6,5,5,5>: Cost 3 vsldoi12 <6,6,6,6>, <5,5,5,5>
+  2772455438U,	// <6,5,5,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,5,6,6>
+  3373183549U,	// <6,5,5,7>: Cost 4 vmrglw <2,u,6,5>, <6,3,5,7>
+  2720010496U,	// <6,5,5,u>: Cost 3 vsldoi8 <5,u,6,5>, <5,u,6,5>
+  2772455460U,	// <6,5,6,0>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,0,1>
+  2322008978U,	// <6,5,6,1>: Cost 3 vmrglw <6,6,6,6>, <4,0,5,1>
+  3840225335U,	// <6,5,6,2>: Cost 4 vsldoi12 <2,4,5,6>, <5,6,2,2>
+  2772455490U,	// <6,5,6,3>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,3,4>
+  2772455500U,	// <6,5,6,4>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,4,5>
+  2254901252U,	// <6,5,6,5>: Cost 3 vmrghw <6,6,6,6>, <5,5,5,5>
+  2772455520U,	// <6,5,6,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,6,7>
+  2785874024U,	// <6,5,6,7>: Cost 3 vsldoi12 <5,6,7,6>, <5,6,7,6>
+  2772455532U,	// <6,5,6,u>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,u,1>
+  2627625062U,	// <6,5,7,0>: Cost 3 vsldoi4 <1,6,5,7>, LHS
+  1235667858U,	// <6,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+  2309409278U,	// <6,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+  2309407659U,	// <6,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+  2627628342U,	// <6,5,7,4>: Cost 3 vsldoi4 <1,6,5,7>, RHS
+  1235668186U,	// <6,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+  1235667458U,	// <6,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+  2309407987U,	// <6,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+  1235667460U,	// <6,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+  2627633254U,	// <6,5,u,0>: Cost 3 vsldoi4 <1,6,5,u>, LHS
+  1235676050U,	// <6,5,u,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+  2309417470U,	// <6,5,u,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+  2309415851U,	// <6,5,u,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+  2627636534U,	// <6,5,u,4>: Cost 3 vsldoi4 <1,6,5,u>, RHS
+  1235676378U,	// <6,5,u,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+  1235675650U,	// <6,5,u,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+  2309416179U,	// <6,5,u,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+  1235675652U,	// <6,5,u,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+  2309352751U,	// <6,6,0,0>: Cost 3 vmrglw <4,5,6,0>, <4,5,6,0>
+  1650917478U,	// <6,6,0,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+  2250584570U,	// <6,6,0,2>: Cost 3 vmrghw <6,0,2,1>, <6,2,7,3>
+  3846197554U,	// <6,6,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <6,0,3,1>
+  2724659538U,	// <6,6,0,4>: Cost 3 vsldoi8 <6,6,6,6>, <0,4,1,5>
+  3725275225U,	// <6,6,0,5>: Cost 4 vsldoi4 <5,6,6,0>, <5,6,6,0>
+  2791772493U,	// <6,6,0,6>: Cost 3 vsldoi12 <6,6,6,6>, <6,0,6,1>
+  2309352758U,	// <6,6,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS
+  1650918045U,	// <6,6,0,u>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+  3325358368U,	// <6,6,1,0>: Cost 4 vmrghw <6,1,7,1>, <6,0,1,1>
+  2299406449U,	// <6,6,1,1>: Cost 3 vmrglw <2,u,6,1>, <2,u,6,1>
+  2724660118U,	// <6,6,1,2>: Cost 3 vsldoi8 <6,6,6,6>, <1,2,3,0>
+  3373148518U,	// <6,6,1,3>: Cost 4 vmrglw <2,u,6,1>, <3,2,6,3>
+  3834253712U,	// <6,6,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <6,1,4,5>
+  3373147953U,	// <6,6,1,5>: Cost 4 vmrglw <2,u,6,1>, <2,4,6,5>
+  2323297080U,	// <6,6,1,6>: Cost 3 vmrglw <6,u,6,1>, <6,6,6,6>
+  2299407670U,	// <6,6,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS
+  2299407671U,	// <6,6,1,u>: Cost 3 vmrglw <2,u,6,1>, RHS
+  2252296489U,	// <6,6,2,0>: Cost 3 vmrghw <6,2,7,3>, <6,0,2,1>
+  3326038394U,	// <6,6,2,1>: Cost 4 vmrghw <6,2,7,3>, <6,1,2,1>
+  1178554874U,	// <6,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2724660902U,	// <6,6,2,3>: Cost 3 vsldoi8 <6,6,6,6>, <2,3,0,1>
+  2252296817U,	// <6,6,2,4>: Cost 3 vmrghw <6,2,7,3>, <6,4,2,5>
+  3840741864U,	// <6,6,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <6,2,5,3>
+  2252296976U,	// <6,6,2,6>: Cost 3 vmrghw <6,2,7,3>, <6,6,2,2>
+  2785874426U,	// <6,6,2,7>: Cost 3 vsldoi12 <5,6,7,6>, <6,2,7,3>
+  1178554874U,	// <6,6,2,u>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2724661398U,	// <6,6,3,0>: Cost 3 vsldoi8 <6,6,6,6>, <3,0,1,2>
+  3375154665U,	// <6,6,3,1>: Cost 4 vmrglw <3,2,6,3>, <2,0,6,1>
+  3375154909U,	// <6,6,3,2>: Cost 4 vmrglw <3,2,6,3>, <2,3,6,2>
+  2301413734U,	// <6,6,3,3>: Cost 3 vmrglw <3,2,6,3>, <3,2,6,3>
+  2772455986U,	// <6,6,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <6,3,4,5>
+  3375154993U,	// <6,6,3,5>: Cost 4 vmrglw <3,2,6,3>, <2,4,6,5>
+  2323313464U,	// <6,6,3,6>: Cost 3 vmrglw <6,u,6,3>, <6,6,6,6>
+  2301414710U,	// <6,6,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS
+  2301414711U,	// <6,6,3,u>: Cost 3 vmrglw <3,2,6,3>, RHS
+  2724662162U,	// <6,6,4,0>: Cost 3 vsldoi8 <6,6,6,6>, <4,0,5,1>
+  3326939559U,	// <6,6,4,1>: Cost 4 vmrghw <6,4,1,5>, <6,1,7,1>
+  2253271546U,	// <6,6,4,2>: Cost 3 vmrghw <6,4,2,5>, <6,2,7,3>
+  3383127346U,	// <6,6,4,3>: Cost 4 vmrglw <4,5,6,4>, <4,5,6,3>
+  2309385523U,	// <6,6,4,4>: Cost 3 vmrglw <4,5,6,4>, <4,5,6,4>
+  1650920758U,	// <6,6,4,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+  2724662653U,	// <6,6,4,6>: Cost 3 vsldoi8 <6,6,6,6>, <4,6,5,6>
+  2309385526U,	// <6,6,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS
+  1650921001U,	// <6,6,4,u>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+  3725312102U,	// <6,6,5,0>: Cost 4 vsldoi4 <5,6,6,5>, LHS
+  3373180393U,	// <6,6,5,1>: Cost 4 vmrglw <2,u,6,5>, <2,0,6,1>
+  3791769368U,	// <6,6,5,2>: Cost 4 vsldoi8 <5,5,6,6>, <5,2,6,3>
+  3373181286U,	// <6,6,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,6,3>
+  3725315382U,	// <6,6,5,4>: Cost 4 vsldoi4 <5,6,6,5>, RHS
+  2299439221U,	// <6,6,5,5>: Cost 3 vmrglw <2,u,6,5>, <2,u,6,5>
+  2724663394U,	// <6,6,5,6>: Cost 3 vsldoi8 <6,6,6,6>, <5,6,7,0>
+  2299440438U,	// <6,6,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS
+  2299440439U,	// <6,6,5,u>: Cost 3 vmrglw <2,u,6,5>, RHS
+  1583808614U,	// <6,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  2322010445U,	// <6,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1>
+  2254574074U,	// <6,6,6,2>: Cost 3 vmrghw <6,6,2,2>, <6,2,7,3>
+  2322010609U,	// <6,6,6,3>: Cost 3 vmrglw <6,6,6,6>, <6,2,6,3>
+  1583811894U,	// <6,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  2322010773U,	// <6,6,6,5>: Cost 3 vmrglw <6,6,6,6>, <6,4,6,5>
+  363253046U,	// <6,6,6,6>: Cost 1 vspltisw2 RHS
+  1248267574U,	// <6,6,6,7>: Cost 2 vmrglw <6,6,6,6>, RHS
+  363253046U,	// <6,6,6,u>: Cost 1 vspltisw2 RHS
+  2309410095U,	// <6,6,7,0>: Cost 3 vmrglw RHS, <4,5,6,0>
+  2309408233U,	// <6,6,7,1>: Cost 3 vmrglw RHS, <2,0,6,1>
+  2311402373U,	// <6,6,7,2>: Cost 3 vmrglw RHS, <6,7,6,2>
+  2309409126U,	// <6,6,7,3>: Cost 3 vmrglw RHS, <3,2,6,3>
+  2309410099U,	// <6,6,7,4>: Cost 3 vmrglw RHS, <4,5,6,4>
+  2309408561U,	// <6,6,7,5>: Cost 3 vmrglw RHS, <2,4,6,5>
+  1237660472U,	// <6,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6>
+  161926454U,	// <6,6,7,7>: Cost 1 vmrglw RHS, RHS
+  161926455U,	// <6,6,7,u>: Cost 1 vmrglw RHS, RHS
+  1583808614U,	// <6,6,u,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  1650923310U,	// <6,6,u,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+  1178554874U,	// <6,6,u,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2309417318U,	// <6,6,u,3>: Cost 3 vmrglw RHS, <3,2,6,3>
+  1583811894U,	// <6,6,u,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  1650923674U,	// <6,6,u,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+  363253046U,	// <6,6,u,6>: Cost 1 vspltisw2 RHS
+  161934646U,	// <6,6,u,7>: Cost 1 vmrglw RHS, RHS
+  161934647U,	// <6,6,u,u>: Cost 1 vmrglw RHS, RHS
+  1638318080U,	// <6,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+  564576358U,	// <6,7,0,1>: Cost 1 vsldoi8 RHS, LHS
+  2712060077U,	// <6,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+  2712060156U,	// <6,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0>
+  1638318418U,	// <6,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+  1577865314U,	// <6,7,0,5>: Cost 2 vsldoi4 <5,6,7,0>, <5,6,7,0>
+  2712060406U,	// <6,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+  2651608058U,	// <6,7,0,7>: Cost 3 vsldoi4 <5,6,7,0>, <7,0,1,2>
+  564576925U,	// <6,7,0,u>: Cost 1 vsldoi8 RHS, LHS
+  2712060643U,	// <6,7,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1>
+  1638318900U,	// <6,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+  1638318998U,	// <6,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0>
+  3766559753U,	// <6,7,1,3>: Cost 4 vsldoi8 <1,3,6,7>, <1,3,6,7>
+  2712060971U,	// <6,7,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5>
+  2712061039U,	// <6,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+  2712061135U,	// <6,7,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7>
+  3373148612U,	// <6,7,1,7>: Cost 4 vmrglw <2,u,6,1>, <3,3,7,7>
+  1638319484U,	// <6,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0>
+  2712061373U,	// <6,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+  2712061471U,	// <6,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1>
+  1638319720U,	// <6,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+  1638319782U,	// <6,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+  2712061709U,	// <6,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+  2712061800U,	// <6,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6>
+  1638320058U,	// <6,7,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7>
+  2252297836U,	// <6,7,2,7>: Cost 3 vmrghw <6,2,7,3>, <7,7,7,7>
+  1638320187U,	// <6,7,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1>
+  1638320278U,	// <6,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+  2712062182U,	// <6,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+  2712062256U,	// <6,7,3,2>: Cost 3 vsldoi8 RHS, <3,2,0,3>
+  1638320540U,	// <6,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+  1638320642U,	// <6,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+  2712062546U,	// <6,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+  2712062584U,	// <6,7,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7>
+  2712062659U,	// <6,7,3,7>: Cost 3 vsldoi8 RHS, <3,7,0,1>
+  1638320926U,	// <6,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+  1638321042U,	// <6,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+  2712062922U,	// <6,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+  2712063029U,	// <6,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+  2712063108U,	// <6,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0>
+  1638321360U,	// <6,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+  564579638U,	// <6,7,4,5>: Cost 1 vsldoi8 RHS, RHS
+  2712063357U,	// <6,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,6>
+  2712063439U,	// <6,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,7>
+  564579881U,	// <6,7,4,u>: Cost 1 vsldoi8 RHS, RHS
+  2712063560U,	// <6,7,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2>
+  2714054287U,	// <6,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+  2712063742U,	// <6,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+  3373181295U,	// <6,7,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,7,3>
+  2712063924U,	// <6,7,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6>
+  1638322180U,	// <6,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+  1638322274U,	// <6,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0>
+  3373181380U,	// <6,7,5,7>: Cost 4 vmrglw <2,u,6,5>, <3,3,7,7>
+  1640313092U,	// <6,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0>
+  2712064289U,	// <6,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2>
+  2712064423U,	// <6,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1>
+  1638322682U,	// <6,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  2712064562U,	// <6,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5>
+  2712064653U,	// <6,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6>
+  2712064747U,	// <6,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1>
+  1638323000U,	// <6,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6>
+  1638323022U,	// <6,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+  1638323168U,	// <6,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,7,3>
+  1237659746U,	// <6,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0>
+  2309411158U,	// <6,7,7,1>: Cost 3 vmrglw RHS, <6,0,7,1>
+  2639718330U,	// <6,7,7,2>: Cost 3 vsldoi4 <3,6,7,7>, <2,6,3,7>
+  1235669498U,	// <6,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3>
+  1237659750U,	// <6,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4>
+  2309411243U,	// <6,7,7,5>: Cost 3 vmrglw RHS, <6,1,7,5>
+  1583895362U,	// <6,7,7,6>: Cost 2 vsldoi4 <6,6,7,7>, <6,6,7,7>
+  1235669826U,	// <6,7,7,7>: Cost 2 vmrglw RHS, <6,6,7,7>
+  1235669503U,	// <6,7,7,u>: Cost 2 vmrglw RHS, <6,2,7,u>
+  1638323923U,	// <6,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2>
+  564582190U,	// <6,7,u,1>: Cost 1 vsldoi8 RHS, LHS
+  1638324101U,	// <6,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0>
+  1638324156U,	// <6,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1>
+  1638324287U,	// <6,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6>
+  564582554U,	// <6,7,u,5>: Cost 1 vsldoi8 RHS, RHS
+  1638324432U,	// <6,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7>
+  1235678018U,	// <6,7,u,7>: Cost 2 vmrglw RHS, <6,6,7,7>
+  564582757U,	// <6,7,u,u>: Cost 1 vsldoi8 RHS, LHS
+  1638326272U,	// <6,u,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+  564584550U,	// <6,u,0,1>: Cost 1 vsldoi8 RHS, LHS
+  2712068269U,	// <6,u,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+  2309349532U,	// <6,u,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS
+  1638326610U,	// <6,u,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+  1577939051U,	// <6,u,0,5>: Cost 2 vsldoi4 <5,6,u,0>, <5,6,u,0>
+  2712068598U,	// <6,u,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+  2309352776U,	// <6,u,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS
+  564585117U,	// <6,u,0,u>: Cost 1 vsldoi8 RHS, LHS
+  2712068835U,	// <6,u,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1>
+  1638327092U,	// <6,u,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+  1698715438U,	// <6,u,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2299404444U,	// <6,u,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS
+  2712069163U,	// <6,u,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5>
+  2712069231U,	// <6,u,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+  2712069327U,	// <6,u,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7>
+  2299407688U,	// <6,u,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS
+  1698715492U,	// <6,u,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2712069565U,	// <6,u,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+  1178556206U,	// <6,u,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS
+  1638327912U,	// <6,u,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+  1638327974U,	// <6,u,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+  2712069901U,	// <6,u,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+  1178556570U,	// <6,u,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS
+  1638328250U,	// <6,u,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7>
+  2252298496U,	// <6,u,2,7>: Cost 3 vmrghw <6,2,7,3>, <u,7,0,1>
+  1638328379U,	// <6,u,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1>
+  1638328470U,	// <6,u,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+  2712070374U,	// <6,u,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+  2704107883U,	// <6,u,3,2>: Cost 3 vsldoi8 <3,2,6,u>, <3,2,6,u>
+  1638328732U,	// <6,u,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+  1638328834U,	// <6,u,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+  2712070738U,	// <6,u,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+  2712070776U,	// <6,u,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7>
+  2301414728U,	// <6,u,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS
+  1638329118U,	// <6,u,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+  1638329234U,	// <6,u,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+  2712071114U,	// <6,u,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+  2712071221U,	// <6,u,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+  2309382300U,	// <6,u,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS
+  1638329552U,	// <6,u,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+  564587831U,	// <6,u,4,5>: Cost 1 vsldoi8 RHS, RHS
+  2712071545U,	// <6,u,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2>
+  2309385544U,	// <6,u,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS
+  564588073U,	// <6,u,4,u>: Cost 1 vsldoi8 RHS, RHS
+  2712071752U,	// <6,u,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2>
+  2714062479U,	// <6,u,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+  2712071934U,	// <6,u,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+  2299437212U,	// <6,u,5,3>: Cost 3 vmrglw <2,u,6,5>, LHS
+  2712072116U,	// <6,u,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6>
+  1638330372U,	// <6,u,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+  1698715802U,	// <6,u,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  2299440456U,	// <6,u,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS
+  1698715820U,	// <6,u,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  1583808614U,	// <6,u,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  1181161262U,	// <6,u,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS
+  1638330874U,	// <6,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  1248264348U,	// <6,u,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS
+  1583811894U,	// <6,u,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  1181161626U,	// <6,u,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS
+  363253046U,	// <6,u,6,6>: Cost 1 vspltisw2 RHS
+  1638331214U,	// <6,u,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+  363253046U,	// <6,u,6,u>: Cost 1 vspltisw2 RHS
+  1560076390U,	// <6,u,7,0>: Cost 2 vsldoi4 <2,6,u,7>, LHS
+  1235664969U,	// <6,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1>
+  1560078311U,	// <6,u,7,2>: Cost 2 vsldoi4 <2,6,u,7>, <2,6,u,7>
+  161923228U,	// <6,u,7,3>: Cost 1 vmrglw RHS, LHS
+  1560079670U,	// <6,u,7,4>: Cost 2 vsldoi4 <2,6,u,7>, RHS
+  1235665297U,	// <6,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5>
+  1235667485U,	// <6,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6>
+  161926472U,	// <6,u,7,7>: Cost 1 vmrglw RHS, RHS
+  161923233U,	// <6,u,7,u>: Cost 1 vmrglw RHS, LHS
+  1560084582U,	// <6,u,u,0>: Cost 2 vsldoi4 <2,6,u,u>, LHS
+  564590382U,	// <6,u,u,1>: Cost 1 vsldoi8 RHS, LHS
+  1560086504U,	// <6,u,u,2>: Cost 2 vsldoi4 <2,6,u,u>, <2,6,u,u>
+  161931420U,	// <6,u,u,3>: Cost 1 vmrglw RHS, LHS
+  1560087862U,	// <6,u,u,4>: Cost 2 vsldoi4 <2,6,u,u>, RHS
+  564590746U,	// <6,u,u,5>: Cost 1 vsldoi8 RHS, RHS
+  363253046U,	// <6,u,u,6>: Cost 1 vspltisw2 RHS
+  161934664U,	// <6,u,u,7>: Cost 1 vmrglw RHS, RHS
+  161931425U,	// <6,u,u,u>: Cost 1 vmrglw RHS, LHS
+  1705426944U,	// <7,0,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0>
+  1705426954U,	// <7,0,0,1>: Cost 2 vsldoi12 RHS, <0,0,1,1>
+  3713550266U,	// <7,0,0,2>: Cost 4 vsldoi4 <3,7,0,0>, <2,6,3,7>
+  2316063892U,	// <7,0,0,3>: Cost 3 vmrglw <5,6,7,0>, <7,2,0,3>
+  2779168805U,	// <7,0,0,4>: Cost 3 vsldoi12 RHS, <0,0,4,1>
+  2663698530U,	// <7,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0>
+  2657727309U,	// <7,0,0,6>: Cost 3 vsldoi4 <6,7,0,0>, <6,7,0,0>
+  2316064220U,	// <7,0,0,7>: Cost 3 vmrglw <5,6,7,0>, <7,6,0,7>
+  1705427017U,	// <7,0,0,u>: Cost 2 vsldoi12 RHS, <0,0,u,1>
+  1583988838U,	// <7,0,1,0>: Cost 2 vsldoi4 <6,7,0,1>, LHS
+  2779168859U,	// <7,0,1,1>: Cost 3 vsldoi12 RHS, <0,1,1,1>
+  631685222U,	// <7,0,1,2>: Cost 1 vsldoi12 RHS, LHS
+  2639817411U,	// <7,0,1,3>: Cost 3 vsldoi4 <3,7,0,1>, <3,7,0,1>
+  1583992118U,	// <7,0,1,4>: Cost 2 vsldoi4 <6,7,0,1>, RHS
+  2657734660U,	// <7,0,1,5>: Cost 3 vsldoi4 <6,7,0,1>, <5,5,5,5>
+  1583993678U,	// <7,0,1,6>: Cost 2 vsldoi4 <6,7,0,1>, <6,7,0,1>
+  2657735672U,	// <7,0,1,7>: Cost 3 vsldoi4 <6,7,0,1>, <7,0,1,0>
+  631685276U,	// <7,0,1,u>: Cost 1 vsldoi12 RHS, LHS
+  2779168933U,	// <7,0,2,0>: Cost 3 vsldoi12 RHS, <0,2,0,3>
+  2767667377U,	// <7,0,2,1>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,1,6>
+  2718713448U,	// <7,0,2,2>: Cost 3 vsldoi8 <5,6,7,0>, <2,2,2,2>
+  2718713510U,	// <7,0,2,3>: Cost 3 vsldoi8 <5,6,7,0>, <2,3,0,1>
+  3841409228U,	// <7,0,2,4>: Cost 4 vsldoi12 <2,6,3,7>, <0,2,4,6>
+  3852910802U,	// <7,0,2,5>: Cost 4 vsldoi12 RHS, <0,2,5,3>
+  2718713786U,	// <7,0,2,6>: Cost 3 vsldoi8 <5,6,7,0>, <2,6,3,7>
+  3847160036U,	// <7,0,2,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,2,7,3>
+  2767667440U,	// <7,0,2,u>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,u,6>
+  2718714006U,	// <7,0,3,0>: Cost 3 vsldoi8 <5,6,7,0>, <3,0,1,2>
+  2779169020U,	// <7,0,3,1>: Cost 3 vsldoi12 RHS, <0,3,1,0>
+  3852910853U,	// <7,0,3,2>: Cost 4 vsldoi12 RHS, <0,3,2,0>
+  2718714268U,	// <7,0,3,3>: Cost 3 vsldoi8 <5,6,7,0>, <3,3,3,3>
+  2718714370U,	// <7,0,3,4>: Cost 3 vsldoi8 <5,6,7,0>, <3,4,5,6>
+  2718714461U,	// <7,0,3,5>: Cost 3 vsldoi8 <5,6,7,0>, <3,5,6,7>
+  2706770608U,	// <7,0,3,6>: Cost 3 vsldoi8 <3,6,7,0>, <3,6,7,0>
+  3847160114U,	// <7,0,3,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,3,7,0>
+  2779169083U,	// <7,0,3,u>: Cost 3 vsldoi12 RHS, <0,3,u,0>
+  2718714770U,	// <7,0,4,0>: Cost 3 vsldoi8 <5,6,7,0>, <4,0,5,1>
+  1705427282U,	// <7,0,4,1>: Cost 2 vsldoi12 RHS, <0,4,1,5>
+  3713583034U,	// <7,0,4,2>: Cost 4 vsldoi4 <3,7,0,4>, <2,6,3,7>
+  3713583814U,	// <7,0,4,3>: Cost 4 vsldoi4 <3,7,0,4>, <3,7,0,4>
+  2779169133U,	// <7,0,4,4>: Cost 3 vsldoi12 RHS, <0,4,4,5>
+  1644973366U,	// <7,0,4,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS
+  2657760081U,	// <7,0,4,6>: Cost 3 vsldoi4 <6,7,0,4>, <6,7,0,4>
+  2259468868U,	// <7,0,4,7>: Cost 3 vmrghw <7,4,5,6>, <0,7,1,4>
+  1705427345U,	// <7,0,4,u>: Cost 2 vsldoi12 RHS, <0,4,u,5>
+  2718715508U,	// <7,0,5,0>: Cost 3 vsldoi8 <5,6,7,0>, <5,0,6,1>
+  2260123750U,	// <7,0,5,1>: Cost 3 vmrghw <7,5,5,5>, LHS
+  3792457451U,	// <7,0,5,2>: Cost 4 vsldoi8 <5,6,7,0>, <5,2,1,3>
+  3852911024U,	// <7,0,5,3>: Cost 4 vsldoi12 RHS, <0,5,3,0>
+  2718715836U,	// <7,0,5,4>: Cost 3 vsldoi8 <5,6,7,0>, <5,4,6,5>
+  2718715908U,	// <7,0,5,5>: Cost 3 vsldoi8 <5,6,7,0>, <5,5,5,5>
+  1644974178U,	// <7,0,5,6>: Cost 2 vsldoi8 <5,6,7,0>, <5,6,7,0>
+  3792457853U,	// <7,0,5,7>: Cost 4 vsldoi8 <5,6,7,0>, <5,7,1,0>
+  1646301444U,	// <7,0,5,u>: Cost 2 vsldoi8 <5,u,7,0>, <5,u,7,0>
+  2720706901U,	// <7,0,6,0>: Cost 3 vsldoi8 <6,0,7,0>, <6,0,7,0>
+  2779169270U,	// <7,0,6,1>: Cost 3 vsldoi12 RHS, <0,6,1,7>
+  2718716410U,	// <7,0,6,2>: Cost 3 vsldoi8 <5,6,7,0>, <6,2,7,3>
+  2722697800U,	// <7,0,6,3>: Cost 3 vsldoi8 <6,3,7,0>, <6,3,7,0>
+  3852911121U,	// <7,0,6,4>: Cost 4 vsldoi12 RHS, <0,6,4,7>
+  3852911130U,	// <7,0,6,5>: Cost 4 vsldoi12 RHS, <0,6,5,7>
+  2718716728U,	// <7,0,6,6>: Cost 3 vsldoi8 <5,6,7,0>, <6,6,6,6>
+  2718716750U,	// <7,0,6,7>: Cost 3 vsldoi8 <5,6,7,0>, <6,7,0,1>
+  2779169333U,	// <7,0,6,u>: Cost 3 vsldoi12 RHS, <0,6,u,7>
+  2718716922U,	// <7,0,7,0>: Cost 3 vsldoi8 <5,6,7,0>, <7,0,1,2>
+  1187872870U,	// <7,0,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS
+  2718717076U,	// <7,0,7,2>: Cost 3 vsldoi8 <5,6,7,0>, <7,2,0,3>
+  3847160408U,	// <7,0,7,3>: Cost 4 vsldoi12 <3,6,0,7>, <0,7,3,6>
+  2718717286U,	// <7,0,7,4>: Cost 3 vsldoi8 <5,6,7,0>, <7,4,5,6>
+  2718717377U,	// <7,0,7,5>: Cost 3 vsldoi8 <5,6,7,0>, <7,5,6,7>
+  2718717404U,	// <7,0,7,6>: Cost 3 vsldoi8 <5,6,7,0>, <7,6,0,7>
+  2718717478U,	// <7,0,7,7>: Cost 3 vsldoi8 <5,6,7,0>, <7,7,0,0>
+  1187873437U,	// <7,0,7,u>: Cost 2 vmrghw <7,7,7,7>, LHS
+  1584046182U,	// <7,0,u,0>: Cost 2 vsldoi4 <6,7,0,u>, LHS
+  1705427602U,	// <7,0,u,1>: Cost 2 vsldoi12 RHS, <0,u,1,1>
+  631685789U,	// <7,0,u,2>: Cost 1 vsldoi12 RHS, LHS
+  2639874762U,	// <7,0,u,3>: Cost 3 vsldoi4 <3,7,0,u>, <3,7,0,u>
+  1584049462U,	// <7,0,u,4>: Cost 2 vsldoi4 <6,7,0,u>, RHS
+  1644976282U,	// <7,0,u,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS
+  1584051029U,	// <7,0,u,6>: Cost 2 vsldoi4 <6,7,0,u>, <6,7,0,u>
+  2718718208U,	// <7,0,u,7>: Cost 3 vsldoi8 <5,6,7,0>, <u,7,0,1>
+  631685843U,	// <7,0,u,u>: Cost 1 vsldoi12 RHS, LHS
+  2721374218U,	// <7,1,0,0>: Cost 3 vsldoi8 <6,1,7,1>, <0,0,1,1>
+  2779169507U,	// <7,1,0,1>: Cost 3 vsldoi12 RHS, <1,0,1,1>
+  2779169516U,	// <7,1,0,2>: Cost 3 vsldoi12 RHS, <1,0,2,1>
+  3852911348U,	// <7,1,0,3>: Cost 4 vsldoi12 RHS, <1,0,3,0>
+  2669743414U,	// <7,1,0,4>: Cost 3 vsldoi4 <u,7,1,0>, RHS
+  2316058962U,	// <7,1,0,5>: Cost 3 vmrglw <5,6,7,0>, <0,4,1,5>
+  2316059044U,	// <7,1,0,6>: Cost 3 vmrglw <5,6,7,0>, <0,5,1,6>
+  2669745146U,	// <7,1,0,7>: Cost 3 vsldoi4 <u,7,1,0>, <7,0,1,2>
+  2779169570U,	// <7,1,0,u>: Cost 3 vsldoi12 RHS, <1,0,u,1>
+  2779169579U,	// <7,1,1,0>: Cost 3 vsldoi12 RHS, <1,1,0,1>
+  1705427764U,	// <7,1,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  2779169598U,	// <7,1,1,2>: Cost 3 vsldoi12 RHS, <1,1,2,2>
+  3713632972U,	// <7,1,1,3>: Cost 4 vsldoi4 <3,7,1,1>, <3,7,1,1>
+  2779169619U,	// <7,1,1,4>: Cost 3 vsldoi12 RHS, <1,1,4,5>
+  2779169628U,	// <7,1,1,5>: Cost 3 vsldoi12 RHS, <1,1,5,5>
+  2657809239U,	// <7,1,1,6>: Cost 3 vsldoi4 <6,7,1,1>, <6,7,1,1>
+  3835290474U,	// <7,1,1,7>: Cost 4 vsldoi12 <1,6,1,7>, <1,1,7,1>
+  1705427764U,	// <7,1,1,u>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  2779169660U,	// <7,1,2,0>: Cost 3 vsldoi12 RHS, <1,2,0,1>
+  2779169671U,	// <7,1,2,1>: Cost 3 vsldoi12 RHS, <1,2,1,3>
+  2779169680U,	// <7,1,2,2>: Cost 3 vsldoi12 RHS, <1,2,2,3>
+  1705427862U,	// <7,1,2,3>: Cost 2 vsldoi12 RHS, <1,2,3,0>
+  2779169700U,	// <7,1,2,4>: Cost 3 vsldoi12 RHS, <1,2,4,5>
+  2779169707U,	// <7,1,2,5>: Cost 3 vsldoi12 RHS, <1,2,5,3>
+  2657817432U,	// <7,1,2,6>: Cost 3 vsldoi4 <6,7,1,2>, <6,7,1,2>
+  2803057594U,	// <7,1,2,7>: Cost 3 vsldoi12 RHS, <1,2,7,0>
+  1705427907U,	// <7,1,2,u>: Cost 2 vsldoi12 RHS, <1,2,u,0>
+  3776538827U,	// <7,1,3,0>: Cost 4 vsldoi8 <3,0,7,1>, <3,0,7,1>
+  2319400970U,	// <7,1,3,1>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,1>
+  2316085398U,	// <7,1,3,2>: Cost 3 vmrglw <5,6,7,3>, <3,0,1,2>
+  3852911591U,	// <7,1,3,3>: Cost 4 vsldoi12 RHS, <1,3,3,0>
+  3852911600U,	// <7,1,3,4>: Cost 4 vsldoi12 RHS, <1,3,4,0>
+  2319401298U,	// <7,1,3,5>: Cost 3 vmrglw <6,2,7,3>, <0,4,1,5>
+  3833668617U,	// <7,1,3,6>: Cost 4 vsldoi12 <1,3,6,7>, <1,3,6,7>
+  3367265487U,	// <7,1,3,7>: Cost 4 vmrglw <1,u,7,3>, <1,6,1,7>
+  2319400977U,	// <7,1,3,u>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,u>
+  2724031378U,	// <7,1,4,0>: Cost 3 vsldoi8 <6,5,7,1>, <4,0,5,1>
+  2779169835U,	// <7,1,4,1>: Cost 3 vsldoi12 RHS, <1,4,1,5>
+  2779169844U,	// <7,1,4,2>: Cost 3 vsldoi12 RHS, <1,4,2,5>
+  3852911672U,	// <7,1,4,3>: Cost 4 vsldoi12 RHS, <1,4,3,0>
+  2669776182U,	// <7,1,4,4>: Cost 3 vsldoi4 <u,7,1,4>, RHS
+  2779169872U,	// <7,1,4,5>: Cost 3 vsldoi12 RHS, <1,4,5,6>
+  3835290712U,	// <7,1,4,6>: Cost 4 vsldoi12 <1,6,1,7>, <1,4,6,5>
+  2669778278U,	// <7,1,4,7>: Cost 3 vsldoi4 <u,7,1,4>, <7,4,5,6>
+  2779169898U,	// <7,1,4,u>: Cost 3 vsldoi12 RHS, <1,4,u,5>
+  2779169903U,	// <7,1,5,0>: Cost 3 vsldoi12 RHS, <1,5,0,1>
+  3835585661U,	// <7,1,5,1>: Cost 4 vsldoi12 <1,6,5,7>, <1,5,1,6>
+  3841410182U,	// <7,1,5,2>: Cost 4 vsldoi12 <2,6,3,7>, <1,5,2,6>
+  3852911753U,	// <7,1,5,3>: Cost 4 vsldoi12 RHS, <1,5,3,0>
+  2779169943U,	// <7,1,5,4>: Cost 3 vsldoi12 RHS, <1,5,4,5>
+  2318754130U,	// <7,1,5,5>: Cost 3 vmrglw <6,1,7,5>, <0,4,1,5>
+  2718724195U,	// <7,1,5,6>: Cost 3 vsldoi8 <5,6,7,1>, <5,6,7,1>
+  3859178670U,	// <7,1,5,7>: Cost 4 vsldoi12 <5,6,1,7>, <1,5,7,1>
+  2779169975U,	// <7,1,5,u>: Cost 3 vsldoi12 RHS, <1,5,u,1>
+  2720715094U,	// <7,1,6,0>: Cost 3 vsldoi8 <6,0,7,1>, <6,0,7,1>
+  2761549007U,	// <7,1,6,1>: Cost 3 vsldoi12 <1,6,1,7>, <1,6,1,7>
+  2779170008U,	// <7,1,6,2>: Cost 3 vsldoi12 RHS, <1,6,2,7>
+  3835438305U,	// <7,1,6,3>: Cost 4 vsldoi12 <1,6,3,7>, <1,6,3,7>
+  3835512042U,	// <7,1,6,4>: Cost 4 vsldoi12 <1,6,4,7>, <1,6,4,7>
+  2761843955U,	// <7,1,6,5>: Cost 3 vsldoi12 <1,6,5,7>, <1,6,5,7>
+  3835659516U,	// <7,1,6,6>: Cost 4 vsldoi12 <1,6,6,7>, <1,6,6,7>
+  2803057918U,	// <7,1,6,7>: Cost 3 vsldoi12 RHS, <1,6,7,0>
+  2762065166U,	// <7,1,6,u>: Cost 3 vsldoi12 <1,6,u,7>, <1,6,u,7>
+  2669797478U,	// <7,1,7,0>: Cost 3 vsldoi4 <u,7,1,7>, LHS
+  2322087946U,	// <7,1,7,1>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,1>
+  2317448186U,	// <7,1,7,2>: Cost 3 vmrglw <5,u,7,7>, <7,0,1,2>
+  3395829934U,	// <7,1,7,3>: Cost 4 vmrglw <6,6,7,7>, <0,2,1,3>
+  2669800758U,	// <7,1,7,4>: Cost 3 vsldoi4 <u,7,1,7>, RHS
+  2322088274U,	// <7,1,7,5>: Cost 3 vmrglw <6,6,7,7>, <0,4,1,5>
+  3375923377U,	// <7,1,7,6>: Cost 4 vmrglw <3,3,7,7>, <0,2,1,6>
+  2731996780U,	// <7,1,7,7>: Cost 3 vsldoi8 <7,u,7,1>, <7,7,7,7>
+  2322087953U,	// <7,1,7,u>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,u>
+  2779170146U,	// <7,1,u,0>: Cost 3 vsldoi12 RHS, <1,u,0,1>
+  1705427764U,	// <7,1,u,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  2779170164U,	// <7,1,u,2>: Cost 3 vsldoi12 RHS, <1,u,2,1>
+  1705428348U,	// <7,1,u,3>: Cost 2 vsldoi12 RHS, <1,u,3,0>
+  2779170186U,	// <7,1,u,4>: Cost 3 vsldoi12 RHS, <1,u,4,5>
+  2763171221U,	// <7,1,u,5>: Cost 3 vsldoi12 <1,u,5,7>, <1,u,5,7>
+  2657866590U,	// <7,1,u,6>: Cost 3 vsldoi4 <6,7,1,u>, <6,7,1,u>
+  2803058080U,	// <7,1,u,7>: Cost 3 vsldoi12 RHS, <1,u,7,0>
+  1705428393U,	// <7,1,u,u>: Cost 2 vsldoi12 RHS, <1,u,u,0>
+  3713695846U,	// <7,2,0,0>: Cost 4 vsldoi4 <3,7,2,0>, LHS
+  2779170237U,	// <7,2,0,1>: Cost 3 vsldoi12 RHS, <2,0,1,2>
+  2779170245U,	// <7,2,0,2>: Cost 3 vsldoi12 RHS, <2,0,2,1>
+  1242316902U,	// <7,2,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS
+  3713699126U,	// <7,2,0,4>: Cost 4 vsldoi4 <3,7,2,0>, RHS
+  3852912096U,	// <7,2,0,5>: Cost 4 vsldoi12 RHS, <2,0,5,1>
+  2767668713U,	// <7,2,0,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,0,6,1>
+  2256488426U,	// <7,2,0,7>: Cost 3 vmrghw <7,0,1,2>, <2,7,0,1>
+  1242316907U,	// <7,2,0,u>: Cost 2 vmrglw <5,6,7,0>, LHS
+  3852912132U,	// <7,2,1,0>: Cost 4 vsldoi12 RHS, <2,1,0,1>
+  3852912141U,	// <7,2,1,1>: Cost 4 vsldoi12 RHS, <2,1,1,1>
+  3852912149U,	// <7,2,1,2>: Cost 4 vsldoi12 RHS, <2,1,2,0>
+  2779170335U,	// <7,2,1,3>: Cost 3 vsldoi12 RHS, <2,1,3,1>
+  3852912172U,	// <7,2,1,4>: Cost 4 vsldoi12 RHS, <2,1,4,5>
+  3840747062U,	// <7,2,1,5>: Cost 5 vsldoi12 <2,5,3,7>, <2,1,5,6>
+  3841410617U,	// <7,2,1,6>: Cost 4 vsldoi12 <2,6,3,7>, <2,1,6,0>
+  3795125538U,	// <7,2,1,7>: Cost 4 vsldoi8 <6,1,7,2>, <1,7,2,0>
+  2779170380U,	// <7,2,1,u>: Cost 3 vsldoi12 RHS, <2,1,u,1>
+  2779170389U,	// <7,2,2,0>: Cost 3 vsldoi12 RHS, <2,2,0,1>
+  3852912222U,	// <7,2,2,1>: Cost 4 vsldoi12 RHS, <2,2,1,1>
+  1705428584U,	// <7,2,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+  1705428594U,	// <7,2,2,3>: Cost 2 vsldoi12 RHS, <2,2,3,3>
+  2779170429U,	// <7,2,2,4>: Cost 3 vsldoi12 RHS, <2,2,4,5>
+  3852912259U,	// <7,2,2,5>: Cost 4 vsldoi12 RHS, <2,2,5,2>
+  2767668880U,	// <7,2,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,2,6,6>
+  3841336981U,	// <7,2,2,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,2,7,2>
+  1705428639U,	// <7,2,2,u>: Cost 2 vsldoi12 RHS, <2,2,u,3>
+  1705428646U,	// <7,2,3,0>: Cost 2 vsldoi12 RHS, <2,3,0,1>
+  2779170479U,	// <7,2,3,1>: Cost 3 vsldoi12 RHS, <2,3,1,1>
+  2767668925U,	// <7,2,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <2,3,2,6>
+  1245659238U,	// <7,2,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS
+  1705428686U,	// <7,2,3,4>: Cost 2 vsldoi12 RHS, <2,3,4,5>
+  2779170519U,	// <7,2,3,5>: Cost 3 vsldoi12 RHS, <2,3,5,5>
+  2657899362U,	// <7,2,3,6>: Cost 3 vsldoi4 <6,7,2,3>, <6,7,2,3>
+  2319406574U,	// <7,2,3,7>: Cost 3 vmrglw <6,2,7,3>, <7,6,2,7>
+  1705428718U,	// <7,2,3,u>: Cost 2 vsldoi12 RHS, <2,3,u,1>
+  3713728614U,	// <7,2,4,0>: Cost 4 vsldoi4 <3,7,2,4>, LHS
+  3852912388U,	// <7,2,4,1>: Cost 4 vsldoi12 RHS, <2,4,1,5>
+  2779170573U,	// <7,2,4,2>: Cost 3 vsldoi12 RHS, <2,4,2,5>
+  1242349670U,	// <7,2,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS
+  3713731894U,	// <7,2,4,4>: Cost 4 vsldoi4 <3,7,2,4>, RHS
+  2779170601U,	// <7,2,4,5>: Cost 3 vsldoi12 RHS, <2,4,5,6>
+  2767669041U,	// <7,2,4,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,4,6,5>
+  3389834456U,	// <7,2,4,7>: Cost 4 vmrglw <5,6,7,4>, <1,6,2,7>
+  1242349675U,	// <7,2,4,u>: Cost 2 vmrglw <5,6,7,4>, LHS
+  3852912456U,	// <7,2,5,0>: Cost 4 vsldoi12 RHS, <2,5,0,1>
+  3852912466U,	// <7,2,5,1>: Cost 4 vsldoi12 RHS, <2,5,1,2>
+  3852912475U,	// <7,2,5,2>: Cost 4 vsldoi12 RHS, <2,5,2,2>
+  2779170664U,	// <7,2,5,3>: Cost 3 vsldoi12 RHS, <2,5,3,6>
+  3852912496U,	// <7,2,5,4>: Cost 4 vsldoi12 RHS, <2,5,4,5>
+  3792474116U,	// <7,2,5,5>: Cost 4 vsldoi8 <5,6,7,2>, <5,5,5,5>
+  2718732388U,	// <7,2,5,6>: Cost 3 vsldoi8 <5,6,7,2>, <5,6,7,2>
+  3841337228U,	// <7,2,5,7>: Cost 5 vsldoi12 <2,6,2,7>, <2,5,7,6>
+  2779170709U,	// <7,2,5,u>: Cost 3 vsldoi12 RHS, <2,5,u,6>
+  2640003174U,	// <7,2,6,0>: Cost 3 vsldoi4 <3,7,2,6>, LHS
+  2721386920U,	// <7,2,6,1>: Cost 3 vsldoi8 <6,1,7,2>, <6,1,7,2>
+  2767595441U,	// <7,2,6,2>: Cost 3 vsldoi12 <2,6,2,7>, <2,6,2,7>
+  1693927354U,	// <7,2,6,3>: Cost 2 vsldoi12 <2,6,3,7>, <2,6,3,7>
+  2640006454U,	// <7,2,6,4>: Cost 3 vsldoi4 <3,7,2,6>, RHS
+  3841558476U,	// <7,2,6,5>: Cost 4 vsldoi12 <2,6,5,7>, <2,6,5,7>
+  2657923941U,	// <7,2,6,6>: Cost 3 vsldoi4 <6,7,2,6>, <6,7,2,6>
+  3841337310U,	// <7,2,6,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,6,7,7>
+  1694296039U,	// <7,2,6,u>: Cost 2 vsldoi12 <2,6,u,7>, <2,6,u,7>
+  2803058666U,	// <7,2,7,0>: Cost 3 vsldoi12 RHS, <2,7,0,1>
+  3852912632U,	// <7,2,7,1>: Cost 4 vsldoi12 RHS, <2,7,1,6>
+  2322089576U,	// <7,2,7,2>: Cost 3 vmrglw <6,6,7,7>, <2,2,2,2>
+  1248346214U,	// <7,2,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS
+  3841337362U,	// <7,2,7,4>: Cost 4 vsldoi12 <2,6,2,7>, <2,7,4,5>
+  3395830836U,	// <7,2,7,5>: Cost 4 vmrglw <6,6,7,7>, <1,4,2,5>
+  2261616570U,	// <7,2,7,6>: Cost 3 vmrghw <7,7,7,7>, <2,6,3,7>
+  3371943857U,	// <7,2,7,7>: Cost 4 vmrglw <2,6,7,7>, <2,6,2,7>
+  1248346219U,	// <7,2,7,u>: Cost 2 vmrglw <6,6,7,7>, LHS
+  1705429051U,	// <7,2,u,0>: Cost 2 vsldoi12 RHS, <2,u,0,1>
+  2779170884U,	// <7,2,u,1>: Cost 3 vsldoi12 RHS, <2,u,1,1>
+  1705428584U,	// <7,2,u,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+  1695254620U,	// <7,2,u,3>: Cost 2 vsldoi12 <2,u,3,7>, <2,u,3,7>
+  1705429091U,	// <7,2,u,4>: Cost 2 vsldoi12 RHS, <2,u,4,5>
+  2779170924U,	// <7,2,u,5>: Cost 3 vsldoi12 RHS, <2,u,5,5>
+  2767669361U,	// <7,2,u,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,u,6,1>
+  2803058809U,	// <7,2,u,7>: Cost 3 vsldoi12 RHS, <2,u,7,0>
+  1695623305U,	// <7,2,u,u>: Cost 2 vsldoi12 <2,u,u,7>, <2,u,u,7>
+  2779170955U,	// <7,3,0,0>: Cost 3 vsldoi12 RHS, <3,0,0,0>
+  1705429142U,	// <7,3,0,1>: Cost 2 vsldoi12 RHS, <3,0,1,2>
+  2634057732U,	// <7,3,0,2>: Cost 3 vsldoi4 <2,7,3,0>, <2,7,3,0>
+  2779170983U,	// <7,3,0,3>: Cost 3 vsldoi12 RHS, <3,0,3,1>
+  2779170992U,	// <7,3,0,4>: Cost 3 vsldoi12 RHS, <3,0,4,1>
+  3852912829U,	// <7,3,0,5>: Cost 4 vsldoi12 RHS, <3,0,5,5>
+  2657948520U,	// <7,3,0,6>: Cost 3 vsldoi4 <6,7,3,0>, <6,7,3,0>
+  2316060602U,	// <7,3,0,7>: Cost 3 vmrglw <5,6,7,0>, <2,6,3,7>
+  1705429205U,	// <7,3,0,u>: Cost 2 vsldoi12 RHS, <3,0,u,2>
+  3852912860U,	// <7,3,1,0>: Cost 4 vsldoi12 RHS, <3,1,0,0>
+  2779171046U,	// <7,3,1,1>: Cost 3 vsldoi12 RHS, <3,1,1,1>
+  2779171057U,	// <7,3,1,2>: Cost 3 vsldoi12 RHS, <3,1,2,3>
+  3852912887U,	// <7,3,1,3>: Cost 4 vsldoi12 RHS, <3,1,3,0>
+  3852912896U,	// <7,3,1,4>: Cost 4 vsldoi12 RHS, <3,1,4,0>
+  3852912905U,	// <7,3,1,5>: Cost 4 vsldoi12 RHS, <3,1,5,0>
+  3835291923U,	// <7,3,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <3,1,6,1>
+  3841411356U,	// <7,3,1,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,1,7,1>
+  2779171111U,	// <7,3,1,u>: Cost 3 vsldoi12 RHS, <3,1,u,3>
+  2779171120U,	// <7,3,2,0>: Cost 3 vsldoi12 RHS, <3,2,0,3>
+  3852912952U,	// <7,3,2,1>: Cost 4 vsldoi12 RHS, <3,2,1,2>
+  2779171137U,	// <7,3,2,2>: Cost 3 vsldoi12 RHS, <3,2,2,2>
+  2779171144U,	// <7,3,2,3>: Cost 3 vsldoi12 RHS, <3,2,3,0>
+  2779171156U,	// <7,3,2,4>: Cost 3 vsldoi12 RHS, <3,2,4,3>
+  3852912989U,	// <7,3,2,5>: Cost 4 vsldoi12 RHS, <3,2,5,3>
+  2767669606U,	// <7,3,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,6,3>
+  2767669615U,	// <7,3,2,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,7,3>
+  2779171189U,	// <7,3,2,u>: Cost 3 vsldoi12 RHS, <3,2,u,0>
+  2779171198U,	// <7,3,3,0>: Cost 3 vsldoi12 RHS, <3,3,0,0>
+  3852913032U,	// <7,3,3,1>: Cost 4 vsldoi12 RHS, <3,3,1,1>
+  2704140655U,	// <7,3,3,2>: Cost 3 vsldoi8 <3,2,7,3>, <3,2,7,3>
+  1705429404U,	// <7,3,3,3>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+  2779171238U,	// <7,3,3,4>: Cost 3 vsldoi12 RHS, <3,3,4,4>
+  3852913070U,	// <7,3,3,5>: Cost 4 vsldoi12 RHS, <3,3,5,3>
+  2657973099U,	// <7,3,3,6>: Cost 3 vsldoi4 <6,7,3,3>, <6,7,3,3>
+  2767669700U,	// <7,3,3,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,3,7,7>
+  1705429404U,	// <7,3,3,u>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+  2779171280U,	// <7,3,4,0>: Cost 3 vsldoi12 RHS, <3,4,0,1>
+  2779171290U,	// <7,3,4,1>: Cost 3 vsldoi12 RHS, <3,4,1,2>
+  2634090504U,	// <7,3,4,2>: Cost 3 vsldoi4 <2,7,3,4>, <2,7,3,4>
+  2779171311U,	// <7,3,4,3>: Cost 3 vsldoi12 RHS, <3,4,3,5>
+  2779171319U,	// <7,3,4,4>: Cost 3 vsldoi12 RHS, <3,4,4,4>
+  1705429506U,	// <7,3,4,5>: Cost 2 vsldoi12 RHS, <3,4,5,6>
+  2722057593U,	// <7,3,4,6>: Cost 3 vsldoi8 <6,2,7,3>, <4,6,5,2>
+  2316093370U,	// <7,3,4,7>: Cost 3 vmrglw <5,6,7,4>, <2,6,3,7>
+  1705429533U,	// <7,3,4,u>: Cost 2 vsldoi12 RHS, <3,4,u,6>
+  3852913185U,	// <7,3,5,0>: Cost 4 vsldoi12 RHS, <3,5,0,1>
+  3795799695U,	// <7,3,5,1>: Cost 4 vsldoi8 <6,2,7,3>, <5,1,0,1>
+  3852913203U,	// <7,3,5,2>: Cost 4 vsldoi12 RHS, <3,5,2,1>
+  3852913214U,	// <7,3,5,3>: Cost 4 vsldoi12 RHS, <3,5,3,3>
+  3852913225U,	// <7,3,5,4>: Cost 4 vsldoi12 RHS, <3,5,4,5>
+  2779171410U,	// <7,3,5,5>: Cost 3 vsldoi12 RHS, <3,5,5,5>
+  2718740581U,	// <7,3,5,6>: Cost 3 vsldoi8 <5,6,7,3>, <5,6,7,3>
+  3841411685U,	// <7,3,5,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,5,7,6>
+  2720067847U,	// <7,3,5,u>: Cost 3 vsldoi8 <5,u,7,3>, <5,u,7,3>
+  2773420664U,	// <7,3,6,0>: Cost 3 vsldoi12 <3,6,0,7>, <3,6,0,7>
+  3847236225U,	// <7,3,6,1>: Cost 4 vsldoi12 <3,6,1,7>, <3,6,1,7>
+  1648316922U,	// <7,3,6,2>: Cost 2 vsldoi8 <6,2,7,3>, <6,2,7,3>
+  2773641875U,	// <7,3,6,3>: Cost 3 vsldoi12 <3,6,3,7>, <3,6,3,7>
+  2773715612U,	// <7,3,6,4>: Cost 3 vsldoi12 <3,6,4,7>, <3,6,4,7>
+  3847531173U,	// <7,3,6,5>: Cost 4 vsldoi12 <3,6,5,7>, <3,6,5,7>
+  2722059024U,	// <7,3,6,6>: Cost 3 vsldoi8 <6,2,7,3>, <6,6,2,2>
+  2767669943U,	// <7,3,6,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,6,7,7>
+  1652298720U,	// <7,3,6,u>: Cost 2 vsldoi8 <6,u,7,3>, <6,u,7,3>
+  2767669955U,	// <7,3,7,0>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,0,1>
+  3841411788U,	// <7,3,7,1>: Cost 4 vsldoi12 <2,6,3,7>, <3,7,1,1>
+  2767669978U,	// <7,3,7,2>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,2,6>
+  2722059546U,	// <7,3,7,3>: Cost 3 vsldoi8 <6,2,7,3>, <7,3,6,2>
+  2767669995U,	// <7,3,7,4>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,4,5>
+  3852913396U,	// <7,3,7,5>: Cost 4 vsldoi12 RHS, <3,7,5,5>
+  2722059758U,	// <7,3,7,6>: Cost 3 vsldoi8 <6,2,7,3>, <7,6,2,7>
+  2302183354U,	// <7,3,7,7>: Cost 3 vmrglw <3,3,7,7>, <2,6,3,7>
+  2767670027U,	// <7,3,7,u>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,u,1>
+  2774747930U,	// <7,3,u,0>: Cost 3 vsldoi12 <3,u,0,7>, <3,u,0,7>
+  1705429790U,	// <7,3,u,1>: Cost 2 vsldoi12 RHS, <3,u,1,2>
+  1660262316U,	// <7,3,u,2>: Cost 2 vsldoi8 <u,2,7,3>, <u,2,7,3>
+  1705429404U,	// <7,3,u,3>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+  2775042878U,	// <7,3,u,4>: Cost 3 vsldoi12 <3,u,4,7>, <3,u,4,7>
+  1705429830U,	// <7,3,u,5>: Cost 2 vsldoi12 RHS, <3,u,5,6>
+  2779171660U,	// <7,3,u,6>: Cost 3 vsldoi12 RHS, <3,u,6,3>
+  2767670101U,	// <7,3,u,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,u,7,3>
+  1705429853U,	// <7,3,u,u>: Cost 2 vsldoi12 RHS, <3,u,u,2>
+  2718744576U,	// <7,4,0,0>: Cost 3 vsldoi8 <5,6,7,4>, <0,0,0,0>
+  1645002854U,	// <7,4,0,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS
+  3852913527U,	// <7,4,0,2>: Cost 4 vsldoi12 RHS, <4,0,2,1>
+  3852913536U,	// <7,4,0,3>: Cost 4 vsldoi12 RHS, <4,0,3,1>
+  2316061904U,	// <7,4,0,4>: Cost 3 vmrglw <5,6,7,0>, <4,4,4,4>
+  1705429906U,	// <7,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1>
+  2658022257U,	// <7,4,0,6>: Cost 3 vsldoi4 <6,7,4,0>, <6,7,4,0>
+  2256489928U,	// <7,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0>
+  1707420589U,	// <7,4,0,u>: Cost 2 vsldoi12 RHS, <4,0,u,1>
+  3852913590U,	// <7,4,1,0>: Cost 4 vsldoi12 RHS, <4,1,0,1>
+  2718745396U,	// <7,4,1,1>: Cost 3 vsldoi8 <5,6,7,4>, <1,1,1,1>
+  2779171786U,	// <7,4,1,2>: Cost 3 vsldoi12 RHS, <4,1,2,3>
+  3852913616U,	// <7,4,1,3>: Cost 4 vsldoi12 RHS, <4,1,3,0>
+  3852913627U,	// <7,4,1,4>: Cost 4 vsldoi12 RHS, <4,1,4,2>
+  2779171810U,	// <7,4,1,5>: Cost 3 vsldoi12 RHS, <4,1,5,0>
+  3792487631U,	// <7,4,1,6>: Cost 4 vsldoi8 <5,6,7,4>, <1,6,1,7>
+  3394456220U,	// <7,4,1,7>: Cost 4 vmrglw <6,4,7,1>, <3,6,4,7>
+  2779171837U,	// <7,4,1,u>: Cost 3 vsldoi12 RHS, <4,1,u,0>
+  3852913673U,	// <7,4,2,0>: Cost 4 vsldoi12 RHS, <4,2,0,3>
+  3852913682U,	// <7,4,2,1>: Cost 4 vsldoi12 RHS, <4,2,1,3>
+  2718746216U,	// <7,4,2,2>: Cost 3 vsldoi8 <5,6,7,4>, <2,2,2,2>
+  2718746278U,	// <7,4,2,3>: Cost 3 vsldoi8 <5,6,7,4>, <2,3,0,1>
+  2779171885U,	// <7,4,2,4>: Cost 3 vsldoi12 RHS, <4,2,4,3>
+  2779171893U,	// <7,4,2,5>: Cost 3 vsldoi12 RHS, <4,2,5,2>
+  2718746554U,	// <7,4,2,6>: Cost 3 vsldoi8 <5,6,7,4>, <2,6,3,7>
+  3847457864U,	// <7,4,2,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,2,7,3>
+  2779171921U,	// <7,4,2,u>: Cost 3 vsldoi12 RHS, <4,2,u,3>
+  2718746774U,	// <7,4,3,0>: Cost 3 vsldoi8 <5,6,7,4>, <3,0,1,2>
+  3852913762U,	// <7,4,3,1>: Cost 4 vsldoi12 RHS, <4,3,1,2>
+  3852913772U,	// <7,4,3,2>: Cost 4 vsldoi12 RHS, <4,3,2,3>
+  2718747036U,	// <7,4,3,3>: Cost 3 vsldoi8 <5,6,7,4>, <3,3,3,3>
+  2718747138U,	// <7,4,3,4>: Cost 3 vsldoi8 <5,6,7,4>, <3,4,5,6>
+  2779171972U,	// <7,4,3,5>: Cost 3 vsldoi12 RHS, <4,3,5,0>
+  2706803380U,	// <7,4,3,6>: Cost 3 vsldoi8 <3,6,7,4>, <3,6,7,4>
+  3847457946U,	// <7,4,3,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,3,7,4>
+  2781162655U,	// <7,4,3,u>: Cost 3 vsldoi12 RHS, <4,3,u,0>
+  2718747538U,	// <7,4,4,0>: Cost 3 vsldoi8 <5,6,7,4>, <4,0,5,1>
+  3852913842U,	// <7,4,4,1>: Cost 4 vsldoi12 RHS, <4,4,1,1>
+  3852913852U,	// <7,4,4,2>: Cost 4 vsldoi12 RHS, <4,4,2,2>
+  2316096696U,	// <7,4,4,3>: Cost 3 vmrglw <5,6,7,4>, <7,2,4,3>
+  1705430224U,	// <7,4,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4>
+  1705430234U,	// <7,4,4,5>: Cost 2 vsldoi12 RHS, <4,4,5,5>
+  2658055029U,	// <7,4,4,6>: Cost 3 vsldoi4 <6,7,4,4>, <6,7,4,4>
+  2316097024U,	// <7,4,4,7>: Cost 3 vmrglw <5,6,7,4>, <7,6,4,7>
+  1707420917U,	// <7,4,4,u>: Cost 2 vsldoi12 RHS, <4,4,u,5>
+  1584316518U,	// <7,4,5,0>: Cost 2 vsldoi4 <6,7,4,5>, LHS
+  2658059060U,	// <7,4,5,1>: Cost 3 vsldoi4 <6,7,4,5>, <1,1,1,1>
+  2640144314U,	// <7,4,5,2>: Cost 3 vsldoi4 <3,7,4,5>, <2,6,3,7>
+  2640145131U,	// <7,4,5,3>: Cost 3 vsldoi4 <3,7,4,5>, <3,7,4,5>
+  1584319798U,	// <7,4,5,4>: Cost 2 vsldoi4 <6,7,4,5>, RHS
+  2779172134U,	// <7,4,5,5>: Cost 3 vsldoi12 RHS, <4,5,5,0>
+  631688502U,	// <7,4,5,6>: Cost 1 vsldoi12 RHS, RHS
+  2658063354U,	// <7,4,5,7>: Cost 3 vsldoi4 <6,7,4,5>, <7,0,1,2>
+  631688520U,	// <7,4,5,u>: Cost 1 vsldoi12 RHS, RHS
+  3852914001U,	// <7,4,6,0>: Cost 4 vsldoi12 RHS, <4,6,0,7>
+  3852914010U,	// <7,4,6,1>: Cost 4 vsldoi12 RHS, <4,6,1,7>
+  2718749178U,	// <7,4,6,2>: Cost 3 vsldoi8 <5,6,7,4>, <6,2,7,3>
+  2722730572U,	// <7,4,6,3>: Cost 3 vsldoi8 <6,3,7,4>, <6,3,7,4>
+  2723394205U,	// <7,4,6,4>: Cost 3 vsldoi8 <6,4,7,4>, <6,4,7,4>
+  2779172221U,	// <7,4,6,5>: Cost 3 vsldoi12 RHS, <4,6,5,6>
+  2718749496U,	// <7,4,6,6>: Cost 3 vsldoi8 <5,6,7,4>, <6,6,6,6>
+  2718749518U,	// <7,4,6,7>: Cost 3 vsldoi8 <5,6,7,4>, <6,7,0,1>
+  2779172249U,	// <7,4,6,u>: Cost 3 vsldoi12 RHS, <4,6,u,7>
+  2718749690U,	// <7,4,7,0>: Cost 3 vsldoi8 <5,6,7,4>, <7,0,1,2>
+  3847458214U,	// <7,4,7,1>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,1,2>
+  2718749880U,	// <7,4,7,2>: Cost 3 vsldoi8 <5,6,7,4>, <7,2,4,3>
+  3847458236U,	// <7,4,7,3>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,3,6>
+  2718750004U,	// <7,4,7,4>: Cost 3 vsldoi8 <5,6,7,4>, <7,4,0,1>
+  1187876150U,	// <7,4,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS
+  2718750208U,	// <7,4,7,6>: Cost 3 vsldoi8 <5,6,7,4>, <7,6,4,7>
+  2718750286U,	// <7,4,7,7>: Cost 3 vsldoi8 <5,6,7,4>, <7,7,4,4>
+  1187876393U,	// <7,4,7,u>: Cost 2 vmrghw <7,7,7,7>, RHS
+  1584341094U,	// <7,4,u,0>: Cost 2 vsldoi4 <6,7,4,u>, LHS
+  1645008686U,	// <7,4,u,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS
+  2640168890U,	// <7,4,u,2>: Cost 3 vsldoi4 <3,7,4,u>, <2,6,3,7>
+  2640169710U,	// <7,4,u,3>: Cost 3 vsldoi4 <3,7,4,u>, <3,7,4,u>
+  1584344374U,	// <7,4,u,4>: Cost 2 vsldoi4 <6,7,4,u>, RHS
+  1705430554U,	// <7,4,u,5>: Cost 2 vsldoi12 RHS, <4,u,5,1>
+  631688745U,	// <7,4,u,6>: Cost 1 vsldoi12 RHS, RHS
+  2718750976U,	// <7,4,u,7>: Cost 3 vsldoi8 <5,6,7,4>, <u,7,0,1>
+  631688763U,	// <7,4,u,u>: Cost 1 vsldoi12 RHS, RHS
+  2646147174U,	// <7,5,0,0>: Cost 3 vsldoi4 <4,7,5,0>, LHS
+  2779172424U,	// <7,5,0,1>: Cost 3 vsldoi12 RHS, <5,0,1,2>
+  3852914258U,	// <7,5,0,2>: Cost 4 vsldoi12 RHS, <5,0,2,3>
+  3852914268U,	// <7,5,0,3>: Cost 4 vsldoi12 RHS, <5,0,3,4>
+  2779172450U,	// <7,5,0,4>: Cost 3 vsldoi12 RHS, <5,0,4,1>
+  2316061914U,	// <7,5,0,5>: Cost 3 vmrglw <5,6,7,0>, <4,4,5,5>
+  2316061186U,	// <7,5,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,5,6>
+  2646152186U,	// <7,5,0,7>: Cost 3 vsldoi4 <4,7,5,0>, <7,0,1,2>
+  2779172486U,	// <7,5,0,u>: Cost 3 vsldoi12 RHS, <5,0,u,1>
+  2781163151U,	// <7,5,1,0>: Cost 3 vsldoi12 RHS, <5,1,0,1>
+  2321378194U,	// <7,5,1,1>: Cost 3 vmrglw <6,5,7,1>, <4,0,5,1>
+  3852914339U,	// <7,5,1,2>: Cost 4 vsldoi12 RHS, <5,1,2,3>
+  3852914350U,	// <7,5,1,3>: Cost 4 vsldoi12 RHS, <5,1,3,5>
+  2781163191U,	// <7,5,1,4>: Cost 3 vsldoi12 RHS, <5,1,4,5>
+  3852914363U,	// <7,5,1,5>: Cost 4 vsldoi12 RHS, <5,1,5,0>
+  3835588297U,	// <7,5,1,6>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,6,5>
+  3835588306U,	// <7,5,1,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,7,5>
+  2781163223U,	// <7,5,1,u>: Cost 3 vsldoi12 RHS, <5,1,u,1>
+  3852914400U,	// <7,5,2,0>: Cost 4 vsldoi12 RHS, <5,2,0,1>
+  2781163243U,	// <7,5,2,1>: Cost 3 vsldoi12 RHS, <5,2,1,3>
+  3852914419U,	// <7,5,2,2>: Cost 4 vsldoi12 RHS, <5,2,2,2>
+  2779172606U,	// <7,5,2,3>: Cost 3 vsldoi12 RHS, <5,2,3,4>
+  3780552497U,	// <7,5,2,4>: Cost 4 vsldoi8 <3,6,7,5>, <2,4,6,5>
+  2781163279U,	// <7,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3>
+  2779172632U,	// <7,5,2,6>: Cost 3 vsldoi12 RHS, <5,2,6,3>
+  3835588385U,	// <7,5,2,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,2,7,3>
+  2779172650U,	// <7,5,2,u>: Cost 3 vsldoi12 RHS, <5,2,u,3>
+  3852914481U,	// <7,5,3,0>: Cost 4 vsldoi12 RHS, <5,3,0,1>
+  2319403922U,	// <7,5,3,1>: Cost 3 vmrglw <6,2,7,3>, <4,0,5,1>
+  2319404409U,	// <7,5,3,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2>
+  3852914510U,	// <7,5,3,3>: Cost 4 vsldoi12 RHS, <5,3,3,3>
+  3779226131U,	// <7,5,3,4>: Cost 4 vsldoi8 <3,4,7,5>, <3,4,7,5>
+  2319404250U,	// <7,5,3,5>: Cost 3 vmrglw <6,2,7,3>, <4,4,5,5>
+  2319403522U,	// <7,5,3,6>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,6>
+  3852914547U,	// <7,5,3,7>: Cost 4 vsldoi12 RHS, <5,3,7,4>
+  2319403524U,	// <7,5,3,u>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,u>
+  2646179942U,	// <7,5,4,0>: Cost 3 vsldoi4 <4,7,5,4>, LHS
+  2316094354U,	// <7,5,4,1>: Cost 3 vmrglw <5,6,7,4>, <4,0,5,1>
+  3852914582U,	// <7,5,4,2>: Cost 4 vsldoi12 RHS, <5,4,2,3>
+  3852914592U,	// <7,5,4,3>: Cost 4 vsldoi12 RHS, <5,4,3,4>
+  2646183372U,	// <7,5,4,4>: Cost 3 vsldoi4 <4,7,5,4>, <4,7,5,4>
+  2779172788U,	// <7,5,4,5>: Cost 3 vsldoi12 RHS, <5,4,5,6>
+  2316093954U,	// <7,5,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,5,6>
+  2646185318U,	// <7,5,4,7>: Cost 3 vsldoi4 <4,7,5,4>, <7,4,5,6>
+  2779172815U,	// <7,5,4,u>: Cost 3 vsldoi12 RHS, <5,4,u,6>
+  2781163475U,	// <7,5,5,0>: Cost 3 vsldoi12 RHS, <5,5,0,1>
+  2781163484U,	// <7,5,5,1>: Cost 3 vsldoi12 RHS, <5,5,1,1>
+  3852914662U,	// <7,5,5,2>: Cost 4 vsldoi12 RHS, <5,5,2,2>
+  3852914672U,	// <7,5,5,3>: Cost 4 vsldoi12 RHS, <5,5,3,3>
+  2781163515U,	// <7,5,5,4>: Cost 3 vsldoi12 RHS, <5,5,4,5>
+  1705431044U,	// <7,5,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  2779172878U,	// <7,5,5,6>: Cost 3 vsldoi12 RHS, <5,5,6,6>
+  3835588632U,	// <7,5,5,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,5,7,7>
+  1705431044U,	// <7,5,5,u>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  2779172900U,	// <7,5,6,0>: Cost 3 vsldoi12 RHS, <5,6,0,1>
+  2781163571U,	// <7,5,6,1>: Cost 3 vsldoi12 RHS, <5,6,1,7>
+  3852914743U,	// <7,5,6,2>: Cost 4 vsldoi12 RHS, <5,6,2,2>
+  2779172930U,	// <7,5,6,3>: Cost 3 vsldoi12 RHS, <5,6,3,4>
+  2779172940U,	// <7,5,6,4>: Cost 3 vsldoi12 RHS, <5,6,4,5>
+  2781163607U,	// <7,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7>
+  2779172960U,	// <7,5,6,6>: Cost 3 vsldoi12 RHS, <5,6,6,7>
+  1705431138U,	// <7,5,6,7>: Cost 2 vsldoi12 RHS, <5,6,7,0>
+  1705578603U,	// <7,5,6,u>: Cost 2 vsldoi12 RHS, <5,6,u,0>
+  2646204518U,	// <7,5,7,0>: Cost 3 vsldoi4 <4,7,5,7>, LHS
+  2322090898U,	// <7,5,7,1>: Cost 3 vmrglw <6,6,7,7>, <4,0,5,1>
+  3719947880U,	// <7,5,7,2>: Cost 4 vsldoi4 <4,7,5,7>, <2,2,2,2>
+  3719948438U,	// <7,5,7,3>: Cost 4 vsldoi4 <4,7,5,7>, <3,0,1,2>
+  2646207951U,	// <7,5,7,4>: Cost 3 vsldoi4 <4,7,5,7>, <4,7,5,7>
+  2322091226U,	// <7,5,7,5>: Cost 3 vmrglw <6,6,7,7>, <4,4,5,5>
+  2322090498U,	// <7,5,7,6>: Cost 3 vmrglw <6,6,7,7>, <3,4,5,6>
+  2646210156U,	// <7,5,7,7>: Cost 3 vsldoi4 <4,7,5,7>, <7,7,7,7>
+  2646210350U,	// <7,5,7,u>: Cost 3 vsldoi4 <4,7,5,7>, LHS
+  2779173062U,	// <7,5,u,0>: Cost 3 vsldoi12 RHS, <5,u,0,1>
+  2779173072U,	// <7,5,u,1>: Cost 3 vsldoi12 RHS, <5,u,1,2>
+  2319404409U,	// <7,5,u,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2>
+  2779173092U,	// <7,5,u,3>: Cost 3 vsldoi12 RHS, <5,u,3,4>
+  2779173101U,	// <7,5,u,4>: Cost 3 vsldoi12 RHS, <5,u,4,4>
+  1705431044U,	// <7,5,u,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  2779173118U,	// <7,5,u,6>: Cost 3 vsldoi12 RHS, <5,u,6,3>
+  1705578756U,	// <7,5,u,7>: Cost 2 vsldoi12 RHS, <5,u,7,0>
+  1707421965U,	// <7,5,u,u>: Cost 2 vsldoi12 RHS, <5,u,u,0>
+  3852914966U,	// <7,6,0,0>: Cost 4 vsldoi12 RHS, <6,0,0,0>
+  2779173153U,	// <7,6,0,1>: Cost 3 vsldoi12 RHS, <6,0,1,2>
+  2256491002U,	// <7,6,0,2>: Cost 3 vmrghw <7,0,1,2>, <6,2,7,3>
+  3852914994U,	// <7,6,0,3>: Cost 4 vsldoi12 RHS, <6,0,3,1>
+  3852915003U,	// <7,6,0,4>: Cost 4 vsldoi12 RHS, <6,0,4,1>
+  2316062652U,	// <7,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5>
+  2316063544U,	// <7,6,0,6>: Cost 3 vmrglw <5,6,7,0>, <6,6,6,6>
+  1242320182U,	// <7,6,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS
+  1242320183U,	// <7,6,0,u>: Cost 2 vmrglw <5,6,7,0>, RHS
+  3852915048U,	// <7,6,1,0>: Cost 4 vsldoi12 RHS, <6,1,0,1>
+  3377866217U,	// <7,6,1,1>: Cost 4 vmrglw <3,6,7,1>, <2,0,6,1>
+  3852915068U,	// <7,6,1,2>: Cost 4 vsldoi12 RHS, <6,1,2,3>
+  3833672072U,	// <7,6,1,3>: Cost 5 vsldoi12 <1,3,6,7>, <6,1,3,6>
+  3852915088U,	// <7,6,1,4>: Cost 4 vsldoi12 RHS, <6,1,4,5>
+  3395122056U,	// <7,6,1,5>: Cost 4 vmrglw <6,5,7,1>, <6,7,6,5>
+  3389813560U,	// <7,6,1,6>: Cost 4 vmrglw <5,6,7,1>, <6,6,6,6>
+  2779173287U,	// <7,6,1,7>: Cost 3 vsldoi12 RHS, <6,1,7,1>
+  2779320752U,	// <7,6,1,u>: Cost 3 vsldoi12 RHS, <6,1,u,1>
+  2658181222U,	// <7,6,2,0>: Cost 3 vsldoi4 <6,7,6,2>, LHS
+  3852915140U,	// <7,6,2,1>: Cost 4 vsldoi12 RHS, <6,2,1,3>
+  2257973754U,	// <7,6,2,2>: Cost 3 vmrghw <7,2,3,3>, <6,2,7,3>
+  3841413589U,	// <7,6,2,3>: Cost 4 vsldoi12 <2,6,3,7>, <6,2,3,2>
+  2658184502U,	// <7,6,2,4>: Cost 3 vsldoi4 <6,7,6,2>, RHS
+  3852915176U,	// <7,6,2,5>: Cost 4 vsldoi12 RHS, <6,2,5,3>
+  2658186117U,	// <7,6,2,6>: Cost 3 vsldoi4 <6,7,6,2>, <6,7,6,2>
+  1705431546U,	// <7,6,2,7>: Cost 2 vsldoi12 RHS, <6,2,7,3>
+  1705579011U,	// <7,6,2,u>: Cost 2 vsldoi12 RHS, <6,2,u,3>
+  3714015334U,	// <7,6,3,0>: Cost 4 vsldoi4 <3,7,6,3>, LHS
+  3777243425U,	// <7,6,3,1>: Cost 4 vsldoi8 <3,1,7,6>, <3,1,7,6>
+  2319405957U,	// <7,6,3,2>: Cost 3 vmrglw <6,2,7,3>, <6,7,6,2>
+  3375229286U,	// <7,6,3,3>: Cost 4 vmrglw <3,2,7,3>, <3,2,6,3>
+  2779173426U,	// <7,6,3,4>: Cost 3 vsldoi12 RHS, <6,3,4,5>
+  3375228721U,	// <7,6,3,5>: Cost 4 vmrglw <3,2,7,3>, <2,4,6,5>
+  2319405880U,	// <7,6,3,6>: Cost 3 vmrglw <6,2,7,3>, <6,6,6,6>
+  1245662518U,	// <7,6,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS
+  1245662519U,	// <7,6,3,u>: Cost 2 vmrglw <6,2,7,3>, RHS
+  3852915291U,	// <7,6,4,0>: Cost 4 vsldoi12 RHS, <6,4,0,1>
+  3389834729U,	// <7,6,4,1>: Cost 4 vmrglw <5,6,7,4>, <2,0,6,1>
+  2259472890U,	// <7,6,4,2>: Cost 3 vmrghw <7,4,5,6>, <6,2,7,3>
+  3852915321U,	// <7,6,4,3>: Cost 4 vsldoi12 RHS, <6,4,3,4>
+  3852915330U,	// <7,6,4,4>: Cost 4 vsldoi12 RHS, <6,4,4,4>
+  2779173517U,	// <7,6,4,5>: Cost 3 vsldoi12 RHS, <6,4,5,6>
+  2316096312U,	// <7,6,4,6>: Cost 3 vmrglw <5,6,7,4>, <6,6,6,6>
+  1242352950U,	// <7,6,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS
+  1242352951U,	// <7,6,4,u>: Cost 2 vmrglw <5,6,7,4>, RHS
+  3852915372U,	// <7,6,5,0>: Cost 4 vsldoi12 RHS, <6,5,0,1>
+  3835294392U,	// <7,6,5,1>: Cost 5 vsldoi12 <1,6,1,7>, <6,5,1,4>
+  3852915395U,	// <7,6,5,2>: Cost 4 vsldoi12 RHS, <6,5,2,6>
+  3852915404U,	// <7,6,5,3>: Cost 4 vsldoi12 RHS, <6,5,3,6>
+  3852915412U,	// <7,6,5,4>: Cost 4 vsldoi12 RHS, <6,5,4,5>
+  3377899313U,	// <7,6,5,5>: Cost 4 vmrglw <3,6,7,5>, <2,4,6,5>
+  2718765160U,	// <7,6,5,6>: Cost 3 vsldoi8 <5,6,7,6>, <5,6,7,6>
+  2779173611U,	// <7,6,5,7>: Cost 3 vsldoi12 RHS, <6,5,7,1>
+  2779321076U,	// <7,6,5,u>: Cost 3 vsldoi12 RHS, <6,5,u,1>
+  2658213990U,	// <7,6,6,0>: Cost 3 vsldoi4 <6,7,6,6>, LHS
+  3852915462U,	// <7,6,6,1>: Cost 4 vsldoi12 RHS, <6,6,1,1>
+  2718765562U,	// <7,6,6,2>: Cost 3 vsldoi8 <5,6,7,6>, <6,2,7,3>
+  3714042622U,	// <7,6,6,3>: Cost 4 vsldoi4 <3,7,6,6>, <3,7,6,6>
+  2658217270U,	// <7,6,6,4>: Cost 3 vsldoi4 <6,7,6,6>, RHS
+  2724074224U,	// <7,6,6,5>: Cost 3 vsldoi8 <6,5,7,6>, <6,5,7,6>
+  1705431864U,	// <7,6,6,6>: Cost 2 vsldoi12 RHS, <6,6,6,6>
+  1705431874U,	// <7,6,6,7>: Cost 2 vsldoi12 RHS, <6,6,7,7>
+  1705579339U,	// <7,6,6,u>: Cost 2 vsldoi12 RHS, <6,6,u,7>
+  1705431886U,	// <7,6,7,0>: Cost 2 vsldoi12 RHS, <6,7,0,1>
+  2779173719U,	// <7,6,7,1>: Cost 3 vsldoi12 RHS, <6,7,1,1>
+  2779173729U,	// <7,6,7,2>: Cost 3 vsldoi12 RHS, <6,7,2,2>
+  2779173736U,	// <7,6,7,3>: Cost 3 vsldoi12 RHS, <6,7,3,0>
+  1705431926U,	// <7,6,7,4>: Cost 2 vsldoi12 RHS, <6,7,4,5>
+  2779173759U,	// <7,6,7,5>: Cost 3 vsldoi12 RHS, <6,7,5,5>
+  2779173765U,	// <7,6,7,6>: Cost 3 vsldoi12 RHS, <6,7,6,2>
+  1248349494U,	// <7,6,7,7>: Cost 2 vmrglw <6,6,7,7>, RHS
+  1705431958U,	// <7,6,7,u>: Cost 2 vsldoi12 RHS, <6,7,u,1>
+  1705579423U,	// <7,6,u,0>: Cost 2 vsldoi12 RHS, <6,u,0,1>
+  2779173801U,	// <7,6,u,1>: Cost 3 vsldoi12 RHS, <6,u,1,2>
+  2779321266U,	// <7,6,u,2>: Cost 3 vsldoi12 RHS, <6,u,2,2>
+  2779321273U,	// <7,6,u,3>: Cost 3 vsldoi12 RHS, <6,u,3,0>
+  1705579463U,	// <7,6,u,4>: Cost 2 vsldoi12 RHS, <6,u,4,5>
+  2779173841U,	// <7,6,u,5>: Cost 3 vsldoi12 RHS, <6,u,5,6>
+  1705431864U,	// <7,6,u,6>: Cost 2 vsldoi12 RHS, <6,6,6,6>
+  1705432032U,	// <7,6,u,7>: Cost 2 vsldoi12 RHS, <6,u,7,3>
+  1705579495U,	// <7,6,u,u>: Cost 2 vsldoi12 RHS, <6,u,u,1>
+  1242320994U,	// <7,7,0,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0>
+  1705432058U,	// <7,7,0,1>: Cost 2 vsldoi12 RHS, <7,0,1,2>
+  3841414146U,	// <7,7,0,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,0,2,1>
+  2316063226U,	// <7,7,0,3>: Cost 3 vmrglw <5,6,7,0>, <6,2,7,3>
+  2779173908U,	// <7,7,0,4>: Cost 3 vsldoi12 RHS, <7,0,4,1>
+  2658242658U,	// <7,7,0,5>: Cost 3 vsldoi4 <6,7,7,0>, <5,6,7,0>
+  2658243468U,	// <7,7,0,6>: Cost 3 vsldoi4 <6,7,7,0>, <6,7,7,0>
+  2316063554U,	// <7,7,0,7>: Cost 3 vmrglw <5,6,7,0>, <6,6,7,7>
+  1705432121U,	// <7,7,0,u>: Cost 2 vsldoi12 RHS, <7,0,u,2>
+  3852915777U,	// <7,7,1,0>: Cost 4 vsldoi12 RHS, <7,1,0,1>
+  2779173962U,	// <7,7,1,1>: Cost 3 vsldoi12 RHS, <7,1,1,1>
+  2779173973U,	// <7,7,1,2>: Cost 3 vsldoi12 RHS, <7,1,2,3>
+  3389813242U,	// <7,7,1,3>: Cost 4 vmrglw <5,6,7,1>, <6,2,7,3>
+  3852915813U,	// <7,7,1,4>: Cost 4 vsldoi12 RHS, <7,1,4,1>
+  3852915821U,	// <7,7,1,5>: Cost 4 vsldoi12 RHS, <7,1,5,0>
+  3835294839U,	// <7,7,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <7,1,6,1>
+  2329343596U,	// <7,7,1,7>: Cost 3 vmrglw <7,u,7,1>, <7,7,7,7>
+  2779174027U,	// <7,7,1,u>: Cost 3 vsldoi12 RHS, <7,1,u,3>
+  2803061908U,	// <7,7,2,0>: Cost 3 vsldoi12 RHS, <7,2,0,3>
+  3852915869U,	// <7,7,2,1>: Cost 4 vsldoi12 RHS, <7,2,1,3>
+  2779174053U,	// <7,7,2,2>: Cost 3 vsldoi12 RHS, <7,2,2,2>
+  2779174060U,	// <7,7,2,3>: Cost 3 vsldoi12 RHS, <7,2,3,0>
+  2803061944U,	// <7,7,2,4>: Cost 3 vsldoi12 RHS, <7,2,4,3>
+  3852915905U,	// <7,7,2,5>: Cost 4 vsldoi12 RHS, <7,2,5,3>
+  2767672522U,	// <7,7,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <7,2,6,3>
+  2791855315U,	// <7,7,2,7>: Cost 3 vsldoi12 <6,6,7,7>, <7,2,7,3>
+  2768999644U,	// <7,7,2,u>: Cost 3 vsldoi12 <2,u,3,7>, <7,2,u,3>
+  2779174115U,	// <7,7,3,0>: Cost 3 vsldoi12 RHS, <7,3,0,1>
+  3852915948U,	// <7,7,3,1>: Cost 4 vsldoi12 RHS, <7,3,1,1>
+  3841414394U,	// <7,7,3,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,3,2,6>
+  1245663738U,	// <7,7,3,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+  2779174155U,	// <7,7,3,4>: Cost 3 vsldoi12 RHS, <7,3,4,5>
+  3852915988U,	// <7,7,3,5>: Cost 4 vsldoi12 RHS, <7,3,5,5>
+  2706827959U,	// <7,7,3,6>: Cost 3 vsldoi8 <3,6,7,7>, <3,6,7,7>
+  2319405890U,	// <7,7,3,7>: Cost 3 vmrglw <6,2,7,3>, <6,6,7,7>
+  1245663738U,	// <7,7,3,u>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+  2779174200U,	// <7,7,4,0>: Cost 3 vsldoi12 RHS, <7,4,0,5>
+  3852916030U,	// <7,7,4,1>: Cost 4 vsldoi12 RHS, <7,4,1,2>
+  3714099130U,	// <7,7,4,2>: Cost 4 vsldoi4 <3,7,7,4>, <2,6,3,7>
+  2316095994U,	// <7,7,4,3>: Cost 3 vmrglw <5,6,7,4>, <6,2,7,3>
+  1242353766U,	// <7,7,4,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4>
+  1705432422U,	// <7,7,4,5>: Cost 2 vsldoi12 RHS, <7,4,5,6>
+  2658276240U,	// <7,7,4,6>: Cost 3 vsldoi4 <6,7,7,4>, <6,7,7,4>
+  2316096322U,	// <7,7,4,7>: Cost 3 vmrglw <5,6,7,4>, <6,6,7,7>
+  1705432449U,	// <7,7,4,u>: Cost 2 vsldoi12 RHS, <7,4,u,6>
+  3852916101U,	// <7,7,5,0>: Cost 4 vsldoi12 RHS, <7,5,0,1>
+  3854906765U,	// <7,7,5,1>: Cost 4 vsldoi12 RHS, <7,5,1,0>
+  3852916121U,	// <7,7,5,2>: Cost 4 vsldoi12 RHS, <7,5,2,3>
+  3389846010U,	// <7,7,5,3>: Cost 4 vmrglw <5,6,7,5>, <6,2,7,3>
+  3852916141U,	// <7,7,5,4>: Cost 4 vsldoi12 RHS, <7,5,4,5>
+  2779174326U,	// <7,7,5,5>: Cost 3 vsldoi12 RHS, <7,5,5,5>
+  2779174337U,	// <7,7,5,6>: Cost 3 vsldoi12 RHS, <7,5,6,7>
+  2329376364U,	// <7,7,5,7>: Cost 3 vmrglw <7,u,7,5>, <7,7,7,7>
+  2779321811U,	// <7,7,5,u>: Cost 3 vsldoi12 RHS, <7,5,u,7>
+  2658287718U,	// <7,7,6,0>: Cost 3 vsldoi4 <6,7,7,6>, LHS
+  3852916197U,	// <7,7,6,1>: Cost 4 vsldoi12 RHS, <7,6,1,7>
+  2779174382U,	// <7,7,6,2>: Cost 3 vsldoi12 RHS, <7,6,2,7>
+  2316112378U,	// <7,7,6,3>: Cost 3 vmrglw <5,6,7,6>, <6,2,7,3>
+  2658290998U,	// <7,7,6,4>: Cost 3 vsldoi4 <6,7,7,6>, RHS
+  3852916233U,	// <7,7,6,5>: Cost 4 vsldoi12 RHS, <7,6,5,7>
+  1651004226U,	// <7,7,6,6>: Cost 2 vsldoi8 <6,6,7,7>, <6,6,7,7>
+  2779174420U,	// <7,7,6,7>: Cost 3 vsldoi12 RHS, <7,6,7,0>
+  1652331492U,	// <7,7,6,u>: Cost 2 vsldoi8 <6,u,7,7>, <6,u,7,7>
+  1590526054U,	// <7,7,7,0>: Cost 2 vsldoi4 <7,7,7,7>, LHS
+  2328728623U,	// <7,7,7,1>: Cost 3 vmrglw <7,7,7,7>, <7,0,7,1>
+  2724746451U,	// <7,7,7,2>: Cost 3 vsldoi8 <6,6,7,7>, <7,2,7,3>
+  2322092538U,	// <7,7,7,3>: Cost 3 vmrglw <6,6,7,7>, <6,2,7,3>
+  1590529334U,	// <7,7,7,4>: Cost 2 vsldoi4 <7,7,7,7>, RHS
+  2328728951U,	// <7,7,7,5>: Cost 3 vmrglw <7,7,7,7>, <7,4,7,5>
+  2724746770U,	// <7,7,7,6>: Cost 3 vsldoi8 <6,6,7,7>, <7,6,6,7>
+  430361910U,	// <7,7,7,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <7,7,7,u>: Cost 1 vspltisw3 RHS
+  1242320994U,	// <7,7,u,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0>
+  1705580162U,	// <7,7,u,1>: Cost 2 vsldoi12 RHS, <7,u,1,2>
+  2779321996U,	// <7,7,u,2>: Cost 3 vsldoi12 RHS, <7,u,2,3>
+  1245663738U,	// <7,7,u,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+  1242353766U,	// <7,7,u,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4>
+  1705580202U,	// <7,7,u,5>: Cost 2 vsldoi12 RHS, <7,u,5,6>
+  1662949620U,	// <7,7,u,6>: Cost 2 vsldoi8 <u,6,7,7>, <u,6,7,7>
+  430361910U,	// <7,7,u,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <7,7,u,u>: Cost 1 vspltisw3 RHS
+  1705426944U,	// <7,u,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0>
+  1705432787U,	// <7,u,0,1>: Cost 2 vsldoi12 RHS, <u,0,1,2>
+  2316060885U,	// <7,u,0,2>: Cost 3 vmrglw <5,6,7,0>, <3,0,u,2>
+  1242316956U,	// <7,u,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS
+  2779174637U,	// <7,u,0,4>: Cost 3 vsldoi12 RHS, <u,0,4,1>
+  1182750874U,	// <7,u,0,5>: Cost 2 vmrghw <7,0,1,2>, RHS
+  2316061213U,	// <7,u,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,u,6>
+  1242320200U,	// <7,u,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS
+  1705432850U,	// <7,u,0,u>: Cost 2 vsldoi12 RHS, <u,0,u,2>
+  1584578662U,	// <7,u,1,0>: Cost 2 vsldoi4 <6,7,u,1>, LHS
+  1705427764U,	// <7,u,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  631691054U,	// <7,u,1,2>: Cost 1 vsldoi12 RHS, LHS
+  2640407307U,	// <7,u,1,3>: Cost 3 vsldoi4 <3,7,u,1>, <3,7,u,1>
+  1584581942U,	// <7,u,1,4>: Cost 2 vsldoi4 <6,7,u,1>, RHS
+  2779174726U,	// <7,u,1,5>: Cost 3 vsldoi12 RHS, <u,1,5,0>
+  1584583574U,	// <7,u,1,6>: Cost 2 vsldoi4 <6,7,u,1>, <6,7,u,1>
+  2779322201U,	// <7,u,1,7>: Cost 3 vsldoi12 RHS, <u,1,7,1>
+  631691108U,	// <7,u,1,u>: Cost 1 vsldoi12 RHS, LHS
+  2779174763U,	// <7,u,2,0>: Cost 3 vsldoi12 RHS, <u,2,0,1>
+  2779174774U,	// <7,u,2,1>: Cost 3 vsldoi12 RHS, <u,2,1,3>
+  1705428584U,	// <7,u,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+  1705432965U,	// <7,u,2,3>: Cost 2 vsldoi12 RHS, <u,2,3,0>
+  2779174801U,	// <7,u,2,4>: Cost 3 vsldoi12 RHS, <u,2,4,3>
+  2779174810U,	// <7,u,2,5>: Cost 3 vsldoi12 RHS, <u,2,5,3>
+  2767673251U,	// <7,u,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <u,2,6,3>
+  1705580460U,	// <7,u,2,7>: Cost 2 vsldoi12 RHS, <u,2,7,3>
+  1705433010U,	// <7,u,2,u>: Cost 2 vsldoi12 RHS, <u,2,u,0>
+  1705433020U,	// <7,u,3,0>: Cost 2 vsldoi12 RHS, <u,3,0,1>
+  2779174853U,	// <7,u,3,1>: Cost 3 vsldoi12 RHS, <u,3,1,1>
+  2767673299U,	// <7,u,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <u,3,2,6>
+  1245659292U,	// <7,u,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS
+  1705433060U,	// <7,u,3,4>: Cost 2 vsldoi12 RHS, <u,3,4,5>
+  2779174893U,	// <7,u,3,5>: Cost 3 vsldoi12 RHS, <u,3,5,5>
+  2706836152U,	// <7,u,3,6>: Cost 3 vsldoi8 <3,6,7,u>, <3,6,7,u>
+  1245662536U,	// <7,u,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS
+  1705433092U,	// <7,u,3,u>: Cost 2 vsldoi12 RHS, <u,3,u,1>
+  2779174925U,	// <7,u,4,0>: Cost 3 vsldoi12 RHS, <u,4,0,1>
+  1185732398U,	// <7,u,4,1>: Cost 2 vmrghw <7,4,5,6>, LHS
+  2316093653U,	// <7,u,4,2>: Cost 3 vmrglw <5,6,7,4>, <3,0,u,2>
+  1242349724U,	// <7,u,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS
+  1705430224U,	// <7,u,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4>
+  1705433151U,	// <7,u,4,5>: Cost 2 vsldoi12 RHS, <u,4,5,6>
+  2316093981U,	// <7,u,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,u,6>
+  1242352968U,	// <7,u,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS
+  1705433178U,	// <7,u,4,u>: Cost 2 vsldoi12 RHS, <u,4,u,6>
+  1584611430U,	// <7,u,5,0>: Cost 2 vsldoi4 <6,7,u,5>, LHS
+  2781165670U,	// <7,u,5,1>: Cost 3 vsldoi12 RHS, <u,5,1,0>
+  2640439226U,	// <7,u,5,2>: Cost 3 vsldoi4 <3,7,u,5>, <2,6,3,7>
+  2640440079U,	// <7,u,5,3>: Cost 3 vsldoi4 <3,7,u,5>, <3,7,u,5>
+  1584614710U,	// <7,u,5,4>: Cost 2 vsldoi4 <6,7,u,5>, RHS
+  1705431044U,	// <7,u,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  631691418U,	// <7,u,5,6>: Cost 1 vsldoi12 RHS, RHS
+  2779322525U,	// <7,u,5,7>: Cost 3 vsldoi12 RHS, <u,5,7,1>
+  631691436U,	// <7,u,5,u>: Cost 1 vsldoi12 RHS, RHS
+  2779175087U,	// <7,u,6,0>: Cost 3 vsldoi12 RHS, <u,6,0,1>
+  2779175102U,	// <7,u,6,1>: Cost 3 vsldoi12 RHS, <u,6,1,7>
+  1648357887U,	// <7,u,6,2>: Cost 2 vsldoi8 <6,2,7,u>, <6,2,7,u>
+  1705433296U,	// <7,u,6,3>: Cost 2 vsldoi12 RHS, <u,6,3,7>
+  2779175127U,	// <7,u,6,4>: Cost 3 vsldoi12 RHS, <u,6,4,5>
+  2779175138U,	// <7,u,6,5>: Cost 3 vsldoi12 RHS, <u,6,5,7>
+  1651012419U,	// <7,u,6,6>: Cost 2 vsldoi8 <6,6,7,u>, <6,6,7,u>
+  1705580788U,	// <7,u,6,7>: Cost 2 vsldoi12 RHS, <u,6,7,7>
+  1705433341U,	// <7,u,6,u>: Cost 2 vsldoi12 RHS, <u,6,u,7>
+  1705580800U,	// <7,u,7,0>: Cost 2 vsldoi12 RHS, <u,7,0,1>
+  1187878702U,	// <7,u,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS
+  2768042263U,	// <7,u,7,2>: Cost 3 vsldoi12 <2,6,u,7>, <u,7,2,6>
+  1248346268U,	// <7,u,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS
+  1705580840U,	// <7,u,7,4>: Cost 2 vsldoi12 RHS, <u,7,4,5>
+  1187879066U,	// <7,u,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS
+  2779322679U,	// <7,u,7,6>: Cost 3 vsldoi12 RHS, <u,7,6,2>
+  430361910U,	// <7,u,7,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <7,u,7,u>: Cost 1 vspltisw3 RHS
+  1705433425U,	// <7,u,u,0>: Cost 2 vsldoi12 RHS, <u,u,0,1>
+  1705433435U,	// <7,u,u,1>: Cost 2 vsldoi12 RHS, <u,u,1,2>
+  631691621U,	// <7,u,u,2>: Cost 1 vsldoi12 RHS, LHS
+  1705433451U,	// <7,u,u,3>: Cost 2 vsldoi12 RHS, <u,u,3,0>
+  1705433465U,	// <7,u,u,4>: Cost 2 vsldoi12 RHS, <u,u,4,5>
+  1705433475U,	// <7,u,u,5>: Cost 2 vsldoi12 RHS, <u,u,5,6>
+  631691661U,	// <7,u,u,6>: Cost 1 vsldoi12 RHS, RHS
+  430361910U,	// <7,u,u,7>: Cost 1 vspltisw3 RHS
+  631691675U,	// <7,u,u,u>: Cost 1 vsldoi12 RHS, LHS
+  202162278U,	// <u,0,0,0>: Cost 1 vspltisw0 LHS
+  1678598154U,	// <u,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1>
+  2634500154U,	// <u,0,0,2>: Cost 3 vsldoi4 <2,u,0,0>, <2,u,0,0>
+  2289596269U,	// <u,0,0,3>: Cost 3 vmrglw <1,2,u,0>, <u,2,0,3>
+  1548815670U,	// <u,0,0,4>: Cost 2 vsldoi4 <0,u,0,0>, RHS
+  2663698530U,	// <u,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0>
+  2658390942U,	// <u,0,0,6>: Cost 3 vsldoi4 <6,u,0,0>, <6,u,0,0>
+  2289596597U,	// <u,0,0,7>: Cost 3 vmrglw <1,2,u,0>, <u,6,0,7>
+  202162278U,	// <u,0,0,u>: Cost 1 vspltisw0 LHS
+  1560764518U,	// <u,0,1,0>: Cost 2 vsldoi4 <2,u,0,1>, LHS
+  115720294U,	// <u,0,1,1>: Cost 1 vmrghw LHS, LHS
+  604856427U,	// <u,0,1,2>: Cost 1 vsldoi12 LHS, LHS
+  2634508438U,	// <u,0,1,3>: Cost 3 vsldoi4 <2,u,0,1>, <3,0,1,2>
+  1560767798U,	// <u,0,1,4>: Cost 2 vsldoi4 <2,u,0,1>, RHS
+  2652426438U,	// <u,0,1,5>: Cost 3 vsldoi4 <5,u,0,1>, <5,u,0,1>
+  1584657311U,	// <u,0,1,6>: Cost 2 vsldoi4 <6,u,0,1>, <6,u,0,1>
+  2658399226U,	// <u,0,1,7>: Cost 3 vsldoi4 <6,u,0,1>, <7,0,1,2>
+  604856476U,	// <u,0,1,u>: Cost 1 vsldoi12 LHS, LHS
+  2696889850U,	// <u,0,2,0>: Cost 3 vsldoi8 <2,0,u,0>, <2,0,u,0>
+  1190174822U,	// <u,0,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS
+  2692245096U,	// <u,0,2,2>: Cost 3 vsldoi8 <1,2,u,0>, <2,2,2,2>
+  2692245158U,	// <u,0,2,3>: Cost 3 vsldoi8 <1,2,u,0>, <2,3,0,1>
+  2263916882U,	// <u,0,2,4>: Cost 3 vmrghw <u,2,3,0>, <0,4,1,5>
+  2299709908U,	// <u,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5>
+  2692245434U,	// <u,0,2,6>: Cost 3 vsldoi8 <1,2,u,0>, <2,6,3,7>
+  2701535281U,	// <u,0,2,7>: Cost 3 vsldoi8 <2,7,u,0>, <2,7,u,0>
+  1190175389U,	// <u,0,2,u>: Cost 2 vmrghw <u,2,3,0>, LHS
+  1209237504U,	// <u,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+  1209239206U,	// <u,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+  2704189813U,	// <u,0,3,2>: Cost 3 vsldoi8 <3,2,u,0>, <3,2,u,0>
+  2692245916U,	// <u,0,3,3>: Cost 3 vsldoi8 <1,2,u,0>, <3,3,3,3>
+  2282981033U,	// <u,0,3,4>: Cost 3 vmrglw LHS, <2,3,0,4>
+  2664386658U,	// <u,0,3,5>: Cost 3 vsldoi4 <7,u,0,3>, <5,6,7,0>
+  2691877496U,	// <u,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7>
+  2664388218U,	// <u,0,3,7>: Cost 3 vsldoi4 <7,u,0,3>, <7,u,0,3>
+  1209239213U,	// <u,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+  2289623040U,	// <u,0,4,0>: Cost 3 vmrglw <1,2,u,4>, <0,0,0,0>
+  1678598482U,	// <u,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5>
+  2634532926U,	// <u,0,4,2>: Cost 3 vsldoi4 <2,u,0,4>, <2,u,0,4>
+  2235580672U,	// <u,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4>
+  1143619922U,	// <u,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+  1618505014U,	// <u,0,4,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS
+  2658423714U,	// <u,0,4,6>: Cost 3 vsldoi4 <6,u,0,4>, <6,u,0,4>
+  2713259464U,	// <u,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0>
+  1683243409U,	// <u,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5>
+  1192443904U,	// <u,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+  118702182U,	// <u,0,5,1>: Cost 1 vmrghw RHS, LHS
+  2266185901U,	// <u,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2>
+  2640513816U,	// <u,0,5,3>: Cost 3 vsldoi4 <3,u,0,5>, <3,u,0,5>
+  1192444242U,	// <u,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+  2718789636U,	// <u,0,5,5>: Cost 3 vsldoi8 <5,6,u,0>, <5,5,5,5>
+  1645047915U,	// <u,0,5,6>: Cost 2 vsldoi8 <5,6,u,0>, <5,6,u,0>
+  2664404604U,	// <u,0,5,7>: Cost 3 vsldoi4 <7,u,0,5>, <7,u,0,5>
+  118702749U,	// <u,0,5,u>: Cost 1 vmrghw RHS, LHS
+  2302910464U,	// <u,0,6,0>: Cost 3 vmrglw <3,4,u,6>, <0,0,0,0>
+  1192886374U,	// <u,0,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS
+  2718790138U,	// <u,0,6,2>: Cost 3 vsldoi8 <5,6,u,0>, <6,2,7,3>
+  2722771537U,	// <u,0,6,3>: Cost 3 vsldoi8 <6,3,u,0>, <6,3,u,0>
+  2266628434U,	// <u,0,6,4>: Cost 3 vmrghw <u,6,3,7>, <0,4,1,5>
+  2248950180U,	// <u,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6>
+  2718790456U,	// <u,0,6,6>: Cost 3 vsldoi8 <5,6,u,0>, <6,6,6,6>
+  2718790478U,	// <u,0,6,7>: Cost 3 vsldoi8 <5,6,u,0>, <6,7,0,1>
+  1192886941U,	// <u,0,6,u>: Cost 2 vmrghw <u,6,3,7>, LHS
+  1235812352U,	// <u,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+  1235814054U,	// <u,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+  2728080601U,	// <u,0,7,2>: Cost 3 vsldoi8 <7,2,u,0>, <7,2,u,0>
+  2640530202U,	// <u,0,7,3>: Cost 3 vsldoi4 <3,u,0,7>, <3,u,0,7>
+  2640530742U,	// <u,0,7,4>: Cost 3 vsldoi4 <3,u,0,7>, RHS
+  2309556692U,	// <u,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5>
+  2730735133U,	// <u,0,7,6>: Cost 3 vsldoi8 <7,6,u,0>, <7,6,u,0>
+  2309556856U,	// <u,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  1235814061U,	// <u,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u>
+  202162278U,	// <u,0,u,0>: Cost 1 vspltisw0 LHS
+  120365158U,	// <u,0,u,1>: Cost 1 vmrghw LHS, LHS
+  604856989U,	// <u,0,u,2>: Cost 1 vsldoi12 LHS, LHS
+  2692249532U,	// <u,0,u,3>: Cost 3 vsldoi8 <1,2,u,0>, <u,3,0,1>
+  1560825142U,	// <u,0,u,4>: Cost 2 vsldoi4 <2,u,0,u>, RHS
+  1618507930U,	// <u,0,u,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS
+  1584714662U,	// <u,0,u,6>: Cost 2 vsldoi4 <6,u,0,u>, <6,u,0,u>
+  2309565048U,	// <u,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  604857043U,	// <u,0,u,u>: Cost 1 vsldoi12 LHS, LHS
+  1611210825U,	// <u,1,0,0>: Cost 2 vsldoi8 <0,0,u,1>, <0,0,u,1>
+  1616519270U,	// <u,1,0,1>: Cost 2 vsldoi8 <0,u,u,1>, LHS
+  2287605459U,	// <u,1,0,2>: Cost 3 vmrglw <0,u,u,0>, <u,0,1,2>
+  2640546588U,	// <u,1,0,3>: Cost 3 vsldoi4 <3,u,1,0>, <3,u,1,0>
+  2622631222U,	// <u,1,0,4>: Cost 3 vsldoi4 <0,u,1,0>, RHS
+  2289590610U,	// <u,1,0,5>: Cost 3 vmrglw <1,2,u,0>, <0,4,1,5>
+  2664436630U,	// <u,1,0,6>: Cost 3 vsldoi4 <7,u,1,0>, <6,7,u,1>
+  2664437376U,	// <u,1,0,7>: Cost 3 vsldoi4 <7,u,1,0>, <7,u,1,0>
+  1616519889U,	// <u,1,0,u>: Cost 2 vsldoi8 <0,u,u,1>, <0,u,u,1>
+  1548894866U,	// <u,1,1,0>: Cost 2 vsldoi4 <0,u,1,1>, <0,u,1,1>
+  269271142U,	// <u,1,1,1>: Cost 1 vspltisw1 LHS
+  1189462934U,	// <u,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+  2622638230U,	// <u,1,1,3>: Cost 3 vsldoi4 <0,u,1,1>, <3,0,1,2>
+  1548897590U,	// <u,1,1,4>: Cost 2 vsldoi4 <0,u,1,1>, RHS
+  2756985692U,	// <u,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5>
+  2658472872U,	// <u,1,1,6>: Cost 3 vsldoi4 <6,u,1,1>, <6,u,1,1>
+  2287614142U,	// <u,1,1,7>: Cost 3 vmrglw <0,u,u,1>, <u,6,1,7>
+  269271142U,	// <u,1,1,u>: Cost 1 vspltisw1 LHS
+  1566818406U,	// <u,1,2,0>: Cost 2 vsldoi4 <3,u,1,2>, LHS
+  2756985735U,	// <u,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3>
+  1148371862U,	// <u,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  835584U,	// <u,1,2,3>: Cost 0 copy LHS
+  1566821686U,	// <u,1,2,4>: Cost 2 vsldoi4 <3,u,1,2>, RHS
+  2756985771U,	// <u,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3>
+  2690262970U,	// <u,1,2,6>: Cost 3 vsldoi8 <0,u,u,1>, <2,6,3,7>
+  1590711938U,	// <u,1,2,7>: Cost 2 vsldoi4 <7,u,1,2>, <7,u,1,2>
+  835584U,	// <u,1,2,u>: Cost 0 copy LHS
+  2282979337U,	// <u,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+  1209237514U,	// <u,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+  1209239702U,	// <u,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  2282979502U,	// <u,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+  2282979341U,	// <u,1,3,4>: Cost 3 vmrglw LHS, <0,0,1,4>
+  1209237842U,	// <u,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2282979505U,	// <u,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  2287625423U,	// <u,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+  1209237521U,	// <u,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+  1635101613U,	// <u,1,4,0>: Cost 2 vsldoi8 <4,0,u,1>, <4,0,u,1>
+  2289623050U,	// <u,1,4,1>: Cost 3 vmrglw <1,2,u,4>, <0,0,1,1>
+  2289625238U,	// <u,1,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,1,2>
+  2640579360U,	// <u,1,4,3>: Cost 3 vsldoi4 <3,u,1,4>, <3,u,1,4>
+  2622663990U,	// <u,1,4,4>: Cost 3 vsldoi4 <0,u,1,4>, RHS
+  1616522550U,	// <u,1,4,5>: Cost 2 vsldoi8 <0,u,u,1>, RHS
+  2664469398U,	// <u,1,4,6>: Cost 3 vsldoi4 <7,u,1,4>, <6,7,u,1>
+  2664470148U,	// <u,1,4,7>: Cost 3 vsldoi4 <7,u,1,4>, <7,u,1,4>
+  1616522793U,	// <u,1,4,u>: Cost 2 vsldoi8 <0,u,u,1>, RHS
+  1548927638U,	// <u,1,5,0>: Cost 2 vsldoi4 <0,u,1,5>, <0,u,1,5>
+  1192444724U,	// <u,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+  1192444822U,	// <u,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+  2622670998U,	// <u,1,5,3>: Cost 3 vsldoi4 <0,u,1,5>, <3,0,1,2>
+  1548930358U,	// <u,1,5,4>: Cost 2 vsldoi4 <0,u,1,5>, RHS
+  1210728786U,	// <u,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  2714153058U,	// <u,1,5,6>: Cost 3 vsldoi8 <4,u,u,1>, <5,6,7,0>
+  2670449658U,	// <u,1,5,7>: Cost 3 vsldoi4 <u,u,1,5>, <7,0,1,2>
+  1548932910U,	// <u,1,5,u>: Cost 2 vsldoi4 <0,u,1,5>, LHS
+  2622677655U,	// <u,1,6,0>: Cost 3 vsldoi4 <0,u,1,6>, <0,u,1,6>
+  2756986063U,	// <u,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7>
+  2302912662U,	// <u,1,6,2>: Cost 3 vmrglw <3,4,u,6>, <3,0,1,2>
+  3696421014U,	// <u,1,6,3>: Cost 4 vsldoi4 <0,u,1,6>, <3,0,1,2>
+  2622680374U,	// <u,1,6,4>: Cost 3 vsldoi4 <0,u,1,6>, RHS
+  2756986099U,	// <u,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7>
+  2714153784U,	// <u,1,6,6>: Cost 3 vsldoi8 <4,u,u,1>, <6,6,6,6>
+  1651692438U,	// <u,1,6,7>: Cost 2 vsldoi8 <6,7,u,1>, <6,7,u,1>
+  1652356071U,	// <u,1,6,u>: Cost 2 vsldoi8 <6,u,u,1>, <6,u,u,1>
+  2628657254U,	// <u,1,7,0>: Cost 3 vsldoi4 <1,u,1,7>, LHS
+  1235812362U,	// <u,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+  1235814550U,	// <u,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+  2309554350U,	// <u,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3>
+  2628660534U,	// <u,1,7,4>: Cost 3 vsldoi4 <1,u,1,7>, RHS
+  1235812690U,	// <u,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+  2309554353U,	// <u,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+  2309554678U,	// <u,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+  1235812369U,	// <u,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+  1548952217U,	// <u,1,u,0>: Cost 2 vsldoi4 <0,u,1,u>, <0,u,1,u>
+  269271142U,	// <u,1,u,1>: Cost 1 vspltisw1 LHS
+  1209280662U,	// <u,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  835584U,	// <u,1,u,3>: Cost 0 copy LHS
+  1548954934U,	// <u,1,u,4>: Cost 2 vsldoi4 <0,u,1,u>, RHS
+  1209278802U,	// <u,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2283020465U,	// <u,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  1590761096U,	// <u,1,u,7>: Cost 2 vsldoi4 <7,u,1,u>, <7,u,1,u>
+  835584U,	// <u,1,u,u>: Cost 0 copy LHS
+  2702876672U,	// <u,2,0,0>: Cost 3 vsldoi8 <3,0,u,2>, <0,0,0,0>
+  1629134950U,	// <u,2,0,1>: Cost 2 vsldoi8 <3,0,u,2>, LHS
+  2289591912U,	// <u,2,0,2>: Cost 3 vmrglw <1,2,u,0>, <2,2,2,2>
+  1215848550U,	// <u,2,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS
+  2702877010U,	// <u,2,0,4>: Cost 3 vsldoi8 <3,0,u,2>, <0,4,1,5>
+  2289222708U,	// <u,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5>
+  2779178473U,	// <u,2,0,6>: Cost 3 vsldoi12 RHS, <2,0,6,1>
+  2726249024U,	// <u,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0>
+  1215848555U,	// <u,2,0,u>: Cost 2 vmrglw <1,2,u,0>, LHS
+  2690933539U,	// <u,2,1,0>: Cost 3 vsldoi8 <1,0,u,2>, <1,0,u,2>
+  2628683124U,	// <u,2,1,1>: Cost 3 vsldoi4 <1,u,2,1>, <1,u,2,1>
+  1189463656U,	// <u,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+  1213866086U,	// <u,2,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS
+  2628685110U,	// <u,2,1,4>: Cost 3 vsldoi4 <1,u,2,1>, RHS
+  2263205736U,	// <u,2,1,5>: Cost 3 vmrghw LHS, <2,5,3,6>
+  1189463994U,	// <u,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  2263205866U,	// <u,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+  1213866091U,	// <u,2,1,u>: Cost 2 vmrglw <0,u,u,1>, LHS
+  1556938854U,	// <u,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  2697569869U,	// <u,2,2,1>: Cost 3 vsldoi8 <2,1,u,2>, <2,1,u,2>
+  336380006U,	// <u,2,2,2>: Cost 1 vspltisw2 LHS
+  1678599794U,	// <u,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3>
+  1556942134U,	// <u,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  2295138061U,	// <u,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5>
+  2702878650U,	// <u,2,2,6>: Cost 3 vsldoi8 <3,0,u,2>, <2,6,3,7>
+  2300229831U,	// <u,2,2,7>: Cost 3 vmrglw <3,0,u,2>, <u,6,2,7>
+  336380006U,	// <u,2,2,u>: Cost 1 vspltisw2 LHS
+  475243165U,	// <u,2,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1548985140U,	// <u,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1209239144U,	// <u,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2>
+  135495782U,	// <u,2,3,3>: Cost 1 vmrglw LHS, LHS
+  475245878U,	// <u,2,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1596764164U,	// <u,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+  1596764666U,	// <u,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1596765178U,	// <u,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  135495787U,	// <u,2,3,u>: Cost 1 vmrglw LHS, LHS
+  2708851630U,	// <u,2,4,0>: Cost 3 vsldoi8 <4,0,u,2>, <4,0,u,2>
+  2217362979U,	// <u,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5>
+  2289624680U,	// <u,2,4,2>: Cost 3 vmrglw <1,2,u,4>, <2,2,2,2>
+  1215881318U,	// <u,2,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS
+  2726767824U,	// <u,2,4,4>: Cost 3 vsldoi8 <7,0,u,2>, <4,4,4,4>
+  1629138230U,	// <u,2,4,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS
+  2779178801U,	// <u,2,4,6>: Cost 3 vsldoi12 RHS, <2,4,6,5>
+  2726251976U,	// <u,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0>
+  1215881323U,	// <u,2,4,u>: Cost 2 vmrglw <1,2,u,4>, LHS
+  2628714598U,	// <u,2,5,0>: Cost 3 vsldoi4 <1,u,2,5>, LHS
+  2628715896U,	// <u,2,5,1>: Cost 3 vsldoi4 <1,u,2,5>, <1,u,2,5>
+  1192445544U,	// <u,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+  1213898854U,	// <u,2,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS
+  2628717878U,	// <u,2,5,4>: Cost 3 vsldoi4 <1,u,2,5>, RHS
+  2726768644U,	// <u,2,5,5>: Cost 3 vsldoi8 <7,0,u,2>, <5,5,5,5>
+  1192445882U,	// <u,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+  2266187754U,	// <u,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1>
+  1213898859U,	// <u,2,5,u>: Cost 2 vmrglw <0,u,u,5>, LHS
+  2634694758U,	// <u,2,6,0>: Cost 3 vsldoi4 <2,u,2,6>, LHS
+  2721460657U,	// <u,2,6,1>: Cost 3 vsldoi8 <6,1,u,2>, <6,1,u,2>
+  2296940136U,	// <u,2,6,2>: Cost 3 vmrglw <2,4,u,6>, <2,2,2,2>
+  1678600122U,	// <u,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7>
+  2634698038U,	// <u,2,6,4>: Cost 3 vsldoi4 <2,u,2,6>, RHS
+  3370682125U,	// <u,2,6,5>: Cost 4 vmrglw <2,4,u,6>, <2,4,2,5>
+  1157056442U,	// <u,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2725442455U,	// <u,2,6,7>: Cost 3 vsldoi8 <6,7,u,2>, <6,7,u,2>
+  1678600167U,	// <u,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7>
+  1653027897U,	// <u,2,7,0>: Cost 2 vsldoi8 <7,0,u,2>, <7,0,u,2>
+  2309554924U,	// <u,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+  1235813992U,	// <u,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+  162070630U,	// <u,2,7,3>: Cost 1 vmrglw RHS, LHS
+  2634706230U,	// <u,2,7,4>: Cost 3 vsldoi4 <2,u,2,7>, RHS
+  2309555252U,	// <u,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5>
+  2309555901U,	// <u,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+  2309555416U,	// <u,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+  162070635U,	// <u,2,7,u>: Cost 1 vmrglw RHS, LHS
+  475284130U,	// <u,2,u,0>: Cost 1 vsldoi4 LHS, LHS
+  1549026100U,	// <u,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  336380006U,	// <u,2,u,2>: Cost 1 vspltisw2 LHS
+  135536742U,	// <u,2,u,3>: Cost 1 vmrglw LHS, LHS
+  475286838U,	// <u,2,u,4>: Cost 1 vsldoi4 LHS, RHS
+  1629141146U,	// <u,2,u,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS
+  1194108858U,	// <u,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  1596806138U,	// <u,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  135536747U,	// <u,2,u,u>: Cost 1 vmrglw LHS, LHS
+  1611890688U,	// <u,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+  538149020U,	// <u,3,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2685632685U,	// <u,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  2685632764U,	// <u,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+  1611891026U,	// <u,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  2733408722U,	// <u,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7>
+  2658612153U,	// <u,3,0,6>: Cost 3 vsldoi4 <6,u,3,0>, <6,u,3,0>
+  2289592250U,	// <u,3,0,7>: Cost 3 vmrglw <1,2,u,0>, <2,6,3,7>
+  538149533U,	// <u,3,0,u>: Cost 1 vsldoi8 LHS, LHS
+  1189464214U,	// <u,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+  1611891508U,	// <u,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+  1611891606U,	// <u,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+  1189464476U,	// <u,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+  1189464578U,	// <u,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+  2690278511U,	// <u,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+  2690278607U,	// <u,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+  2287609786U,	// <u,3,1,7>: Cost 3 vmrglw <0,u,u,1>, <2,6,3,7>
+  1611892092U,	// <u,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+  2685634042U,	// <u,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,u,0>
+  2685634079U,	// <u,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1>
+  1611892328U,	// <u,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2>
+  1611892390U,	// <u,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+  2685634371U,	// <u,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,u,5>
+  2685634453U,	// <u,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,u,6>
+  1611892666U,	// <u,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  2300225466U,	// <u,3,2,7>: Cost 3 vmrglw <3,0,u,2>, <2,6,3,7>
+  1611892795U,	// <u,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,0,1>
+  1209238422U,	// <u,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+  2282980247U,	// <u,3,3,1>: Cost 3 vmrglw LHS, <1,2,3,1>
+  1561004120U,	// <u,3,3,2>: Cost 2 vsldoi4 <2,u,3,3>, <2,u,3,3>
+  403488870U,	// <u,3,3,3>: Cost 1 vspltisw3 LHS
+  1209238426U,	// <u,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+  2282980899U,	// <u,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5>
+  2282985598U,	// <u,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6>
+  1209239482U,	// <u,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  403488870U,	// <u,3,3,u>: Cost 1 vspltisw3 LHS
+  1555038310U,	// <u,3,4,0>: Cost 2 vsldoi4 <1,u,3,4>, LHS
+  1555039616U,	// <u,3,4,1>: Cost 2 vsldoi4 <1,u,3,4>, <1,u,3,4>
+  2628781672U,	// <u,3,4,2>: Cost 3 vsldoi4 <1,u,3,4>, <2,2,2,2>
+  2289624690U,	// <u,3,4,3>: Cost 3 vmrglw <1,2,u,4>, <2,2,3,3>
+  1555041590U,	// <u,3,4,4>: Cost 2 vsldoi4 <1,u,3,4>, RHS
+  538152246U,	// <u,3,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2658644925U,	// <u,3,4,6>: Cost 3 vsldoi4 <6,u,3,4>, <6,u,3,4>
+  2289625018U,	// <u,3,4,7>: Cost 3 vmrglw <1,2,u,4>, <2,6,3,7>
+  538152489U,	// <u,3,4,u>: Cost 1 vsldoi8 LHS, RHS
+  1192446102U,	// <u,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+  2733411983U,	// <u,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1>
+  2634762330U,	// <u,3,5,2>: Cost 3 vsldoi4 <2,u,3,5>, <2,u,3,5>
+  1192446364U,	// <u,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+  1192446466U,	// <u,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+  1659670532U,	// <u,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+  1659670626U,	// <u,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0>
+  2287642554U,	// <u,3,5,7>: Cost 3 vmrglw <0,u,u,5>, <2,6,3,7>
+  1659670788U,	// <u,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0>
+  2634768486U,	// <u,3,6,0>: Cost 3 vsldoi4 <2,u,3,6>, LHS
+  2733412775U,	// <u,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1>
+  1648390659U,	// <u,3,6,2>: Cost 2 vsldoi8 <6,2,u,3>, <6,2,u,3>
+  2634770973U,	// <u,3,6,3>: Cost 3 vsldoi4 <2,u,3,6>, <3,4,u,6>
+  2634771766U,	// <u,3,6,4>: Cost 3 vsldoi4 <2,u,3,6>, RHS
+  2733413099U,	// <u,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1>
+  1659671352U,	// <u,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+  1659671374U,	// <u,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+  1652372457U,	// <u,3,6,u>: Cost 2 vsldoi8 <6,u,u,3>, <6,u,u,3>
+  1561034854U,	// <u,3,7,0>: Cost 2 vsldoi4 <2,u,3,7>, LHS
+  2634777396U,	// <u,3,7,1>: Cost 3 vsldoi4 <2,u,3,7>, <1,1,1,1>
+  1561036892U,	// <u,3,7,2>: Cost 2 vsldoi4 <2,u,3,7>, <2,u,3,7>
+  1235814002U,	// <u,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+  1561038134U,	// <u,3,7,4>: Cost 2 vsldoi4 <2,u,3,7>, RHS
+  2309555747U,	// <u,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5>
+  2309556072U,	// <u,3,7,6>: Cost 3 vmrglw RHS, <2,5,3,6>
+  1235814330U,	// <u,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+  1561040686U,	// <u,3,7,u>: Cost 2 vsldoi4 <2,u,3,7>, LHS
+  1611896531U,	// <u,3,u,0>: Cost 2 vsldoi8 LHS, <u,0,1,2>
+  538154798U,	// <u,3,u,1>: Cost 1 vsldoi8 LHS, LHS
+  1611896712U,	// <u,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,3>
+  403488870U,	// <u,3,u,3>: Cost 1 vspltisw3 LHS
+  1611896895U,	// <u,3,u,4>: Cost 2 vsldoi8 LHS, <u,4,5,6>
+  538155162U,	// <u,3,u,5>: Cost 1 vsldoi8 LHS, RHS
+  1611897040U,	// <u,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7>
+  1209280442U,	// <u,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  538155365U,	// <u,3,u,u>: Cost 1 vsldoi8 LHS, LHS
+  1165118354U,	// <u,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+  1618534502U,	// <u,4,0,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+  2634795102U,	// <u,4,0,2>: Cost 3 vsldoi4 <2,u,4,0>, <2,u,4,0>
+  2686451968U,	// <u,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4>
+  2692276562U,	// <u,4,0,4>: Cost 3 vsldoi8 <1,2,u,4>, <0,4,1,5>
+  1705438098U,	// <u,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1>
+  2658685890U,	// <u,4,0,6>: Cost 3 vsldoi4 <6,u,4,0>, <6,u,4,0>
+  2256489928U,	// <u,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0>
+  1618535069U,	// <u,4,0,u>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+  1189464978U,	// <u,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+  2692277044U,	// <u,4,1,1>: Cost 3 vsldoi8 <1,2,u,4>, <1,1,1,1>
+  1618535367U,	// <u,4,1,2>: Cost 2 vsldoi8 <1,2,u,4>, <1,2,u,4>
+  2640775992U,	// <u,4,1,3>: Cost 3 vsldoi4 <3,u,4,1>, <3,u,4,1>
+  1189465296U,	// <u,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+  115723574U,	// <u,4,1,5>: Cost 1 vmrghw LHS, RHS
+  2263207289U,	// <u,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2>
+  2664666780U,	// <u,4,1,7>: Cost 3 vsldoi4 <7,u,4,1>, <7,u,4,1>
+  115723817U,	// <u,4,1,u>: Cost 1 vmrghw LHS, RHS
+  2263919506U,	// <u,4,2,0>: Cost 3 vmrghw <u,2,3,0>, <4,0,5,1>
+  2222115812U,	// <u,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2>
+  2692277864U,	// <u,4,2,2>: Cost 3 vsldoi8 <1,2,u,4>, <2,2,2,2>
+  2692277926U,	// <u,4,2,3>: Cost 3 vsldoi8 <1,2,u,4>, <2,3,0,1>
+  2324114640U,	// <u,4,2,4>: Cost 3 vmrglw <7,0,u,2>, <4,4,4,4>
+  1190178102U,	// <u,4,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS
+  2692278202U,	// <u,4,2,6>: Cost 3 vsldoi8 <1,2,u,4>, <2,6,3,7>
+  2701568053U,	// <u,4,2,7>: Cost 3 vsldoi8 <2,7,u,4>, <2,7,u,4>
+  1190178345U,	// <u,4,2,u>: Cost 2 vmrghw <u,2,3,0>, RHS
+  2692278422U,	// <u,4,3,0>: Cost 3 vsldoi8 <1,2,u,4>, <3,0,1,2>
+  2282981552U,	// <u,4,3,1>: Cost 3 vmrglw LHS, <3,0,4,1>
+  2704222585U,	// <u,4,3,2>: Cost 3 vsldoi8 <3,2,u,4>, <3,2,u,4>
+  2692278684U,	// <u,4,3,3>: Cost 3 vsldoi8 <1,2,u,4>, <3,3,3,3>
+  1257016528U,	// <u,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+  1209239246U,	// <u,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+  2691910300U,	// <u,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7>
+  2664683166U,	// <u,4,3,7>: Cost 3 vsldoi4 <7,u,4,3>, <7,u,4,3>
+  1209239249U,	// <u,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+  1573027942U,	// <u,4,4,0>: Cost 2 vsldoi4 <4,u,4,4>, LHS
+  2634826695U,	// <u,4,4,1>: Cost 3 vsldoi4 <2,u,4,4>, <1,2,u,4>
+  2634827874U,	// <u,4,4,2>: Cost 3 vsldoi4 <2,u,4,4>, <2,u,4,4>
+  2289629073U,	// <u,4,4,3>: Cost 3 vmrglw <1,2,u,4>, <u,2,4,3>
+  229035318U,	// <u,4,4,4>: Cost 1 vspltisw0 RHS
+  1618537782U,	// <u,4,4,5>: Cost 2 vsldoi8 <1,2,u,4>, RHS
+  2658718662U,	// <u,4,4,6>: Cost 3 vsldoi4 <6,u,4,4>, <6,u,4,4>
+  2289629401U,	// <u,4,4,7>: Cost 3 vmrglw <1,2,u,4>, <u,6,4,7>
+  229035318U,	// <u,4,4,u>: Cost 1 vspltisw0 RHS
+  1561092198U,	// <u,4,5,0>: Cost 2 vsldoi4 <2,u,4,5>, LHS
+  2628863370U,	// <u,4,5,1>: Cost 3 vsldoi4 <1,u,4,5>, <1,u,4,5>
+  1561094243U,	// <u,4,5,2>: Cost 2 vsldoi4 <2,u,4,5>, <2,u,4,5>
+  2634836118U,	// <u,4,5,3>: Cost 3 vsldoi4 <2,u,4,5>, <3,0,1,2>
+  1561095478U,	// <u,4,5,4>: Cost 2 vsldoi4 <2,u,4,5>, RHS
+  118705462U,	// <u,4,5,5>: Cost 1 vmrghw RHS, RHS
+  604859702U,	// <u,4,5,6>: Cost 1 vsldoi12 LHS, RHS
+  2658726906U,	// <u,4,5,7>: Cost 3 vsldoi4 <6,u,4,5>, <7,0,1,2>
+  604859720U,	// <u,4,5,u>: Cost 1 vsldoi12 LHS, RHS
+  2266631058U,	// <u,4,6,0>: Cost 3 vmrghw <u,6,3,7>, <4,0,5,1>
+  2302692152U,	// <u,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1>
+  2718822906U,	// <u,4,6,2>: Cost 3 vsldoi8 <5,6,u,4>, <6,2,7,3>
+  2722804309U,	// <u,4,6,3>: Cost 3 vsldoi8 <6,3,u,4>, <6,3,u,4>
+  2723467942U,	// <u,4,6,4>: Cost 3 vsldoi8 <6,4,u,4>, <6,4,u,4>
+  1192889654U,	// <u,4,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS
+  2718823224U,	// <u,4,6,6>: Cost 3 vsldoi8 <5,6,u,4>, <6,6,6,6>
+  2718823246U,	// <u,4,6,7>: Cost 3 vsldoi8 <5,6,u,4>, <6,7,0,1>
+  1192889897U,	// <u,4,6,u>: Cost 2 vmrghw <u,6,3,7>, RHS
+  2640822374U,	// <u,4,7,0>: Cost 3 vsldoi4 <3,u,4,7>, LHS
+  2640823194U,	// <u,4,7,1>: Cost 3 vsldoi4 <3,u,4,7>, <1,2,3,4>
+  2728113373U,	// <u,4,7,2>: Cost 3 vsldoi8 <7,2,u,4>, <7,2,u,4>
+  2640825150U,	// <u,4,7,3>: Cost 3 vsldoi4 <3,u,4,7>, <3,u,4,7>
+  1235815632U,	// <u,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+  1235814094U,	// <u,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+  2730767905U,	// <u,4,7,6>: Cost 3 vsldoi8 <7,6,u,4>, <7,6,u,4>
+  2309556892U,	// <u,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  1235814097U,	// <u,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u>
+  1561116774U,	// <u,4,u,0>: Cost 2 vsldoi4 <2,u,4,u>, LHS
+  1618540334U,	// <u,4,u,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+  1561118822U,	// <u,4,u,2>: Cost 2 vsldoi4 <2,u,4,u>, <2,u,4,u>
+  2692282300U,	// <u,4,u,3>: Cost 3 vsldoi8 <1,2,u,4>, <u,3,0,1>
+  229035318U,	// <u,4,u,4>: Cost 1 vspltisw0 RHS
+  120368438U,	// <u,4,u,5>: Cost 1 vmrghw LHS, RHS
+  604859945U,	// <u,4,u,6>: Cost 1 vsldoi12 LHS, RHS
+  2309565084U,	// <u,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  604859963U,	// <u,4,u,u>: Cost 1 vsldoi12 LHS, RHS
+  2690293760U,	// <u,5,0,0>: Cost 3 vsldoi8 <0,u,u,5>, <0,0,0,0>
+  1616552038U,	// <u,5,0,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS
+  2640840434U,	// <u,5,0,2>: Cost 3 vsldoi4 <3,u,5,0>, <2,3,u,5>
+  2640841536U,	// <u,5,0,3>: Cost 3 vsldoi4 <3,u,5,0>, <3,u,5,0>
+  1613381970U,	// <u,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5>
+  2316135642U,	// <u,5,0,5>: Cost 3 vmrglw <5,6,u,0>, <4,4,5,5>
+  2289592834U,	// <u,5,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,5,6>
+  2664732324U,	// <u,5,0,7>: Cost 3 vsldoi4 <7,u,5,0>, <7,u,5,0>
+  1616552661U,	// <u,5,0,u>: Cost 2 vsldoi8 <0,u,u,5>, <0,u,u,5>
+  1573077094U,	// <u,5,1,0>: Cost 2 vsldoi4 <4,u,5,1>, LHS
+  1237536282U,	// <u,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  2690294678U,	// <u,5,1,2>: Cost 3 vsldoi8 <0,u,u,5>, <1,2,3,0>
+  2646821014U,	// <u,5,1,3>: Cost 3 vsldoi4 <4,u,5,1>, <3,0,1,2>
+  1573080602U,	// <u,5,1,4>: Cost 2 vsldoi4 <4,u,5,1>, <4,u,5,1>
+  1189466116U,	// <u,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+  1189466210U,	// <u,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+  2646823930U,	// <u,5,1,7>: Cost 3 vsldoi4 <4,u,5,1>, <7,0,1,2>
+  1573082926U,	// <u,5,1,u>: Cost 2 vsldoi4 <4,u,5,1>, LHS
+  2640855142U,	// <u,5,2,0>: Cost 3 vsldoi4 <3,u,5,2>, LHS
+  2697594448U,	// <u,5,2,1>: Cost 3 vsldoi8 <2,1,u,5>, <2,1,u,5>
+  2690295400U,	// <u,5,2,2>: Cost 3 vsldoi8 <0,u,u,5>, <2,2,2,2>
+  1625179890U,	// <u,5,2,3>: Cost 2 vsldoi8 <2,3,u,5>, <2,3,u,5>
+  2699585347U,	// <u,5,2,4>: Cost 3 vsldoi8 <2,4,u,5>, <2,4,u,5>
+  2781171471U,	// <u,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3>
+  2690295738U,	// <u,5,2,6>: Cost 3 vsldoi8 <0,u,u,5>, <2,6,3,7>
+  3775318070U,	// <u,5,2,7>: Cost 4 vsldoi8 <2,7,u,5>, <2,7,u,5>
+  1628498055U,	// <u,5,2,u>: Cost 2 vsldoi8 <2,u,u,5>, <2,u,u,5>
+  2287627234U,	// <u,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+  1257016210U,	// <u,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+  2646836942U,	// <u,5,3,2>: Cost 3 vsldoi4 <4,u,5,3>, <2,3,4,5>
+  2287625131U,	// <u,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+  2287627238U,	// <u,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+  1257016538U,	// <u,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+  1209240066U,	// <u,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  2287625459U,	// <u,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+  1209240068U,	// <u,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+  2640871526U,	// <u,5,4,0>: Cost 3 vsldoi4 <3,u,5,4>, LHS
+  2316168082U,	// <u,5,4,1>: Cost 3 vmrglw <5,6,u,4>, <4,0,5,1>
+  2640873202U,	// <u,5,4,2>: Cost 3 vsldoi4 <3,u,5,4>, <2,3,u,5>
+  2640874308U,	// <u,5,4,3>: Cost 3 vsldoi4 <3,u,5,4>, <3,u,5,4>
+  1637788917U,	// <u,5,4,4>: Cost 2 vsldoi8 <4,4,u,5>, <4,4,u,5>
+  1616555318U,	// <u,5,4,5>: Cost 2 vsldoi8 <0,u,u,5>, RHS
+  2287638591U,	// <u,5,4,6>: Cost 3 vmrglw <0,u,u,4>, <u,4,5,6>
+  2664765096U,	// <u,5,4,7>: Cost 3 vsldoi4 <7,u,5,4>, <7,u,5,4>
+  1616555561U,	// <u,5,4,u>: Cost 2 vsldoi8 <0,u,u,5>, RHS
+  1573109862U,	// <u,5,5,0>: Cost 2 vsldoi4 <4,u,5,5>, LHS
+  2646852404U,	// <u,5,5,1>: Cost 3 vsldoi4 <4,u,5,5>, <1,1,1,1>
+  2646853224U,	// <u,5,5,2>: Cost 3 vsldoi4 <4,u,5,5>, <2,2,2,2>
+  2287646618U,	// <u,5,5,3>: Cost 3 vmrglw <0,u,u,5>, <u,2,5,3>
+  1573113374U,	// <u,5,5,4>: Cost 2 vsldoi4 <4,u,5,5>, <4,u,5,5>
+  296144182U,	// <u,5,5,5>: Cost 1 vspltisw1 RHS
+  1192448098U,	// <u,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+  2287646946U,	// <u,5,5,7>: Cost 3 vmrglw <0,u,u,5>, <u,6,5,7>
+  296144182U,	// <u,5,5,u>: Cost 1 vspltisw1 RHS
+  1567146086U,	// <u,5,6,0>: Cost 2 vsldoi4 <3,u,5,6>, LHS
+  2628945300U,	// <u,5,6,1>: Cost 3 vsldoi4 <1,u,5,6>, <1,u,5,6>
+  2634917997U,	// <u,5,6,2>: Cost 3 vsldoi4 <2,u,5,6>, <2,u,5,6>
+  1567148870U,	// <u,5,6,3>: Cost 2 vsldoi4 <3,u,5,6>, <3,u,5,6>
+  1567149366U,	// <u,5,6,4>: Cost 2 vsldoi4 <3,u,5,6>, RHS
+  2781171799U,	// <u,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7>
+  1228950018U,	// <u,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  27705344U,	// <u,5,6,7>: Cost 0 copy RHS
+  27705344U,	// <u,5,6,u>: Cost 0 copy RHS
+  2628952166U,	// <u,5,7,0>: Cost 3 vsldoi4 <1,u,5,7>, LHS
+  1235815314U,	// <u,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+  2309556734U,	// <u,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+  2309555115U,	// <u,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+  2628955446U,	// <u,5,7,4>: Cost 3 vsldoi4 <1,u,5,7>, RHS
+  1235815642U,	// <u,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+  1235814914U,	// <u,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+  2309555443U,	// <u,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+  1235814916U,	// <u,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+  1567162470U,	// <u,5,u,0>: Cost 2 vsldoi4 <3,u,5,u>, LHS
+  1616557870U,	// <u,5,u,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS
+  2690299781U,	// <u,5,u,2>: Cost 3 vsldoi8 <0,u,u,5>, <u,2,3,0>
+  1567165256U,	// <u,5,u,3>: Cost 2 vsldoi4 <3,u,5,u>, <3,u,5,u>
+  1567165750U,	// <u,5,u,4>: Cost 2 vsldoi4 <3,u,5,u>, RHS
+  296144182U,	// <u,5,u,5>: Cost 1 vspltisw1 RHS
+  1209281026U,	// <u,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  27705344U,	// <u,5,u,7>: Cost 0 copy RHS
+  27705344U,	// <u,5,u,u>: Cost 0 copy RHS
+  2705563648U,	// <u,6,0,0>: Cost 3 vsldoi8 <3,4,u,6>, <0,0,0,0>
+  1631821926U,	// <u,6,0,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS
+  2262462970U,	// <u,6,0,2>: Cost 3 vmrghw <u,0,1,2>, <6,2,7,3>
+  2646886941U,	// <u,6,0,3>: Cost 3 vsldoi4 <4,u,6,0>, <3,4,u,6>
+  2705563986U,	// <u,6,0,4>: Cost 3 vsldoi8 <3,4,u,6>, <0,4,1,5>
+  2316062652U,	// <u,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5>
+  2316137272U,	// <u,6,0,6>: Cost 3 vmrglw <5,6,u,0>, <6,6,6,6>
+  1215851830U,	// <u,6,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS
+  1215851831U,	// <u,6,0,u>: Cost 2 vmrglw <1,2,u,0>, RHS
+  2634948710U,	// <u,6,1,0>: Cost 3 vsldoi4 <2,u,6,1>, LHS
+  2705564468U,	// <u,6,1,1>: Cost 3 vsldoi8 <3,4,u,6>, <1,1,1,1>
+  1189466618U,	// <u,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+  2263208498U,	// <u,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5>
+  2693620843U,	// <u,6,1,4>: Cost 3 vsldoi8 <1,4,u,6>, <1,4,u,6>
+  2652868860U,	// <u,6,1,5>: Cost 3 vsldoi4 <5,u,6,1>, <5,u,6,1>
+  1189466936U,	// <u,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+  1213869366U,	// <u,6,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS
+  1213869367U,	// <u,6,1,u>: Cost 2 vmrglw <0,u,u,1>, RHS
+  2658844774U,	// <u,6,2,0>: Cost 3 vsldoi4 <6,u,6,2>, LHS
+  3771344465U,	// <u,6,2,1>: Cost 4 vsldoi8 <2,1,u,6>, <2,1,u,6>
+  1178554874U,	// <u,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2698929907U,	// <u,6,2,3>: Cost 3 vsldoi8 <2,3,u,6>, <2,3,u,6>
+  2699593540U,	// <u,6,2,4>: Cost 3 vsldoi8 <2,4,u,6>, <2,4,u,6>
+  2700257173U,	// <u,6,2,5>: Cost 3 vsldoi8 <2,5,u,6>, <2,5,u,6>
+  2705565626U,	// <u,6,2,6>: Cost 3 vsldoi8 <3,4,u,6>, <2,6,3,7>
+  1226485046U,	// <u,6,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS
+  1226485047U,	// <u,6,2,u>: Cost 2 vmrglw <3,0,u,2>, RHS
+  2705565846U,	// <u,6,3,0>: Cost 3 vsldoi8 <3,4,u,6>, <3,0,1,2>
+  2330756585U,	// <u,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1>
+  2330756829U,	// <u,6,3,2>: Cost 3 vmrglw LHS, <2,3,6,2>
+  2282981734U,	// <u,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+  1631824413U,	// <u,6,3,4>: Cost 2 vsldoi8 <3,4,u,6>, <3,4,u,6>
+  2652885246U,	// <u,6,3,5>: Cost 3 vsldoi4 <5,u,6,3>, <5,u,6,3>
+  1257018168U,	// <u,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+  135499062U,	// <u,6,3,7>: Cost 1 vmrglw LHS, RHS
+  135499063U,	// <u,6,3,u>: Cost 1 vmrglw LHS, RHS
+  2646917222U,	// <u,6,4,0>: Cost 3 vsldoi4 <4,u,6,4>, LHS
+  2217365931U,	// <u,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5>
+  2790167156U,	// <u,6,4,2>: Cost 3 vsldoi12 <6,4,2,u>, <6,4,2,u>
+  2646919709U,	// <u,6,4,3>: Cost 3 vsldoi4 <4,u,6,4>, <3,4,u,6>
+  2711538934U,	// <u,6,4,4>: Cost 3 vsldoi8 <4,4,u,6>, <4,4,u,6>
+  1631825206U,	// <u,6,4,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS
+  2316170040U,	// <u,6,4,6>: Cost 3 vmrglw <5,6,u,4>, <6,6,6,6>
+  1215884598U,	// <u,6,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS
+  1215884599U,	// <u,6,4,u>: Cost 2 vmrglw <1,2,u,4>, RHS
+  2634981478U,	// <u,6,5,0>: Cost 3 vsldoi4 <2,u,6,5>, LHS
+  2266190247U,	// <u,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+  1192448506U,	// <u,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+  2266190386U,	// <u,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+  2634984758U,	// <u,6,5,4>: Cost 3 vsldoi4 <2,u,6,5>, RHS
+  2652901632U,	// <u,6,5,5>: Cost 3 vsldoi4 <5,u,6,5>, <5,u,6,5>
+  1192448824U,	// <u,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+  1213902134U,	// <u,6,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS
+  1213902135U,	// <u,6,5,u>: Cost 2 vmrglw <0,u,u,5>, RHS
+  1583808614U,	// <u,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  2322010445U,	// <u,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1>
+  2718839290U,	// <u,6,6,2>: Cost 3 vsldoi8 <5,6,u,6>, <6,2,7,3>
+  2670823965U,	// <u,6,6,3>: Cost 3 vsldoi4 <u,u,6,6>, <3,4,u,6>
+  1583811894U,	// <u,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  2724147961U,	// <u,6,6,5>: Cost 3 vsldoi8 <6,5,u,6>, <6,5,u,6>
+  363253046U,	// <u,6,6,6>: Cost 1 vspltisw2 RHS
+  1229172022U,	// <u,6,6,7>: Cost 2 vmrglw <3,4,u,6>, RHS
+  363253046U,	// <u,6,6,u>: Cost 1 vspltisw2 RHS
+  499458150U,	// <u,6,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1573200692U,	// <u,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+  1573201512U,	// <u,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1573202070U,	// <u,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  499461673U,	// <u,6,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1573203972U,	// <u,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+  1235817272U,	// <u,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6>
+  162073910U,	// <u,6,7,7>: Cost 1 vmrglw RHS, RHS
+  162073911U,	// <u,6,7,u>: Cost 1 vmrglw RHS, RHS
+  499466342U,	// <u,6,u,0>: Cost 1 vsldoi4 RHS, LHS
+  1631827758U,	// <u,6,u,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS
+  1573209704U,	// <u,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1573210262U,	// <u,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  499469866U,	// <u,6,u,4>: Cost 1 vsldoi4 RHS, RHS
+  1631828122U,	// <u,6,u,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS
+  363253046U,	// <u,6,u,6>: Cost 1 vspltisw2 RHS
+  135540022U,	// <u,6,u,7>: Cost 1 vmrglw LHS, RHS
+  135540023U,	// <u,6,u,u>: Cost 1 vmrglw LHS, RHS
+  1638465536U,	// <u,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+  564723814U,	// <u,7,0,1>: Cost 1 vsldoi8 RHS, LHS
+  2712207533U,	// <u,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+  2712207612U,	// <u,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0>
+  1638465874U,	// <u,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+  1579192580U,	// <u,7,0,5>: Cost 2 vsldoi4 <5,u,7,0>, <5,u,7,0>
+  2712207862U,	// <u,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+  2316137282U,	// <u,7,0,7>: Cost 3 vmrglw <5,6,u,0>, <6,6,7,7>
+  564724381U,	// <u,7,0,u>: Cost 1 vsldoi8 RHS, LHS
+  1189467130U,	// <u,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+  1638466356U,	// <u,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+  1638466454U,	// <u,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0>
+  2311500282U,	// <u,7,1,3>: Cost 3 vmrglw <4,u,u,1>, <6,2,7,3>
+  1189467494U,	// <u,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+  2712208495U,	// <u,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+  2694956302U,	// <u,7,1,6>: Cost 3 vsldoi8 <1,6,u,7>, <1,6,u,7>
+  1189467756U,	// <u,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+  1638466940U,	// <u,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0>
+  2712208829U,	// <u,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+  2712208927U,	// <u,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1>
+  1638467176U,	// <u,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+  1638467238U,	// <u,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+  2712209165U,	// <u,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+  2712209256U,	// <u,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6>
+  1627187175U,	// <u,7,2,6>: Cost 2 vsldoi8 <2,6,u,7>, <2,6,u,7>
+  2324116290U,	// <u,7,2,7>: Cost 3 vmrglw <7,0,u,2>, <6,6,7,7>
+  1628514441U,	// <u,7,2,u>: Cost 2 vsldoi8 <2,u,u,7>, <2,u,u,7>
+  1638467734U,	// <u,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+  2712209638U,	// <u,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+  2700929387U,	// <u,7,3,2>: Cost 3 vsldoi8 <2,6,u,7>, <3,2,6,u>
+  1638467996U,	// <u,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+  1638468098U,	// <u,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+  2712210002U,	// <u,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+  1585189856U,	// <u,7,3,6>: Cost 2 vsldoi4 <6,u,7,3>, <6,u,7,3>
+  1257018178U,	// <u,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+  1638468382U,	// <u,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+  1638468498U,	// <u,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+  2712210378U,	// <u,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+  2712210485U,	// <u,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+  2712210564U,	// <u,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0>
+  1638468816U,	// <u,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+  564727112U,	// <u,7,4,5>: Cost 1 vsldoi8 RHS, RHS
+  2712210809U,	// <u,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2>
+  2712210888U,	// <u,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,0>
+  564727337U,	// <u,7,4,u>: Cost 1 vsldoi8 RHS, RHS
+  1192449018U,	// <u,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+  2714201743U,	// <u,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+  2712211198U,	// <u,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+  2311533050U,	// <u,7,5,3>: Cost 3 vmrglw <4,u,u,5>, <6,2,7,3>
+  1192449382U,	// <u,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+  1638469636U,	// <u,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+  1638469730U,	// <u,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0>
+  1192449644U,	// <u,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+  1638469892U,	// <u,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0>
+  2712211745U,	// <u,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2>
+  2712211879U,	// <u,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1>
+  1638470138U,	// <u,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  2712212018U,	// <u,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5>
+  2712212109U,	// <u,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6>
+  2712212203U,	// <u,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1>
+  1638470456U,	// <u,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6>
+  1638470478U,	// <u,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+  1638470559U,	// <u,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,0,1>
+  1235816546U,	// <u,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0>
+  2309558371U,	// <u,7,7,1>: Cost 3 vmrglw RHS, <5,6,7,1>
+  2641045434U,	// <u,7,7,2>: Cost 3 vsldoi4 <3,u,7,7>, <2,6,3,7>
+  1235816954U,	// <u,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3>
+  1235816550U,	// <u,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4>
+  2309558375U,	// <u,7,7,5>: Cost 3 vmrglw RHS, <5,6,7,5>
+  1585222628U,	// <u,7,7,6>: Cost 2 vsldoi4 <6,u,7,7>, <6,u,7,7>
+  430361910U,	// <u,7,7,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <u,7,7,u>: Cost 1 vspltisw3 RHS
+  1638471379U,	// <u,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2>
+  564729646U,	// <u,7,u,1>: Cost 1 vsldoi8 RHS, LHS
+  1638471557U,	// <u,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0>
+  1638471612U,	// <u,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1>
+  1638471743U,	// <u,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6>
+  564730010U,	// <u,7,u,5>: Cost 1 vsldoi8 RHS, RHS
+  1638471888U,	// <u,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7>
+  430361910U,	// <u,7,u,7>: Cost 1 vspltisw3 RHS
+  564730213U,	// <u,7,u,u>: Cost 1 vsldoi8 RHS, LHS
+  202162278U,	// <u,u,0,0>: Cost 1 vspltisw0 LHS
+  538189985U,	// <u,u,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2685673645U,	// <u,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  1215848604U,	// <u,u,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS
+  1611931986U,	// <u,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  1579266317U,	// <u,u,0,5>: Cost 2 vsldoi4 <5,u,u,0>, <5,u,u,0>
+  2289592861U,	// <u,u,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,u,6>
+  1215851848U,	// <u,u,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS
+  538190493U,	// <u,u,0,u>: Cost 1 vsldoi8 LHS, LHS
+  1549411025U,	// <u,u,1,0>: Cost 2 vsldoi4 <0,u,u,1>, <0,u,u,1>
+  115726126U,	// <u,u,1,1>: Cost 1 vmrghw LHS, LHS
+  604862254U,	// <u,u,1,2>: Cost 1 vsldoi12 LHS, LHS
+  1213866140U,	// <u,u,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS
+  1549413686U,	// <u,u,1,4>: Cost 2 vsldoi4 <0,u,u,1>, RHS
+  115726490U,	// <u,u,1,5>: Cost 1 vmrghw LHS, RHS
+  1585247207U,	// <u,u,1,6>: Cost 2 vsldoi4 <6,u,u,1>, <6,u,u,1>
+  1213869384U,	// <u,u,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS
+  604862308U,	// <u,u,1,u>: Cost 1 vsldoi12 LHS, LHS
+  1567334502U,	// <u,u,2,0>: Cost 2 vsldoi4 <3,u,u,2>, LHS
+  1190180654U,	// <u,u,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS
+  336380006U,	// <u,u,2,2>: Cost 1 vspltisw2 LHS
+  835584U,	// <u,u,2,3>: Cost 0 copy LHS
+  1567337782U,	// <u,u,2,4>: Cost 2 vsldoi4 <3,u,u,2>, RHS
+  1190181018U,	// <u,u,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS
+  1611933626U,	// <u,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  1226485064U,	// <u,u,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS
+  835584U,	// <u,u,2,u>: Cost 0 copy LHS
+  475685587U,	// <u,u,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1209239278U,	// <u,u,3,1>: Cost 2 vmrglw LHS, <2,3,u,1>
+  1209239765U,	// <u,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2>
+  135495836U,	// <u,u,3,3>: Cost 1 vmrglw LHS, LHS
+  475688246U,	// <u,u,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1209239282U,	// <u,u,3,5>: Cost 2 vmrglw LHS, <2,3,u,5>
+  1209240093U,	// <u,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+  135499080U,	// <u,u,3,7>: Cost 1 vmrglw LHS, RHS
+  135495841U,	// <u,u,3,u>: Cost 1 vmrglw LHS, LHS
+  1555406950U,	// <u,u,4,0>: Cost 2 vsldoi4 <1,u,u,4>, LHS
+  1555408301U,	// <u,u,4,1>: Cost 2 vsldoi4 <1,u,u,4>, <1,u,u,4>
+  2289625301U,	// <u,u,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,u,2>
+  1215881372U,	// <u,u,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS
+  229035318U,	// <u,u,4,4>: Cost 1 vspltisw0 RHS
+  538193206U,	// <u,u,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2289625629U,	// <u,u,4,6>: Cost 3 vmrglw <1,2,u,4>, <3,4,u,6>
+  1215884616U,	// <u,u,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS
+  538193449U,	// <u,u,4,u>: Cost 1 vsldoi8 LHS, RHS
+  1549443797U,	// <u,u,5,0>: Cost 2 vsldoi4 <0,u,u,5>, <0,u,u,5>
+  118708014U,	// <u,u,5,1>: Cost 1 vmrghw RHS, LHS
+  1561389191U,	// <u,u,5,2>: Cost 2 vsldoi4 <2,u,u,5>, <2,u,u,5>
+  1213898908U,	// <u,u,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS
+  1549446454U,	// <u,u,5,4>: Cost 2 vsldoi4 <0,u,u,5>, RHS
+  118708378U,	// <u,u,5,5>: Cost 1 vmrghw RHS, RHS
+  604862618U,	// <u,u,5,6>: Cost 1 vsldoi12 LHS, RHS
+  1213902152U,	// <u,u,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS
+  604862636U,	// <u,u,5,u>: Cost 1 vsldoi12 LHS, RHS
+  1567367270U,	// <u,u,6,0>: Cost 2 vsldoi4 <3,u,u,6>, LHS
+  1192892206U,	// <u,u,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS
+  1638478330U,	// <u,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  1679046864U,	// <u,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7>
+  1567370550U,	// <u,u,6,4>: Cost 2 vsldoi4 <3,u,u,6>, RHS
+  1192892570U,	// <u,u,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS
+  363253046U,	// <u,u,6,6>: Cost 1 vspltisw2 RHS
+  27705344U,	// <u,u,6,7>: Cost 0 copy RHS
+  27705344U,	// <u,u,6,u>: Cost 0 copy RHS
+  499605606U,	// <u,u,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1235812425U,	// <u,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1>
+  1561405577U,	// <u,u,7,2>: Cost 2 vsldoi4 <2,u,u,7>, <2,u,u,7>
+  162070684U,	// <u,u,7,3>: Cost 1 vmrglw RHS, LHS
+  499609147U,	// <u,u,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1235812753U,	// <u,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5>
+  1235814941U,	// <u,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6>
+  162073928U,	// <u,u,7,7>: Cost 1 vmrglw RHS, RHS
+  162070689U,	// <u,u,7,u>: Cost 1 vmrglw RHS, LHS
+  475726552U,	// <u,u,u,0>: Cost 1 vsldoi4 LHS, LHS
+  538195758U,	// <u,u,u,1>: Cost 1 vsldoi8 LHS, LHS
+  604862821U,	// <u,u,u,2>: Cost 1 vsldoi12 LHS, LHS
+  835584U,	// <u,u,u,3>: Cost 0 copy LHS
+  475729206U,	// <u,u,u,4>: Cost 1 vsldoi4 LHS, RHS
+  538196122U,	// <u,u,u,5>: Cost 1 vsldoi8 LHS, RHS
+  604862861U,	// <u,u,u,6>: Cost 1 vsldoi12 LHS, RHS
+  27705344U,	// <u,u,u,7>: Cost 0 copy RHS
+  835584U,	// <u,u,u,u>: Cost 0 copy LHS
+  0
+};
diff --git a/lib/Target/PowerPC/PPCPredicates.cpp b/lib/Target/PowerPC/PPCPredicates.cpp
new file mode 100644
index 0000000..ccda5c0
--- /dev/null
+++ b/lib/Target/PowerPC/PPCPredicates.cpp
@@ -0,0 +1,30 @@
+//===-- PPCPredicates.cpp - PPC Branch Predicate Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PowerPC branch predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCPredicates.h"
+#include <cassert>
+using namespace llvm;
+
+PPC::Predicate PPC::InvertPredicate(PPC::Predicate Opcode) {
+  switch (Opcode) {
+  default: assert(0 && "Unknown PPC branch opcode!");
+  case PPC::PRED_EQ: return PPC::PRED_NE;
+  case PPC::PRED_NE: return PPC::PRED_EQ;
+  case PPC::PRED_LT: return PPC::PRED_GE;
+  case PPC::PRED_GE: return PPC::PRED_LT;
+  case PPC::PRED_GT: return PPC::PRED_LE;
+  case PPC::PRED_LE: return PPC::PRED_GT;
+  case PPC::PRED_NU: return PPC::PRED_UN;
+  case PPC::PRED_UN: return PPC::PRED_NU;
+  }
+}
diff --git a/lib/Target/PowerPC/PPCPredicates.h b/lib/Target/PowerPC/PPCPredicates.h
new file mode 100644
index 0000000..ba1bb74
--- /dev/null
+++ b/lib/Target/PowerPC/PPCPredicates.h
@@ -0,0 +1,39 @@
+//===-- PPCPredicates.h - PPC Branch Predicate Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PowerPC branch predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_PPCPREDICATES_H
+#define LLVM_TARGET_POWERPC_PPCPREDICATES_H
+
+#include "PPC.h"
+
+namespace llvm {
+namespace PPC {
+  /// Predicate - These are "(BI << 5) | BO"  for various predicates.
+  enum Predicate {
+    PRED_ALWAYS = (0 << 5) | 20,
+    PRED_LT     = (0 << 5) | 12,
+    PRED_LE     = (1 << 5) |  4,
+    PRED_EQ     = (2 << 5) | 12,
+    PRED_GE     = (0 << 5) |  4,
+    PRED_GT     = (1 << 5) | 12,
+    PRED_NE     = (2 << 5) |  4,
+    PRED_UN     = (3 << 5) | 12,
+    PRED_NU     = (3 << 5) |  4
+  };
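+  // For example, PRED_EQ = (2 << 5) | 12: condition bit 2 is the EQ bit of a
+  // CR field, and BO = 12 means "branch if the condition bit is set", while
+  // PRED_NE pairs the same bit with BO = 4 ("branch if the bit is clear").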
+  
+  /// Invert the specified predicate.  != -> ==, < -> >=.
+  Predicate InvertPredicate(Predicate Opcode);
+}
+}
+
+#endif
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
new file mode 100644
index 0000000..19780a8
--- /dev/null
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -0,0 +1,1153 @@
+//===- PPCRegisterInfo.cpp - PowerPC Register Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reginfo"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCRegisterInfo.h"
+#include "PPCFrameInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/Constants.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include <cstdlib>
+using namespace llvm;
+
+/// getRegisterNumbering - Given the enum value for some register, e.g.
+/// PPC::F14, return the number that it corresponds to (e.g. 14).
+unsigned PPCRegisterInfo::getRegisterNumbering(unsigned RegEnum) {
+  using namespace PPC;
+  switch (RegEnum) {
+  case R0 :  case X0 :  case F0 :  case V0 : case CR0:  return  0;
+  case R1 :  case X1 :  case F1 :  case V1 : case CR1:  return  1;
+  case R2 :  case X2 :  case F2 :  case V2 : case CR2:  return  2;
+  case R3 :  case X3 :  case F3 :  case V3 : case CR3:  return  3;
+  case R4 :  case X4 :  case F4 :  case V4 : case CR4:  return  4;
+  case R5 :  case X5 :  case F5 :  case V5 : case CR5:  return  5;
+  case R6 :  case X6 :  case F6 :  case V6 : case CR6:  return  6;
+  case R7 :  case X7 :  case F7 :  case V7 : case CR7:  return  7;
+  case R8 :  case X8 :  case F8 :  case V8 : return  8;
+  case R9 :  case X9 :  case F9 :  case V9 : return  9;
+  case R10:  case X10:  case F10:  case V10: return 10;
+  case R11:  case X11:  case F11:  case V11: return 11;
+  case R12:  case X12:  case F12:  case V12: return 12;
+  case R13:  case X13:  case F13:  case V13: return 13;
+  case R14:  case X14:  case F14:  case V14: return 14;
+  case R15:  case X15:  case F15:  case V15: return 15;
+  case R16:  case X16:  case F16:  case V16: return 16;
+  case R17:  case X17:  case F17:  case V17: return 17;
+  case R18:  case X18:  case F18:  case V18: return 18;
+  case R19:  case X19:  case F19:  case V19: return 19;
+  case R20:  case X20:  case F20:  case V20: return 20;
+  case R21:  case X21:  case F21:  case V21: return 21;
+  case R22:  case X22:  case F22:  case V22: return 22;
+  case R23:  case X23:  case F23:  case V23: return 23;
+  case R24:  case X24:  case F24:  case V24: return 24;
+  case R25:  case X25:  case F25:  case V25: return 25;
+  case R26:  case X26:  case F26:  case V26: return 26;
+  case R27:  case X27:  case F27:  case V27: return 27;
+  case R28:  case X28:  case F28:  case V28: return 28;
+  case R29:  case X29:  case F29:  case V29: return 29;
+  case R30:  case X30:  case F30:  case V30: return 30;
+  case R31:  case X31:  case F31:  case V31: return 31;
+  default:
+    cerr << "Unhandled reg in PPCRegisterInfo::getRegisterNumbering!\n";
+    abort();
+  }
+}
+
+PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST,
+                                 const TargetInstrInfo &tii)
+  : PPCGenRegisterInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP),
+    Subtarget(ST), TII(tii) {
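+  // ImmToIdxMap maps each D-form (base + 16-bit displacement) memory opcode
+  // to its X-form (base + index register) equivalent; eliminateFrameIndex
+  // uses it when a frame offset does not fit in a signed 16-bit immediate.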
+  ImmToIdxMap[PPC::LD]   = PPC::LDX;    ImmToIdxMap[PPC::STD]  = PPC::STDX;
+  ImmToIdxMap[PPC::LBZ]  = PPC::LBZX;   ImmToIdxMap[PPC::STB]  = PPC::STBX;
+  ImmToIdxMap[PPC::LHZ]  = PPC::LHZX;   ImmToIdxMap[PPC::LHA]  = PPC::LHAX;
+  ImmToIdxMap[PPC::LWZ]  = PPC::LWZX;   ImmToIdxMap[PPC::LWA]  = PPC::LWAX;
+  ImmToIdxMap[PPC::LFS]  = PPC::LFSX;   ImmToIdxMap[PPC::LFD]  = PPC::LFDX;
+  ImmToIdxMap[PPC::STH]  = PPC::STHX;   ImmToIdxMap[PPC::STW]  = PPC::STWX;
+  ImmToIdxMap[PPC::STFS] = PPC::STFSX;  ImmToIdxMap[PPC::STFD] = PPC::STFDX;
+  ImmToIdxMap[PPC::ADDI] = PPC::ADD4;
+  ImmToIdxMap[PPC::ADDI8] = PPC::ADD8;
+}
+
+void
+PPCRegisterInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI,
+                                     unsigned SrcReg, int FrameIdx,
+                                     const TargetRegisterClass *RC) const {
+  if (RC == PPC::GPRCRegisterClass) {
+    if (SrcReg != PPC::LR) {
+      addFrameReference(BuildMI(MBB, MI, TII.get(PPC::STW))
+                        .addReg(SrcReg, false, false, true), FrameIdx);
+    } else {
+      // FIXME: this spills LR immediately to memory in one step.  To do this,
+      // we use R11, which we know cannot be used in the prolog/epilog.  This is
+      // a hack.
+      BuildMI(MBB, MI, TII.get(PPC::MFLR), PPC::R11);
+      addFrameReference(BuildMI(MBB, MI, TII.get(PPC::STW))
+                        .addReg(PPC::R11, false, false, true), FrameIdx);
+    }
+  } else if (RC == PPC::G8RCRegisterClass) {
+    if (SrcReg != PPC::LR8) {
+      addFrameReference(BuildMI(MBB, MI, TII.get(PPC::STD))
+                        .addReg(SrcReg, false, false, true), FrameIdx);
+    } else {
+      // FIXME: this spills LR immediately to memory in one step.  To do this,
+      // we use R11, which we know cannot be used in the prolog/epilog.  This is
+      // a hack.
+      BuildMI(MBB, MI, TII.get(PPC::MFLR8), PPC::X11);
+      addFrameReference(BuildMI(MBB, MI, TII.get(PPC::STD))
+                        .addReg(PPC::X11, false, false, true), FrameIdx);
+    }
+  } else if (RC == PPC::F8RCRegisterClass) {
+    addFrameReference(BuildMI(MBB, MI, TII.get(PPC::STFD))
+                      .addReg(SrcReg, false, false, true), FrameIdx);
+  } else if (RC == PPC::F4RCRegisterClass) {
+    addFrameReference(BuildMI(MBB, MI, TII.get(PPC::STFS))
+                      .addReg(SrcReg, false, false, true), FrameIdx);
+  } else if (RC == PPC::CRRCRegisterClass) {
+    // FIXME: We use R0 here, because it isn't available for RA.
+    // We need to store the CR in the low 4-bits of the saved value.  First,
+    // issue a MFCR to save all of the CRBits.
+    BuildMI(MBB, MI, TII.get(PPC::MFCR), PPC::R0);
+    
+    // If the saved register wasn't CR0, shift the bits left so that they are in
+    // CR0's slot.
+    if (SrcReg != PPC::CR0) {
+      unsigned ShiftBits = PPCRegisterInfo::getRegisterNumbering(SrcReg)*4;
+      // rlwinm r0, r0, ShiftBits, 0, 31.
+      BuildMI(MBB, MI, TII.get(PPC::RLWINM), PPC::R0)
+        .addReg(PPC::R0).addImm(ShiftBits).addImm(0).addImm(31);
+    }
+    
+    addFrameReference(BuildMI(MBB, MI, TII.get(PPC::STW))
+                      .addReg(PPC::R0, false, false, true), FrameIdx);
+  } else if (RC == PPC::VRRCRegisterClass) {
+    // We don't have indexed addressing for vector stores.  Emit:
+    // R0 = ADDI FI#
+    // STVX SrcReg, R0, R0
+    // 
+    // FIXME: We use R0 here, because it isn't available for RA.
+    addFrameReference(BuildMI(MBB, MI, TII.get(PPC::ADDI), PPC::R0),
+                      FrameIdx, 0, 0);
+    BuildMI(MBB, MI, TII.get(PPC::STVX))
+      .addReg(SrcReg, false, false, true).addReg(PPC::R0).addReg(PPC::R0);
+  } else {
+    assert(0 && "Unknown regclass!");
+    abort();
+  }
+}
+
+void
+PPCRegisterInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MI,
+                                      unsigned DestReg, int FrameIdx,
+                                      const TargetRegisterClass *RC) const {
+  if (RC == PPC::GPRCRegisterClass) {
+    if (DestReg != PPC::LR) {
+      addFrameReference(BuildMI(MBB, MI, TII.get(PPC::LWZ), DestReg), FrameIdx);
+    } else {
+      addFrameReference(BuildMI(MBB, MI, TII.get(PPC::LWZ), PPC::R11),FrameIdx);
+      BuildMI(MBB, MI, TII.get(PPC::MTLR)).addReg(PPC::R11);
+    }
+  } else if (RC == PPC::G8RCRegisterClass) {
+    if (DestReg != PPC::LR8) {
+      addFrameReference(BuildMI(MBB, MI, TII.get(PPC::LD), DestReg), FrameIdx);
+    } else {
+      addFrameReference(BuildMI(MBB, MI, TII.get(PPC::LD), PPC::X11), FrameIdx);
+      BuildMI(MBB, MI, TII.get(PPC::MTLR8)).addReg(PPC::X11);
+    }
+  } else if (RC == PPC::F8RCRegisterClass) {
+    addFrameReference(BuildMI(MBB, MI, TII.get(PPC::LFD), DestReg), FrameIdx);
+  } else if (RC == PPC::F4RCRegisterClass) {
+    addFrameReference(BuildMI(MBB, MI, TII.get(PPC::LFS), DestReg), FrameIdx);
+  } else if (RC == PPC::CRRCRegisterClass) {
+    // FIXME: We use R0 here, because it isn't available for RA.
+    addFrameReference(BuildMI(MBB, MI, TII.get(PPC::LWZ), PPC::R0), FrameIdx);
+    
+    // If the reloaded register isn't CR0, shift the bits right so that they are
+    // in the right CR's slot.
+    if (DestReg != PPC::CR0) {
+      unsigned ShiftBits = PPCRegisterInfo::getRegisterNumbering(DestReg)*4;
+      // rlwinm r0, r0, 32-ShiftBits, 0, 31.
+      BuildMI(MBB, MI, TII.get(PPC::RLWINM), PPC::R0)
+        .addReg(PPC::R0).addImm(32-ShiftBits).addImm(0).addImm(31);
+    }
+    
+    BuildMI(MBB, MI, TII.get(PPC::MTCRF), DestReg).addReg(PPC::R0);
+  } else if (RC == PPC::VRRCRegisterClass) {
+    // We don't have indexed addressing for vector loads.  Emit:
+    // R0 = ADDI FI#
+    // Dest = LVX R0, R0
+    // 
+    // FIXME: We use R0 here, because it isn't available for RA.
+    addFrameReference(BuildMI(MBB, MI, TII.get(PPC::ADDI), PPC::R0),
+                      FrameIdx, 0, 0);
+    BuildMI(MBB, MI, TII.get(PPC::LVX),DestReg).addReg(PPC::R0).addReg(PPC::R0);
+  } else {
+    assert(0 && "Unknown regclass!");
+    abort();
+  }
+}
+
+void PPCRegisterInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned DestReg, unsigned SrcReg,
+                                   const TargetRegisterClass *RC) const {
+  if (RC == PPC::GPRCRegisterClass) {
+    BuildMI(MBB, MI, TII.get(PPC::OR), DestReg).addReg(SrcReg).addReg(SrcReg);
+  } else if (RC == PPC::G8RCRegisterClass) {
+    BuildMI(MBB, MI, TII.get(PPC::OR8), DestReg).addReg(SrcReg).addReg(SrcReg);
+  } else if (RC == PPC::F4RCRegisterClass) {
+    BuildMI(MBB, MI, TII.get(PPC::FMRS), DestReg).addReg(SrcReg);
+  } else if (RC == PPC::F8RCRegisterClass) {
+    BuildMI(MBB, MI, TII.get(PPC::FMRD), DestReg).addReg(SrcReg);
+  } else if (RC == PPC::CRRCRegisterClass) {
+    BuildMI(MBB, MI, TII.get(PPC::MCRF), DestReg).addReg(SrcReg);
+  } else if (RC == PPC::VRRCRegisterClass) {
+    BuildMI(MBB, MI, TII.get(PPC::VOR), DestReg).addReg(SrcReg).addReg(SrcReg);
+  } else {
+    cerr << "Attempt to copy register that is not GPR or FPR";
+    abort();
+  }
+}
+
+void PPCRegisterInfo::reMaterialize(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    unsigned DestReg,
+                                    const MachineInstr *Orig) const {
+  MachineInstr *MI = Orig->clone();
+  MI->getOperand(0).setReg(DestReg);
+  MBB.insert(I, MI);
+}
+
+const unsigned* PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+                                                                         const {
+  // 32-bit Darwin calling convention. 
+  static const unsigned Macho32_CalleeSavedRegs[] = {
+              PPC::R13, PPC::R14, PPC::R15,
+    PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+    PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+    PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+    PPC::R28, PPC::R29, PPC::R30, PPC::R31,
+
+    PPC::F14, PPC::F15, PPC::F16, PPC::F17,
+    PPC::F18, PPC::F19, PPC::F20, PPC::F21,
+    PPC::F22, PPC::F23, PPC::F24, PPC::F25,
+    PPC::F26, PPC::F27, PPC::F28, PPC::F29,
+    PPC::F30, PPC::F31,
+    
+    PPC::CR2, PPC::CR3, PPC::CR4,
+    PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+    PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+    PPC::V28, PPC::V29, PPC::V30, PPC::V31,
+    
+    PPC::LR,  0
+  };
+  
+  static const unsigned ELF32_CalleeSavedRegs[] = {
+              PPC::R13, PPC::R14, PPC::R15,
+    PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+    PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+    PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+    PPC::R28, PPC::R29, PPC::R30, PPC::R31,
+
+                                  PPC::F9,
+    PPC::F10, PPC::F11, PPC::F12, PPC::F13,
+    PPC::F14, PPC::F15, PPC::F16, PPC::F17,
+    PPC::F18, PPC::F19, PPC::F20, PPC::F21,
+    PPC::F22, PPC::F23, PPC::F24, PPC::F25,
+    PPC::F26, PPC::F27, PPC::F28, PPC::F29,
+    PPC::F30, PPC::F31,
+    
+    PPC::CR2, PPC::CR3, PPC::CR4,
+    PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+    PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+    PPC::V28, PPC::V29, PPC::V30, PPC::V31,
+    
+    PPC::LR,  0
+  };
+  // 64-bit Darwin calling convention. 
+  static const unsigned Macho64_CalleeSavedRegs[] = {
+    PPC::X14, PPC::X15,
+    PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+    PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+    PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+    PPC::X28, PPC::X29, PPC::X30, PPC::X31,
+    
+    PPC::F14, PPC::F15, PPC::F16, PPC::F17,
+    PPC::F18, PPC::F19, PPC::F20, PPC::F21,
+    PPC::F22, PPC::F23, PPC::F24, PPC::F25,
+    PPC::F26, PPC::F27, PPC::F28, PPC::F29,
+    PPC::F30, PPC::F31,
+    
+    PPC::CR2, PPC::CR3, PPC::CR4,
+    PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+    PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+    PPC::V28, PPC::V29, PPC::V30, PPC::V31,
+    
+    PPC::LR8,  0
+  };
+  
+  if (Subtarget.isMachoABI())
+    return Subtarget.isPPC64() ? Macho64_CalleeSavedRegs :
+                                 Macho32_CalleeSavedRegs;
+  
+  // ELF 32.
+  return ELF32_CalleeSavedRegs;
+}
+
+const TargetRegisterClass* const*
+PPCRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  // 32-bit Macho calling convention. 
+  static const TargetRegisterClass * const Macho32_CalleeSavedRegClasses[] = {
+                       &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+    &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+    &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+    &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+    &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    
+    &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass,
+    
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    
+    &PPC::GPRCRegClass, 0
+  };
+  
+  static const TargetRegisterClass * const ELF32_CalleeSavedRegClasses[] = {
+                       &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+    &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+    &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+    &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+    &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+
+                                                             &PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    
+    &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass,
+    
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    
+    &PPC::GPRCRegClass, 0
+  };
+  
+  // 64-bit Macho calling convention. 
+  static const TargetRegisterClass * const Macho64_CalleeSavedRegClasses[] = {
+    &PPC::G8RCRegClass,&PPC::G8RCRegClass,
+    &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+    &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+    &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+    &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+    
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    
+    &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass,
+    
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    
+    &PPC::G8RCRegClass, 0
+  };
+  
+  if (Subtarget.isMachoABI())
+    return Subtarget.isPPC64() ? Macho64_CalleeSavedRegClasses :
+                                 Macho32_CalleeSavedRegClasses;
+  
+  // ELF 32.
+  return ELF32_CalleeSavedRegClasses;
+}
+
+// needsFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+//
+static bool needsFP(const MachineFunction &MF) {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return NoFramePointerElim || MFI->hasVarSizedObjects();
+}
+
+BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
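+  // R0 (used as a scratch register by the spill and frame lowering code in
+  // this file), the stack pointer (R1), and LR are never allocatable.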
+  Reserved.set(PPC::R0);
+  Reserved.set(PPC::R1);
+  Reserved.set(PPC::LR);
+  // In Linux, r2 is reserved for the OS.
+  if (!Subtarget.isDarwin())
+    Reserved.set(PPC::R2);
+  // On PPC64, r13 is the thread pointer.  Never allocate this register.
+  // Note that this is overconservative, as it also prevents allocation of
+  // R31 when the FP is not needed.
+  if (Subtarget.isPPC64()) {
+    Reserved.set(PPC::R13);
+    Reserved.set(PPC::R31);
+  }
+  if (needsFP(MF))
+    Reserved.set(PPC::R31);
+  return Reserved;
+}
+
+/// foldMemoryOperand - PowerPC (like most RISCs) can only fold spills into
+/// copy instructions, turning them into load/store instructions.
+MachineInstr *PPCRegisterInfo::foldMemoryOperand(MachineInstr *MI,
+                                                 unsigned OpNum,
+                                                 int FrameIndex) const {
+  // Make sure this is a reg-reg copy.  Note that we can't handle MCRF, because
+  // it takes more than one instruction to store it.
+  unsigned Opc = MI->getOpcode();
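+  // The copy idioms recognized below are OR/OR8 with identical register
+  // operands (integer register moves) and FMRS/FMRD (floating-point moves).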
+
+  MachineInstr *NewMI = NULL;
+  if ((Opc == PPC::OR &&
+       MI->getOperand(1).getReg() == MI->getOperand(2).getReg())) {
+    if (OpNum == 0) {  // move -> store
+      unsigned InReg = MI->getOperand(1).getReg();
+      NewMI = addFrameReference(BuildMI(TII.get(PPC::STW)).addReg(InReg),
+                                FrameIndex);
+    } else {           // move -> load
+      unsigned OutReg = MI->getOperand(0).getReg();
+      NewMI = addFrameReference(BuildMI(TII.get(PPC::LWZ), OutReg),
+                                FrameIndex);
+    }
+  } else if ((Opc == PPC::OR8 &&
+              MI->getOperand(1).getReg() == MI->getOperand(2).getReg())) {
+    if (OpNum == 0) {  // move -> store
+      unsigned InReg = MI->getOperand(1).getReg();
+      NewMI = addFrameReference(BuildMI(TII.get(PPC::STD)).addReg(InReg),
+                                FrameIndex);
+    } else {           // move -> load
+      unsigned OutReg = MI->getOperand(0).getReg();
+      NewMI = addFrameReference(BuildMI(TII.get(PPC::LD), OutReg), FrameIndex);
+    }
+  } else if (Opc == PPC::FMRD) {
+    if (OpNum == 0) {  // move -> store
+      unsigned InReg = MI->getOperand(1).getReg();
+      NewMI = addFrameReference(BuildMI(TII.get(PPC::STFD)).addReg(InReg),
+                                FrameIndex);
+    } else {           // move -> load
+      unsigned OutReg = MI->getOperand(0).getReg();
+      NewMI = addFrameReference(BuildMI(TII.get(PPC::LFD), OutReg), FrameIndex);
+    }
+  } else if (Opc == PPC::FMRS) {
+    if (OpNum == 0) {  // move -> store
+      unsigned InReg = MI->getOperand(1).getReg();
+      NewMI = addFrameReference(BuildMI(TII.get(PPC::STFS)).addReg(InReg),
+                                FrameIndex);
+    } else {           // move -> load
+      unsigned OutReg = MI->getOperand(0).getReg();
+      NewMI = addFrameReference(BuildMI(TII.get(PPC::LFS), OutReg), FrameIndex);
+    }
+  }
+
+  if (NewMI)
+    NewMI->copyKillDeadInfo(MI);
+  return NewMI;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+// hasFP - Return true if the specified function actually has a dedicated frame
+// pointer register.  This is true if the function needs a frame pointer and has
+// a non-zero stack size.
+bool PPCRegisterInfo::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return MFI->getStackSize() && needsFP(MF);
+}
+
+/// usesLR - Returns true if the link register (LR) has been used in the function.
+///
+bool PPCRegisterInfo::usesLR(MachineFunction &MF) const {
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  return FI->usesLR();
+}
+
+void PPCRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+  MBB.erase(I);
+}
+
+/// lowerDynamicAlloc - Generate the code for allocating an object in the
+/// current frame.  The sequence of code will be in the general form
+///
+///   addi   R0, SP, #frameSize ; get the address of the previous frame
+///   stwux  R0, SP, Rnegsize   ; add and update the SP with the negated size
+///   addi   Rnew, SP, #maxCallFrameSize ; get the top of the allocation
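+///   (On 64-bit subtargets the analogous ld / stdux / addi8 forms are used.)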
+///
+void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  // Get the basic block's function.
+  MachineFunction &MF = *MBB.getParent();
+  // Get the frame info.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Determine whether 64-bit pointers are used.
+  bool LP64 = Subtarget.isPPC64();
+
+  // Get the maximum call stack size.
+  unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+  // Get the total frame size.
+  unsigned FrameSize = MFI->getStackSize();
+  
+  // Get stack alignments.
+  unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+  unsigned MaxAlign = MFI->getMaxAlignment();
+  assert(MaxAlign <= TargetAlign &&
+         "Dynamic alloca with large aligns not supported");
+
+  // Determine the previous frame's address.  If FrameSize can't be
+  // represented as 16 bits or we need special alignment, then we load the
+  // previous frame's address from 0(SP).  Why not do an addis of the hi? 
+  // Because R0 is our only safe tmp register and addi/addis treat R0 as zero. 
+  // Constructing the constant and adding would take 3 instructions. 
+  // Fortunately, a frame greater than 32K is rare.
+  if (MaxAlign < TargetAlign && isInt16(FrameSize)) {
+    BuildMI(MBB, II, TII.get(PPC::ADDI), PPC::R0)
+      .addReg(PPC::R31)
+      .addImm(FrameSize);
+  } else if (LP64) {
+    BuildMI(MBB, II, TII.get(PPC::LD), PPC::X0)
+      .addImm(0)
+      .addReg(PPC::X1);
+  } else {
+    BuildMI(MBB, II, TII.get(PPC::LWZ), PPC::R0)
+      .addImm(0)
+      .addReg(PPC::R1);
+  }
+  
+  // Grow the stack and update the stack pointer link, then
+  // determine the address of the newly allocated space.
+  if (LP64) {
+    BuildMI(MBB, II, TII.get(PPC::STDUX))
+      .addReg(PPC::X0)
+      .addReg(PPC::X1)
+      .addReg(MI.getOperand(1).getReg());
+    BuildMI(MBB, II, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
+      .addReg(PPC::X1)
+      .addImm(maxCallFrameSize);
+  } else {
+    BuildMI(MBB, II, TII.get(PPC::STWUX))
+      .addReg(PPC::R0)
+      .addReg(PPC::R1)
+      .addReg(MI.getOperand(1).getReg());
+    BuildMI(MBB, II, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
+      .addReg(PPC::R1)
+      .addImm(maxCallFrameSize);
+  }
+  
+  // Discard the DYNALLOC instruction.
+  MBB.erase(II);
+}
+
+void PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                          int SPAdj, RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  // Get the instruction.
+  MachineInstr &MI = *II;
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  // Get the basic block's function.
+  MachineFunction &MF = *MBB.getParent();
+  // Get the frame info.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Find out which operand is the frame index.
+  unsigned i = 0;
+  while (!MI.getOperand(i).isFrameIndex()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+  // Take into account whether it's an add or mem instruction
+  unsigned OffIdx = (i == 2) ? 1 : 2;
+  if (MI.getOpcode() == TargetInstrInfo::INLINEASM)
+    OffIdx = i-1;
+      
+  // Get the frame index.
+  int FrameIndex = MI.getOperand(i).getFrameIndex();
+  
+  // Get the frame pointer save index.  Users of this index are primarily
+  // DYNALLOC instructions.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  int FPSI = FI->getFramePointerSaveIndex();
+  // Get the instruction opcode.
+  unsigned OpC = MI.getOpcode();
+  
+  // Special case for dynamic alloca.
+  if (FPSI && FrameIndex == FPSI &&
+      (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) {
+    lowerDynamicAlloc(II);
+    return;
+  }
+
+  // Replace the FrameIndex with the base register: GPR1 (SP) or GPR31 (FP).
+  MI.getOperand(i).ChangeToRegister(hasFP(MF) ? PPC::R31 : PPC::R1, false);
+
+  // Figure out if the offset in the instruction is shifted right two bits. This
+  // is true for instructions like "STD", which the machine implicitly adds two
+  // low zeros to.
+  bool isIXAddr = false;
+  switch (OpC) {
+  case PPC::LWA:
+  case PPC::LD:
+  case PPC::STD:
+  case PPC::STD_32:
+    isIXAddr = true;
+    break;
+  }
+  
+  // Now add the frame object offset to the offset from r1.
+  int Offset = MFI->getObjectOffset(FrameIndex);
+  
+  if (!isIXAddr)
+    Offset += MI.getOperand(OffIdx).getImmedValue();
+  else
+    Offset += MI.getOperand(OffIdx).getImmedValue() << 2;
+
+  // Object offsets from MachineFrameInfo are relative to the incoming stack
+  // pointer (the top of the frame), while the base register chosen above
+  // points at the adjusted stack pointer, so add the stack size to convert.
+  Offset += MFI->getStackSize();
+
+  if (!isInt16(Offset)) {
+    // Insert a set of r0 with the full offset value before the ld, st, or add
+    BuildMI(MBB, II, TII.get(PPC::LIS), PPC::R0).addImm(Offset >> 16);
+    BuildMI(MBB, II, TII.get(PPC::ORI), PPC::R0).addReg(PPC::R0).addImm(Offset);
+    
+    // convert into indexed form of the instruction
+    // sth  0:rA, 1:imm, 2:(rB) ==> sthx 0:rA, 2:rB, 1:r0
+    // addi 0:rA, 1:rB, 2:imm   ==> add  0:rA, 1:rB, 2:r0
+    assert(ImmToIdxMap.count(OpC) &&
+           "No indexed form of load or store available!");
+    unsigned NewOpcode = ImmToIdxMap.find(OpC)->second;
+    MI.setInstrDescriptor(TII.get(NewOpcode));
+    MI.getOperand(1).ChangeToRegister(MI.getOperand(i).getReg(), false);
+    MI.getOperand(2).ChangeToRegister(PPC::R0, false);
+  } else {
+    if (isIXAddr) {
+      assert((Offset & 3) == 0 && "Invalid frame offset!");
+      Offset >>= 2;    // The actual encoded value has the low two bits zero.
+    }
+    MI.getOperand(OffIdx).ChangeToImmediate(Offset);
+  }
+}
+
+/// VRRegNo - Map from a numbered VR register to its enum value.
+///
+static const unsigned short VRRegNo[] = {
+ PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 ,
+ PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+ PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+
+/// RemoveVRSaveCode - We have found that this function does not need any code
+/// to manipulate the VRSAVE register, even though it uses vector registers.
+/// This can happen when the only registers used are known to be live in or out
+/// of the function.  Remove all of the VRSAVE related code from the function.
+static void RemoveVRSaveCode(MachineInstr *MI) {
+  MachineBasicBlock *Entry = MI->getParent();
+  MachineFunction *MF = Entry->getParent();
+
+  // We know that the MTVRSAVE instruction immediately follows MI.  Remove it.
+  MachineBasicBlock::iterator MBBI = MI;
+  ++MBBI;
+  assert(MBBI != Entry->end() && MBBI->getOpcode() == PPC::MTVRSAVE);
+  MBBI->eraseFromParent();
+  
+  bool RemovedAllMTVRSAVEs = true;
+  // See if we can find and remove the MTVRSAVE instruction from all of the
+  // epilog blocks.
+  const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) {
+    // If the last instruction is a return, this is an epilog block; look for
+    // the MTVRSAVE instruction in it and remove it.
+    if (!I->empty() && TII.isReturn(I->back().getOpcode())) {
+      bool FoundIt = false;
+      for (MBBI = I->end(); MBBI != I->begin(); ) {
+        --MBBI;
+        if (MBBI->getOpcode() == PPC::MTVRSAVE) {
+          MBBI->eraseFromParent();  // remove it.
+          FoundIt = true;
+          break;
+        }
+      }
+      RemovedAllMTVRSAVEs &= FoundIt;
+    }
+  }
+
+  // If we found and removed all MTVRSAVE instructions, remove the read of
+  // VRSAVE as well.
+  if (RemovedAllMTVRSAVEs) {
+    MBBI = MI;
+    assert(MBBI != Entry->begin() && "UPDATE_VRSAVE is first instr in block?");
+    --MBBI;
+    assert(MBBI->getOpcode() == PPC::MFVRSAVE && "VRSAVE instrs wandered?");
+    MBBI->eraseFromParent();
+  }
+  
+  // Finally, nuke the UPDATE_VRSAVE.
+  MI->eraseFromParent();
+}
+
+// HandleVRSaveUpdate - MI is the UPDATE_VRSAVE instruction introduced by the
+// instruction selector.  Based on the vector registers that have been used,
+// transform this into the appropriate ORI instruction.
+static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
+  MachineFunction *MF = MI->getParent()->getParent();
+
+  unsigned UsedRegMask = 0;
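+  // Bit (31-i) of UsedRegMask corresponds to vector register Vi, so V0 is
+  // tracked in the most significant bit, matching VRSAVE's bit numbering.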
+  for (unsigned i = 0; i != 32; ++i)
+    if (MF->isPhysRegUsed(VRRegNo[i]))
+      UsedRegMask |= 1 << (31-i);
+  
+  // Live in and live out values already must be in the mask, so don't bother
+  // marking them.
+  for (MachineFunction::livein_iterator I = 
+       MF->livein_begin(), E = MF->livein_end(); I != E; ++I) {
+    unsigned RegNo = PPCRegisterInfo::getRegisterNumbering(I->first);
+    if (VRRegNo[RegNo] == I->first)        // If this really is a vector reg.
+      UsedRegMask &= ~(1 << (31-RegNo));   // Doesn't need to be marked.
+  }
+  for (MachineFunction::liveout_iterator I = 
+       MF->liveout_begin(), E = MF->liveout_end(); I != E; ++I) {
+    unsigned RegNo = PPCRegisterInfo::getRegisterNumbering(*I);
+    if (VRRegNo[RegNo] == *I)              // If this really is a vector reg.
+      UsedRegMask &= ~(1 << (31-RegNo));   // Doesn't need to be marked.
+  }
+  
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  unsigned DstReg = MI->getOperand(0).getReg();
+  // If no registers are used, turn this into a copy.
+  if (UsedRegMask == 0) {
+    // Remove all VRSAVE code.
+    RemoveVRSaveCode(MI);
+    return;
+  } else if ((UsedRegMask & 0xFFFF) == UsedRegMask) {
+    BuildMI(*MI->getParent(), MI, TII.get(PPC::ORI), DstReg)
+        .addReg(SrcReg).addImm(UsedRegMask);
+  } else if ((UsedRegMask & 0xFFFF0000) == UsedRegMask) {
+    BuildMI(*MI->getParent(), MI, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg).addImm(UsedRegMask >> 16);
+  } else {
+    BuildMI(*MI->getParent(), MI, TII.get(PPC::ORIS), DstReg)
+       .addReg(SrcReg).addImm(UsedRegMask >> 16);
+    BuildMI(*MI->getParent(), MI, TII.get(PPC::ORI), DstReg)
+      .addReg(DstReg).addImm(UsedRegMask & 0xFFFF);
+  }
+  
+  // Remove the old UPDATE_VRSAVE instruction.
+  MI->eraseFromParent();
+}
+
+/// determineFrameLayout - Determine the size of the frame and maximum call
+/// frame size.
+void PPCRegisterInfo::determineFrameLayout(MachineFunction &MF) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Get the number of bytes to allocate from the FrameInfo
+  unsigned FrameSize = MFI->getStackSize();
+  
+  // Get the alignments provided by the target, and the maximum alignment
+  // (if any) of the fixed frame objects.
+  unsigned MaxAlign = MFI->getMaxAlignment();
+  unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+  unsigned AlignMask = TargetAlign - 1;  // For rounding up to TargetAlign.
+
+  // If this is a leaf function that uses at most 224 bytes of stack space,
+  // has no calls, no dynamic allocas, and no objects with extra alignment
+  // requirements, we do not need to adjust the stack pointer (we fit in the
+  // Red Zone).
+  if (FrameSize <= 224 &&             // Fits in red zone.
+      !MFI->hasVarSizedObjects() &&   // No dynamic alloca.
+      !MFI->hasCalls() &&             // No calls.
+      MaxAlign <= TargetAlign) {      // No special alignment.
+    // No need for frame
+    MFI->setStackSize(0);
+    return;
+  }
+  
+  // Get the maximum call frame size of all the calls.
+  unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+  
+  // Maximum call frame needs to be at least big enough for linkage and 8 args.
+  unsigned minCallFrameSize =
+    PPCFrameInfo::getMinCallFrameSize(Subtarget.isPPC64(), 
+                                      Subtarget.isMachoABI());
+  maxCallFrameSize = std::max(maxCallFrameSize, minCallFrameSize);
+
+  // If we have dynamic alloca then maxCallFrameSize needs to be aligned so
+  // that allocations will be aligned.
+  if (MFI->hasVarSizedObjects())
+    maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask;
+  
+  // Update maximum call frame size.
+  MFI->setMaxCallFrameSize(maxCallFrameSize);
+  
+  // Include call frame size in total.
+  FrameSize += maxCallFrameSize;
+  
+  // Make sure the frame is aligned.
+  FrameSize = (FrameSize + AlignMask) & ~AlignMask;
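+  // For example, with a 16-byte TargetAlign (AlignMask == 15), a 228-byte
+  // frame rounds up to 240: (228 + 15) & ~15 == 240.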
+
+  // Update frame info.
+  MFI->setStackSize(FrameSize);
+}
+
+void PPCRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                           RegScavenger *RS)
+  const {
+  //  Save and clear the LR state.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  unsigned LR = getRARegister();
+  FI->setUsesLR(MF.isPhysRegUsed(LR));
+  MF.setPhysRegUnused(LR);
+
+  //  Save R31 if necessary
+  int FPSI = FI->getFramePointerSaveIndex();
+  bool IsPPC64 = Subtarget.isPPC64();
+  bool IsELF32_ABI = Subtarget.isELF32_ABI();
+  bool IsMachoABI  = Subtarget.isMachoABI();
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+ 
+  // If the frame pointer save index hasn't been defined yet, allocate it now.
+  if (!FPSI && (NoFramePointerElim || MFI->hasVarSizedObjects()) &&
+      IsELF32_ABI) {
+    // Find out the fixed offset of the frame pointer save area.
+    int FPOffset = PPCFrameInfo::getFramePointerSaveOffset(IsPPC64,
+                                                           IsMachoABI);
+    // Allocate the frame index for frame pointer save area.
+    FPSI = MF.getFrameInfo()->CreateFixedObject(IsPPC64? 8 : 4, FPOffset);
+    // Save the result.
+    FI->setFramePointerSaveIndex(FPSI);                      
+  }
+
+}
+
+void PPCRegisterInfo::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+  
+  // Prepare for frame info.
+  unsigned FrameLabelId = 0;
+  
+  // Scan the prolog, looking for an UPDATE_VRSAVE instruction.  If we find it,
+  // process it.
+  for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) {
+    if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) {
+      HandleVRSaveUpdate(MBBI, TII);
+      break;
+    }
+  }
+  
+  // Move MBBI back to the beginning of the entry block.
+  MBBI = MBB.begin();
+  
+  // Work out frame sizes.
+  determineFrameLayout(MF);
+  unsigned FrameSize = MFI->getStackSize();
+  
+  int NegFrameSize = -FrameSize;
+  
+  // Get processor type.
+  bool IsPPC64 = Subtarget.isPPC64();
+  // Get operating system
+  bool IsMachoABI = Subtarget.isMachoABI();
+  // Check if the link register (LR) has been used.
+  bool UsesLR = MFI->hasCalls() || usesLR(MF);
+  // Do we have a frame pointer for this function?
+  bool HasFP = hasFP(MF) && FrameSize;
+  
+  int LROffset = PPCFrameInfo::getReturnSaveOffset(IsPPC64, IsMachoABI);
+  int FPOffset = PPCFrameInfo::getFramePointerSaveOffset(IsPPC64, IsMachoABI);
+  
+  if (IsPPC64) {
+    if (UsesLR)
+      BuildMI(MBB, MBBI, TII.get(PPC::MFLR8), PPC::X0);
+      
+    if (HasFP)
+      BuildMI(MBB, MBBI, TII.get(PPC::STD))
+         .addReg(PPC::X31).addImm(FPOffset/4).addReg(PPC::X1);
+    
+    if (UsesLR)
+      BuildMI(MBB, MBBI, TII.get(PPC::STD))
+         .addReg(PPC::X0).addImm(LROffset/4).addReg(PPC::X1);
+  } else {
+    if (UsesLR)
+      BuildMI(MBB, MBBI, TII.get(PPC::MFLR), PPC::R0);
+      
+    if (HasFP)
+      BuildMI(MBB, MBBI, TII.get(PPC::STW))
+        .addReg(PPC::R31).addImm(FPOffset).addReg(PPC::R1);
+
+    if (UsesLR)
+      BuildMI(MBB, MBBI, TII.get(PPC::STW))
+        .addReg(PPC::R0).addImm(LROffset).addReg(PPC::R1);
+  }
+  
+  // Skip if a leaf routine.
+  if (!FrameSize) return;
+  
+  // Get stack alignments.
+  unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+  unsigned MaxAlign = MFI->getMaxAlignment();
+
+  if (MMI && MMI->needsFrameInfo()) {
+    // Mark effective beginning of when frame pointer becomes valid.
+    FrameLabelId = MMI->NextLabelID();
+    BuildMI(MBB, MBBI, TII.get(PPC::LABEL)).addImm(FrameLabelId);
+  }
+  
+  // Adjust stack pointer: r1 += NegFrameSize.
+  // If there is a preferred stack alignment, align R1 now
+  if (!IsPPC64) {
+    // PPC32.
+    if (MaxAlign > TargetAlign) {
+      assert(isPowerOf2_32(MaxAlign)&&isInt16(MaxAlign)&&"Invalid alignment!");
+      assert(isInt16(NegFrameSize) && "Unhandled stack size and alignment!");
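+      // Explanatory sketch of the three instructions below: rlwinm puts
+      // (r1 % MaxAlign) into r0, subfic turns that into NegFrameSize minus the
+      // remainder, and stwux stores the back chain while decrementing r1 by
+      // that combined amount, so the new r1 ends up FrameSize bytes below r1
+      // rounded down to a MaxAlign boundary.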
+      BuildMI(MBB, MBBI, TII.get(PPC::RLWINM), PPC::R0)
+        .addReg(PPC::R1).addImm(0).addImm(32-Log2_32(MaxAlign)).addImm(31);
+      BuildMI(MBB, MBBI, TII.get(PPC::SUBFIC), PPC::R0).addReg(PPC::R0)
+        .addImm(NegFrameSize);
+      BuildMI(MBB, MBBI, TII.get(PPC::STWUX))
+        .addReg(PPC::R1).addReg(PPC::R1).addReg(PPC::R0);
+    } else if (isInt16(NegFrameSize)) {
+      BuildMI(MBB, MBBI, TII.get(PPC::STWU),
+              PPC::R1).addReg(PPC::R1).addImm(NegFrameSize).addReg(PPC::R1);
+    } else {
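+      // Frame too large for a 16-bit signed displacement: materialize the
+      // negative frame size in r0 with lis/ori, then use the indexed
+      // store-with-update.  For example (hypothetical size), a 65600-byte
+      // frame gives NegFrameSize = 0xFFFEFFC0, i.e. "lis r0, 0xfffe" followed
+      // by "ori r0, r0, 0xffc0".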
+      BuildMI(MBB, MBBI, TII.get(PPC::LIS), PPC::R0).addImm(NegFrameSize >> 16);
+      BuildMI(MBB, MBBI, TII.get(PPC::ORI), PPC::R0).addReg(PPC::R0)
+        .addImm(NegFrameSize & 0xFFFF);
+      BuildMI(MBB, MBBI, TII.get(PPC::STWUX)).addReg(PPC::R1).addReg(PPC::R1)
+        .addReg(PPC::R0);
+    }
+  } else {    // PPC64.
+    if (MaxAlign > TargetAlign) {
+      assert(isPowerOf2_32(MaxAlign)&&isInt16(MaxAlign)&&"Invalid alignment!");
+      assert(isInt16(NegFrameSize) && "Unhandled stack size and alignment!");
+      BuildMI(MBB, MBBI, TII.get(PPC::RLDICL), PPC::X0)
+        .addReg(PPC::X1).addImm(0).addImm(64-Log2_32(MaxAlign));
+      BuildMI(MBB, MBBI, TII.get(PPC::SUBFIC8), PPC::X0).addReg(PPC::X0)
+        .addImm(NegFrameSize);
+      BuildMI(MBB, MBBI, TII.get(PPC::STDUX))
+        .addReg(PPC::X1).addReg(PPC::X1).addReg(PPC::X0);
+    } else if (isInt16(NegFrameSize)) {
+      BuildMI(MBB, MBBI, TII.get(PPC::STDU), PPC::X1)
+             .addReg(PPC::X1).addImm(NegFrameSize/4).addReg(PPC::X1);
+    } else {
+      BuildMI(MBB, MBBI, TII.get(PPC::LIS8), PPC::X0).addImm(NegFrameSize >>16);
+      BuildMI(MBB, MBBI, TII.get(PPC::ORI8), PPC::X0).addReg(PPC::X0)
+        .addImm(NegFrameSize & 0xFFFF);
+      BuildMI(MBB, MBBI, TII.get(PPC::STDUX)).addReg(PPC::X1).addReg(PPC::X1)
+        .addReg(PPC::X0);
+    }
+  }
+  
+  if (MMI && MMI->needsFrameInfo()) {
+    std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+    
+    if (NegFrameSize) {
+      // Show update of SP.
+      MachineLocation SPDst(MachineLocation::VirtualFP);
+      MachineLocation SPSrc(MachineLocation::VirtualFP, NegFrameSize);
+      Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+    } else {
+      MachineLocation SP(IsPPC64 ? PPC::X31 : PPC::R31);
+      Moves.push_back(MachineMove(FrameLabelId, SP, SP));
+    }
+    
+    if (HasFP) {
+      MachineLocation FPDst(MachineLocation::VirtualFP, FPOffset);
+      MachineLocation FPSrc(IsPPC64 ? PPC::X31 : PPC::R31);
+      Moves.push_back(MachineMove(FrameLabelId, FPDst, FPSrc));
+    }
+
+    // Add callee saved registers to move list.
+    const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+    for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+      int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
+      unsigned Reg = CSI[I].getReg();
+      if (Reg == PPC::LR || Reg == PPC::LR8) continue;
+      MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+      MachineLocation CSSrc(Reg);
+      Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc));
+    }
+    
+    MachineLocation LRDst(MachineLocation::VirtualFP, LROffset);
+    MachineLocation LRSrc(IsPPC64 ? PPC::LR8 : PPC::LR);
+    Moves.push_back(MachineMove(FrameLabelId, LRDst, LRSrc));
+    
+    // Mark effective beginning of when frame pointer is ready.
+    unsigned ReadyLabelId = MMI->NextLabelID();
+    BuildMI(MBB, MBBI, TII.get(PPC::LABEL)).addImm(ReadyLabelId);
+    
+    MachineLocation FPDst(HasFP ? (IsPPC64 ? PPC::X31 : PPC::R31) :
+                                  (IsPPC64 ? PPC::X1 : PPC::R1));
+    MachineLocation FPSrc(MachineLocation::VirtualFP);
+    Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+  }
+
+  // If there is a frame pointer, copy R1 into R31
+  if (HasFP) {
+    if (!IsPPC64) {
+      BuildMI(MBB, MBBI, TII.get(PPC::OR), PPC::R31).addReg(PPC::R1)
+        .addReg(PPC::R1);
+    } else {
+      BuildMI(MBB, MBBI, TII.get(PPC::OR8), PPC::X31).addReg(PPC::X1)
+        .addReg(PPC::X1);
+    }
+  }
+}
+
+void PPCRegisterInfo::emitEpilogue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  assert(MBBI->getOpcode() == PPC::BLR &&
+         "Can only insert epilog into returning blocks");
+
+  // Get alignment info so we know how to restore r1
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+  unsigned MaxAlign = MFI->getMaxAlignment();
+
+  // Get the number of bytes allocated from the FrameInfo.
+  unsigned FrameSize = MFI->getStackSize();
+
+  // Get processor type.
+  bool IsPPC64 = Subtarget.isPPC64();
+  // Get operating system
+  bool IsMachoABI = Subtarget.isMachoABI();
+  // Check if the link register (LR) has been used.
+  bool UsesLR = MFI->hasCalls() || usesLR(MF);
+  // Do we have a frame pointer for this function?
+  bool HasFP = hasFP(MF) && FrameSize;
+  
+  int LROffset = PPCFrameInfo::getReturnSaveOffset(IsPPC64, IsMachoABI);
+  int FPOffset = PPCFrameInfo::getFramePointerSaveOffset(IsPPC64, IsMachoABI);
+  
+  if (FrameSize) {
+    // The loaded (or persistent) stack pointer value is offset by the 'stwu'
+    // on entry to the function.  Add this offset back now.
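+    // When the frame size fits in a signed 16-bit immediate and r1 was not
+    // dynamically realigned, a simple addi restores r1; otherwise reload the
+    // back chain that the prologue's stwu/stwux (or stdu/stdux) stored at
+    // 0(r1).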
+    if (!Subtarget.isPPC64()) {
+      if (isInt16(FrameSize) && TargetAlign >= MaxAlign &&
+            !MFI->hasVarSizedObjects()) {
+          BuildMI(MBB, MBBI, TII.get(PPC::ADDI), PPC::R1)
+              .addReg(PPC::R1).addImm(FrameSize);
+      } else {
+        BuildMI(MBB, MBBI, TII.get(PPC::LWZ),PPC::R1).addImm(0).addReg(PPC::R1);
+      }
+    } else {
+      if (isInt16(FrameSize) && TargetAlign >= MaxAlign &&
+            !MFI->hasVarSizedObjects()) {
+        BuildMI(MBB, MBBI, TII.get(PPC::ADDI8), PPC::X1)
+           .addReg(PPC::X1).addImm(FrameSize);
+      } else {
+        BuildMI(MBB, MBBI, TII.get(PPC::LD), PPC::X1).addImm(0).addReg(PPC::X1);
+      }
+    }
+  }
+
+  if (IsPPC64) {
+    if (UsesLR)
+      BuildMI(MBB, MBBI, TII.get(PPC::LD), PPC::X0)
+        .addImm(LROffset/4).addReg(PPC::X1);
+        
+    if (HasFP)
+      BuildMI(MBB, MBBI, TII.get(PPC::LD), PPC::X31)
+        .addImm(FPOffset/4).addReg(PPC::X1);
+        
+    if (UsesLR)
+      BuildMI(MBB, MBBI, TII.get(PPC::MTLR8)).addReg(PPC::X0);
+  } else {
+    if (UsesLR)
+      BuildMI(MBB, MBBI, TII.get(PPC::LWZ), PPC::R0)
+          .addImm(LROffset).addReg(PPC::R1);
+        
+    if (HasFP)
+      BuildMI(MBB, MBBI, TII.get(PPC::LWZ), PPC::R31)
+          .addImm(FPOffset).addReg(PPC::R1);
+          
+    if (UsesLR)
+      BuildMI(MBB, MBBI, TII.get(PPC::MTLR)).addReg(PPC::R0);
+  }
+}
+
+unsigned PPCRegisterInfo::getRARegister() const {
+  return !Subtarget.isPPC64() ? PPC::LR : PPC::LR8;
+}
+
+unsigned PPCRegisterInfo::getFrameRegister(MachineFunction &MF) const {
+  if (!Subtarget.isPPC64())
+    return hasFP(MF) ? PPC::R31 : PPC::R1;
+  else
+    return hasFP(MF) ? PPC::X31 : PPC::X1;
+}
+
+void PPCRegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves)
+                                                                         const {
+  // Initial state of the frame pointer is R1.
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(PPC::R1, 0);
+  Moves.push_back(MachineMove(0, Dst, Src));
+}
+
+unsigned PPCRegisterInfo::getEHExceptionRegister() const {
+  return !Subtarget.isPPC64() ? PPC::R3 : PPC::X3;
+}
+
+unsigned PPCRegisterInfo::getEHHandlerRegister() const {
+  return !Subtarget.isPPC64() ? PPC::R4 : PPC::X4;
+}
+
+#include "PPCGenRegisterInfo.inc"
+
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
new file mode 100644
index 0000000..4112034
--- /dev/null
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -0,0 +1,107 @@
+//===- PPCRegisterInfo.h - PowerPC Register Information Impl -----*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC32_REGISTERINFO_H
+#define POWERPC32_REGISTERINFO_H
+
+#include "PPC.h"
+#include "PPCGenRegisterInfo.h.inc"
+#include <map>
+
+namespace llvm {
+class PPCSubtarget;
+class TargetInstrInfo;
+class Type;
+
+class PPCRegisterInfo : public PPCGenRegisterInfo {
+  std::map<unsigned, unsigned> ImmToIdxMap;
+  const PPCSubtarget &Subtarget;
+  const TargetInstrInfo &TII;
+public:
+  PPCRegisterInfo(const PPCSubtarget &SubTarget, const TargetInstrInfo &tii);
+  
+  /// getRegisterNumbering - Given the enum value for some register, e.g.
+  /// PPC::F14, return the number that it corresponds to (e.g. 14).
+  static unsigned getRegisterNumbering(unsigned RegEnum);
+
+  /// Code Generation virtual methods...
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, int FrameIndex,
+                           const TargetRegisterClass *RC) const;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC) const;
+
+  void copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                    unsigned DestReg, unsigned SrcReg,
+                    const TargetRegisterClass *RC) const;
+
+  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                     unsigned DestReg, const MachineInstr *Orig) const;
+
+  /// foldMemoryOperand - PowerPC (like most RISCs) can only fold spills into
+  /// copy instructions, turning them into load/store instructions.
+  virtual MachineInstr* foldMemoryOperand(MachineInstr* MI, unsigned OpNum,
+                                          int FrameIndex) const;
+  
+  const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+
+  const TargetRegisterClass* const* getCalleeSavedRegClasses(
+                                     const MachineFunction *MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  /// targetHandlesStackFrameRounding - Returns true if the target is
+  /// responsible for rounding up the stack frame (probably at emitPrologue
+  /// time).
+  bool targetHandlesStackFrameRounding() const { return true; }
+
+  bool hasFP(const MachineFunction &MF) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  /// usesLR - Returns true if the link register (LR) has been used in the
+  /// function.
+  bool usesLR(MachineFunction &MF) const;
+  
+  void lowerDynamicAlloc(MachineBasicBlock::iterator II) const;
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, RegScavenger *RS = NULL) const;
+
+  /// determineFrameLayout - Determine the size of the frame and maximum call
+  /// frame size.
+  void determineFrameLayout(MachineFunction &MF) const;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS = NULL) const;
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(MachineFunction &MF) const;
+  void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td
new file mode 100644
index 0000000..0b3b4ca
--- /dev/null
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -0,0 +1,333 @@
+//===- PPCRegisterInfo.td - The PowerPC Register File ------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PowerPC register file and register classes.
+//
+//===----------------------------------------------------------------------===//
+
+class PPCReg<string n> : Register<n> {
+  let Namespace = "PPC";
+}
+
+// We identify all our registers with a 5-bit ID, for consistency's sake.
+
+// GPR - One of the 32 32-bit general-purpose registers
+class GPR<bits<5> num, string n> : PPCReg<n> {
+  field bits<5> Num = num;
+}
+
+// GP8 - One of the 32 64-bit general-purpose registers
+class GP8<GPR SubReg> : PPCReg<SubReg.Name> {
+  field bits<5> Num = SubReg.Num;
+  let SubRegs = [SubReg];
+}
+
+// SPR - One of the 32-bit special-purpose registers
+class SPR<bits<10> num, string n> : PPCReg<n> {
+  field bits<10> Num = num;
+}
+
+// FPR - One of the 32 64-bit floating-point registers
+class FPR<bits<5> num, string n> : PPCReg<n> {
+  field bits<5> Num = num;
+}
+
+// VR - One of the 32 128-bit vector registers
+class VR<bits<5> num, string n> : PPCReg<n> {
+  field bits<5> Num = num;
+}
+
+// CR - One of the 8 4-bit condition registers
+class CR<bits<3> num, string n> : PPCReg<n> {
+  field bits<3> Num = num;
+}
+
+// CRBIT - One of the 32 1-bit condition register fields
+class CRBIT<bits<5> num, string n> : PPCReg<n> {
+  field bits<5> Num = num;
+}
+
+
+// General-purpose registers
+def R0  : GPR< 0,  "r0">, DwarfRegNum<0>;
+def R1  : GPR< 1,  "r1">, DwarfRegNum<1>;
+def R2  : GPR< 2,  "r2">, DwarfRegNum<2>;
+def R3  : GPR< 3,  "r3">, DwarfRegNum<3>;
+def R4  : GPR< 4,  "r4">, DwarfRegNum<4>;
+def R5  : GPR< 5,  "r5">, DwarfRegNum<5>;
+def R6  : GPR< 6,  "r6">, DwarfRegNum<6>;
+def R7  : GPR< 7,  "r7">, DwarfRegNum<7>;
+def R8  : GPR< 8,  "r8">, DwarfRegNum<8>;
+def R9  : GPR< 9,  "r9">, DwarfRegNum<9>;
+def R10 : GPR<10, "r10">, DwarfRegNum<10>;
+def R11 : GPR<11, "r11">, DwarfRegNum<11>;
+def R12 : GPR<12, "r12">, DwarfRegNum<12>;
+def R13 : GPR<13, "r13">, DwarfRegNum<13>;
+def R14 : GPR<14, "r14">, DwarfRegNum<14>;
+def R15 : GPR<15, "r15">, DwarfRegNum<15>;
+def R16 : GPR<16, "r16">, DwarfRegNum<16>;
+def R17 : GPR<17, "r17">, DwarfRegNum<17>;
+def R18 : GPR<18, "r18">, DwarfRegNum<18>;
+def R19 : GPR<19, "r19">, DwarfRegNum<19>;
+def R20 : GPR<20, "r20">, DwarfRegNum<20>;
+def R21 : GPR<21, "r21">, DwarfRegNum<21>;
+def R22 : GPR<22, "r22">, DwarfRegNum<22>;
+def R23 : GPR<23, "r23">, DwarfRegNum<23>;
+def R24 : GPR<24, "r24">, DwarfRegNum<24>;
+def R25 : GPR<25, "r25">, DwarfRegNum<25>;
+def R26 : GPR<26, "r26">, DwarfRegNum<26>;
+def R27 : GPR<27, "r27">, DwarfRegNum<27>;
+def R28 : GPR<28, "r28">, DwarfRegNum<28>;
+def R29 : GPR<29, "r29">, DwarfRegNum<29>;
+def R30 : GPR<30, "r30">, DwarfRegNum<30>;
+def R31 : GPR<31, "r31">, DwarfRegNum<31>;
+
+// 64-bit General-purpose registers
+def X0  : GP8< R0>, DwarfRegNum<0>;
+def X1  : GP8< R1>, DwarfRegNum<1>;
+def X2  : GP8< R2>, DwarfRegNum<2>;
+def X3  : GP8< R3>, DwarfRegNum<3>;
+def X4  : GP8< R4>, DwarfRegNum<4>;
+def X5  : GP8< R5>, DwarfRegNum<5>;
+def X6  : GP8< R6>, DwarfRegNum<6>;
+def X7  : GP8< R7>, DwarfRegNum<7>;
+def X8  : GP8< R8>, DwarfRegNum<8>;
+def X9  : GP8< R9>, DwarfRegNum<9>;
+def X10 : GP8<R10>, DwarfRegNum<10>;
+def X11 : GP8<R11>, DwarfRegNum<11>;
+def X12 : GP8<R12>, DwarfRegNum<12>;
+def X13 : GP8<R13>, DwarfRegNum<13>;
+def X14 : GP8<R14>, DwarfRegNum<14>;
+def X15 : GP8<R15>, DwarfRegNum<15>;
+def X16 : GP8<R16>, DwarfRegNum<16>;
+def X17 : GP8<R17>, DwarfRegNum<17>;
+def X18 : GP8<R18>, DwarfRegNum<18>;
+def X19 : GP8<R19>, DwarfRegNum<19>;
+def X20 : GP8<R20>, DwarfRegNum<20>;
+def X21 : GP8<R21>, DwarfRegNum<21>;
+def X22 : GP8<R22>, DwarfRegNum<22>;
+def X23 : GP8<R23>, DwarfRegNum<23>;
+def X24 : GP8<R24>, DwarfRegNum<24>;
+def X25 : GP8<R25>, DwarfRegNum<25>;
+def X26 : GP8<R26>, DwarfRegNum<26>;
+def X27 : GP8<R27>, DwarfRegNum<27>;
+def X28 : GP8<R28>, DwarfRegNum<28>;
+def X29 : GP8<R29>, DwarfRegNum<29>;
+def X30 : GP8<R30>, DwarfRegNum<30>;
+def X31 : GP8<R31>, DwarfRegNum<31>;
+
+// Floating-point registers
+def F0  : FPR< 0,  "f0">, DwarfRegNum<32>;
+def F1  : FPR< 1,  "f1">, DwarfRegNum<33>;
+def F2  : FPR< 2,  "f2">, DwarfRegNum<34>;
+def F3  : FPR< 3,  "f3">, DwarfRegNum<35>;
+def F4  : FPR< 4,  "f4">, DwarfRegNum<36>;
+def F5  : FPR< 5,  "f5">, DwarfRegNum<37>;
+def F6  : FPR< 6,  "f6">, DwarfRegNum<38>;
+def F7  : FPR< 7,  "f7">, DwarfRegNum<39>;
+def F8  : FPR< 8,  "f8">, DwarfRegNum<40>;
+def F9  : FPR< 9,  "f9">, DwarfRegNum<41>;
+def F10 : FPR<10, "f10">, DwarfRegNum<42>;
+def F11 : FPR<11, "f11">, DwarfRegNum<43>;
+def F12 : FPR<12, "f12">, DwarfRegNum<44>;
+def F13 : FPR<13, "f13">, DwarfRegNum<45>;
+def F14 : FPR<14, "f14">, DwarfRegNum<46>;
+def F15 : FPR<15, "f15">, DwarfRegNum<47>;
+def F16 : FPR<16, "f16">, DwarfRegNum<48>;
+def F17 : FPR<17, "f17">, DwarfRegNum<49>;
+def F18 : FPR<18, "f18">, DwarfRegNum<50>;
+def F19 : FPR<19, "f19">, DwarfRegNum<51>;
+def F20 : FPR<20, "f20">, DwarfRegNum<52>;
+def F21 : FPR<21, "f21">, DwarfRegNum<53>;
+def F22 : FPR<22, "f22">, DwarfRegNum<54>;
+def F23 : FPR<23, "f23">, DwarfRegNum<55>;
+def F24 : FPR<24, "f24">, DwarfRegNum<56>;
+def F25 : FPR<25, "f25">, DwarfRegNum<57>;
+def F26 : FPR<26, "f26">, DwarfRegNum<58>;
+def F27 : FPR<27, "f27">, DwarfRegNum<59>;
+def F28 : FPR<28, "f28">, DwarfRegNum<60>;
+def F29 : FPR<29, "f29">, DwarfRegNum<61>;
+def F30 : FPR<30, "f30">, DwarfRegNum<62>;
+def F31 : FPR<31, "f31">, DwarfRegNum<63>;
+
+// Vector registers
+def V0  : VR< 0,  "v0">, DwarfRegNum<77>;
+def V1  : VR< 1,  "v1">, DwarfRegNum<78>;
+def V2  : VR< 2,  "v2">, DwarfRegNum<79>;
+def V3  : VR< 3,  "v3">, DwarfRegNum<80>;
+def V4  : VR< 4,  "v4">, DwarfRegNum<81>;
+def V5  : VR< 5,  "v5">, DwarfRegNum<82>;
+def V6  : VR< 6,  "v6">, DwarfRegNum<83>;
+def V7  : VR< 7,  "v7">, DwarfRegNum<84>;
+def V8  : VR< 8,  "v8">, DwarfRegNum<85>;
+def V9  : VR< 9,  "v9">, DwarfRegNum<86>;
+def V10 : VR<10, "v10">, DwarfRegNum<87>;
+def V11 : VR<11, "v11">, DwarfRegNum<88>;
+def V12 : VR<12, "v12">, DwarfRegNum<89>;
+def V13 : VR<13, "v13">, DwarfRegNum<90>;
+def V14 : VR<14, "v14">, DwarfRegNum<91>;
+def V15 : VR<15, "v15">, DwarfRegNum<92>;
+def V16 : VR<16, "v16">, DwarfRegNum<93>;
+def V17 : VR<17, "v17">, DwarfRegNum<94>;
+def V18 : VR<18, "v18">, DwarfRegNum<95>;
+def V19 : VR<19, "v19">, DwarfRegNum<96>;
+def V20 : VR<20, "v20">, DwarfRegNum<97>;
+def V21 : VR<21, "v21">, DwarfRegNum<98>;
+def V22 : VR<22, "v22">, DwarfRegNum<99>;
+def V23 : VR<23, "v23">, DwarfRegNum<100>;
+def V24 : VR<24, "v24">, DwarfRegNum<101>;
+def V25 : VR<25, "v25">, DwarfRegNum<102>;
+def V26 : VR<26, "v26">, DwarfRegNum<103>;
+def V27 : VR<27, "v27">, DwarfRegNum<104>;
+def V28 : VR<28, "v28">, DwarfRegNum<105>;
+def V29 : VR<29, "v29">, DwarfRegNum<106>;
+def V30 : VR<30, "v30">, DwarfRegNum<107>;
+def V31 : VR<31, "v31">, DwarfRegNum<108>;
+
+// Condition registers
+def CR0 : CR<0, "cr0">, DwarfRegNum<68>;
+def CR1 : CR<1, "cr1">, DwarfRegNum<69>;
+def CR2 : CR<2, "cr2">, DwarfRegNum<70>;
+def CR3 : CR<3, "cr3">, DwarfRegNum<71>;
+def CR4 : CR<4, "cr4">, DwarfRegNum<72>;
+def CR5 : CR<5, "cr5">, DwarfRegNum<73>;
+def CR6 : CR<6, "cr6">, DwarfRegNum<74>;
+def CR7 : CR<7, "cr7">, DwarfRegNum<75>;
+
+// Condition register bits
+def CR0LT : CRBIT< 0, "0">, DwarfRegNum<0>;
+def CR0GT : CRBIT< 1, "1">, DwarfRegNum<0>;
+def CR0EQ : CRBIT< 2, "2">, DwarfRegNum<0>;
+def CR0UN : CRBIT< 3, "3">, DwarfRegNum<0>;
+def CR1LT : CRBIT< 4, "4">, DwarfRegNum<0>;
+def CR1GT : CRBIT< 5, "5">, DwarfRegNum<0>;
+def CR1EQ : CRBIT< 6, "6">, DwarfRegNum<0>;
+def CR1UN : CRBIT< 7, "7">, DwarfRegNum<0>;
+def CR2LT : CRBIT< 8, "8">, DwarfRegNum<0>;
+def CR2GT : CRBIT< 9, "9">, DwarfRegNum<0>;
+def CR2EQ : CRBIT<10, "10">, DwarfRegNum<0>;
+def CR2UN : CRBIT<11, "11">, DwarfRegNum<0>;
+def CR3LT : CRBIT<12, "12">, DwarfRegNum<0>;
+def CR3GT : CRBIT<13, "13">, DwarfRegNum<0>;
+def CR3EQ : CRBIT<14, "14">, DwarfRegNum<0>;
+def CR3UN : CRBIT<15, "15">, DwarfRegNum<0>;
+def CR4LT : CRBIT<16, "16">, DwarfRegNum<0>;
+def CR4GT : CRBIT<17, "17">, DwarfRegNum<0>;
+def CR4EQ : CRBIT<18, "18">, DwarfRegNum<0>;
+def CR4UN : CRBIT<19, "19">, DwarfRegNum<0>;
+def CR5LT : CRBIT<20, "20">, DwarfRegNum<0>;
+def CR5GT : CRBIT<21, "21">, DwarfRegNum<0>;
+def CR5EQ : CRBIT<22, "22">, DwarfRegNum<0>;
+def CR5UN : CRBIT<23, "23">, DwarfRegNum<0>;
+def CR6LT : CRBIT<24, "24">, DwarfRegNum<0>;
+def CR6GT : CRBIT<25, "25">, DwarfRegNum<0>;
+def CR6EQ : CRBIT<26, "26">, DwarfRegNum<0>;
+def CR6UN : CRBIT<27, "27">, DwarfRegNum<0>;
+def CR7LT : CRBIT<28, "28">, DwarfRegNum<0>;
+def CR7GT : CRBIT<29, "29">, DwarfRegNum<0>;
+def CR7EQ : CRBIT<30, "30">, DwarfRegNum<0>;
+def CR7UN : CRBIT<31, "31">, DwarfRegNum<0>;
+
+def : SubRegSet<1, [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7],
+                   [CR0LT, CR1LT, CR2LT, CR3LT, CR4LT, CR5LT, CR6LT, CR7LT]>;
+def : SubRegSet<2, [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7],
+                   [CR0GT, CR1GT, CR2GT, CR3GT, CR4GT, CR5GT, CR6GT, CR7GT]>;
+def : SubRegSet<3, [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7],
+                   [CR0EQ, CR1EQ, CR2EQ, CR3EQ, CR4EQ, CR5EQ, CR6EQ, CR7EQ]>;
+def : SubRegSet<4, [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7],
+                   [CR0UN, CR1UN, CR2UN, CR3UN, CR4UN, CR5UN, CR6UN, CR7UN]>;
+
+// Link register
+def LR  : SPR<8, "lr">, DwarfRegNum<65>;
+//let Aliases = [LR] in
+def LR8 : SPR<8, "lr">, DwarfRegNum<65>;
+
+// Count register
+def CTR  : SPR<9, "ctr">, DwarfRegNum<66>;
+def CTR8 : SPR<9, "ctr">, DwarfRegNum<66>;
+
+// VRsave register
+def VRSAVE : SPR<256, "VRsave">, DwarfRegNum<107>;
+
+/// Register classes
+// Allocate volatiles first
+// then nonvolatiles in reverse order since stmw/lmw save from rN to r31
+def GPRC : RegisterClass<"PPC", [i32], 32,
+     [R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12,
+      R30, R29, R28, R27, R26, R25, R24, R23, R22, R21, R20, R19, R18, R17,
+      R16, R15, R14, R13, R31, R0, R1, LR]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    GPRCClass::iterator
+    GPRCClass::allocation_order_begin(const MachineFunction &MF) const {
+      // In Linux, r2 is reserved for the OS.
+      if (!MF.getTarget().getSubtarget<PPCSubtarget>().isDarwin())
+        return begin()+1;
+
+      return begin();
+    }
+    GPRCClass::iterator
+    GPRCClass::allocation_order_end(const MachineFunction &MF) const {
+      // On PPC64, r13 is the thread pointer.  Never allocate this register.
+      // Note that this is overconservative, as it also prevents allocation of
+      // R31 when the FP is not needed.
+      if (MF.getTarget().getSubtarget<PPCSubtarget>().isPPC64())
+        return end()-5;  // don't allocate R13, R31, R0, R1, LR
+        
+      if (needsFP(MF))
+        return end()-4;  // don't allocate R31, R0, R1, LR
+      else
+        return end()-3;  // don't allocate R0, R1, LR
+    }
+  }];
+}
+def G8RC : RegisterClass<"PPC", [i64], 64,
+     [X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12,
+      X30, X29, X28, X27, X26, X25, X24, X23, X22, X21, X20, X19, X18, X17,
+      X16, X15, X14, X31, X13, X0, X1, LR8]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    G8RCClass::iterator
+    G8RCClass::allocation_order_begin(const MachineFunction &MF) const {
+      return begin();
+    }
+    G8RCClass::iterator
+    G8RCClass::allocation_order_end(const MachineFunction &MF) const {
+      if (needsFP(MF))
+        return end()-5;  // don't allocate X31, X13, X0, X1, LR8
+      else
+        return end()-4;  // don't allocate X13, X0, X1, LR8
+    }
+  }];
+}
+
+
+
+def F8RC : RegisterClass<"PPC", [f64], 64, [F0, F1, F2, F3, F4, F5, F6, F7,
+  F8, F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21,
+  F22, F23, F24, F25, F26, F27, F28, F29, F30, F31]>;
+def F4RC : RegisterClass<"PPC", [f32], 32, [F0, F1, F2, F3, F4, F5, F6, F7,
+  F8, F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21,
+  F22, F23, F24, F25, F26, F27, F28, F29, F30, F31]>;
+
+def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v4f32], 128,
+ [V2, V3, V4, V5, V0, V1, 
+  V6, V7, V8, V9, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V20, V21,
+  V22, V23, V24, V25, V26, V27, V28, V29, V30, V31]>;
+
+def CRRC : RegisterClass<"PPC", [i32], 32, [CR0, CR1, CR5, CR6, CR7, CR2, 
+  CR3, CR4]>;
+  
diff --git a/lib/Target/PowerPC/PPCRelocations.h b/lib/Target/PowerPC/PPCRelocations.h
new file mode 100644
index 0000000..261622f
--- /dev/null
+++ b/lib/Target/PowerPC/PPCRelocations.h
@@ -0,0 +1,56 @@
+//===- PPCRelocations.h - PPC32 Code Relocations ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC 32-bit target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPC32RELOCATIONS_H
+#define PPC32RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+// Hack to rid us of a PPC pre-processor symbol which is erroneously
+// defined in a PowerPC header file (bug in Linux/PPC)
+#ifdef PPC
+#undef PPC
+#endif
+
+namespace llvm {
+  namespace PPC {
+    enum RelocationType {
+      // reloc_vanilla - A standard relocation, where the address of the
+      // relocated object completely overwrites the address of the relocation.
+      reloc_vanilla,
+    
+      // reloc_pcrel_bx - PC relative relocation, for the b or bl instructions.
+      reloc_pcrel_bx,
+
+      // reloc_pcrel_bcx - PC relative relocation, for BLT,BLE,BEQ,BGE,BGT,BNE,
+      // and other bcx instructions.
+      reloc_pcrel_bcx,
+
+      // reloc_absolute_high - Absolute relocation, for the loadhi instruction
+      // (which is really addis).  Add the high 16-bits of the specified global
+      // address into the low 16-bits of the instruction.
+      reloc_absolute_high,
+
+      // reloc_absolute_low - Absolute relocation, for the la instruction (which
+      // is really an addi).  Add the low 16-bits of the specified global
+      // address into the low 16-bits of the instruction.
+      reloc_absolute_low,
+      
+      // reloc_absolute_low_ix - Absolute relocation for the 64-bit load/store
+      // instructions, which have two implicit zero bits.
+      reloc_absolute_low_ix
+    };
+  }
+}
+
+#endif
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
new file mode 100644
index 0000000..0e0fd82
--- /dev/null
+++ b/lib/Target/PowerPC/PPCSchedule.td
@@ -0,0 +1,508 @@
+//===- PPCSchedule.td - PowerPC Scheduling Definitions -----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Functional units across PowerPC chip sets
+//
+def BPU    : FuncUnit; // Branch unit
+def SLU    : FuncUnit; // Store/load unit
+def SRU    : FuncUnit; // Special register unit
+def IU1    : FuncUnit; // Integer unit 1 (simple)
+def IU2    : FuncUnit; // Integer unit 2 (complex)
+def IU3    : FuncUnit; // Integer unit 3 (7450 simple)
+def IU4    : FuncUnit; // Integer unit 4 (7450 simple)
+def FPU1   : FuncUnit; // Floating point unit 1
+def FPU2   : FuncUnit; // Floating point unit 2
+def VPU    : FuncUnit; // Vector permutation unit
+def VIU1   : FuncUnit; // Vector integer unit 1 (simple)
+def VIU2   : FuncUnit; // Vector integer unit 2 (complex)
+def VFPU   : FuncUnit; // Vector floating point unit
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for PowerPC
+//
+def IntGeneral   : InstrItinClass;
+def IntCompare   : InstrItinClass;
+def IntDivD      : InstrItinClass;
+def IntDivW      : InstrItinClass;
+def IntMFFS      : InstrItinClass;
+def IntMFVSCR    : InstrItinClass;
+def IntMTFSB0    : InstrItinClass;
+def IntMTSRD     : InstrItinClass;
+def IntMulHD     : InstrItinClass;
+def IntMulHW     : InstrItinClass;
+def IntMulHWU    : InstrItinClass;
+def IntMulLI     : InstrItinClass;
+def IntRFID      : InstrItinClass;
+def IntRotateD   : InstrItinClass;
+def IntRotate    : InstrItinClass;
+def IntShift     : InstrItinClass;
+def IntTrapD     : InstrItinClass;
+def IntTrapW     : InstrItinClass;
+def BrB          : InstrItinClass;
+def BrCR         : InstrItinClass;
+def BrMCR        : InstrItinClass;
+def BrMCRX       : InstrItinClass;
+def LdStDCBA     : InstrItinClass;
+def LdStDCBF     : InstrItinClass;
+def LdStDCBI     : InstrItinClass;
+def LdStGeneral  : InstrItinClass;
+def LdStDSS      : InstrItinClass;
+def LdStICBI     : InstrItinClass;
+def LdStUX       : InstrItinClass;
+def LdStLD       : InstrItinClass;
+def LdStLDARX    : InstrItinClass;
+def LdStLFD      : InstrItinClass;
+def LdStLFDU     : InstrItinClass;
+def LdStLHA      : InstrItinClass;
+def LdStLMW      : InstrItinClass;
+def LdStLVecX    : InstrItinClass;
+def LdStLWA      : InstrItinClass;
+def LdStLWARX    : InstrItinClass;
+def LdStSLBIA    : InstrItinClass;
+def LdStSLBIE    : InstrItinClass;
+def LdStSTD      : InstrItinClass;
+def LdStSTDCX    : InstrItinClass;
+def LdStSTVEBX   : InstrItinClass;
+def LdStSTWCX    : InstrItinClass;
+def LdStSync     : InstrItinClass;
+def SprISYNC     : InstrItinClass;
+def SprMFSR      : InstrItinClass;
+def SprMTMSR     : InstrItinClass;
+def SprMTSR      : InstrItinClass;
+def SprTLBSYNC   : InstrItinClass;
+def SprMFCR      : InstrItinClass;
+def SprMFMSR     : InstrItinClass;
+def SprMFSPR     : InstrItinClass;
+def SprMFTB      : InstrItinClass;
+def SprMTSPR     : InstrItinClass;
+def SprMTSRIN    : InstrItinClass;
+def SprRFI       : InstrItinClass;
+def SprSC        : InstrItinClass;
+def FPGeneral    : InstrItinClass;
+def FPCompare    : InstrItinClass;
+def FPDivD       : InstrItinClass;
+def FPDivS       : InstrItinClass;
+def FPFused      : InstrItinClass;
+def FPRes        : InstrItinClass;
+def FPSqrt       : InstrItinClass;
+def VecGeneral   : InstrItinClass;
+def VecFP        : InstrItinClass;
+def VecFPCompare : InstrItinClass;
+def VecComplex   : InstrItinClass;
+def VecPerm      : InstrItinClass;
+def VecFPRound   : InstrItinClass;
+def VecVSL       : InstrItinClass;
+def VecVSR       : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+include "PPCScheduleG3.td"
+include "PPCScheduleG4.td"
+include "PPCScheduleG4Plus.td"
+include "PPCScheduleG5.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction to itinerary class map - When adding new opcodes to the
+// supported set, refer to the following table to determine which itinerary
+// class the opcode belongs to.
+//
+//    opcode     itinerary class
+//    ======     ===============
+//    add        IntGeneral
+//    addc       IntGeneral
+//    adde       IntGeneral
+//    addi       IntGeneral
+//    addic      IntGeneral
+//    addic.     IntGeneral
+//    addis      IntGeneral
+//    addme      IntGeneral
+//    addze      IntGeneral
+//    and        IntGeneral
+//    andc       IntGeneral
+//    andi.      IntGeneral
+//    andis.     IntGeneral
+//    b          BrB
+//    bc         BrB
+//    bcctr      BrB
+//    bclr       BrB
+//    cmp        IntCompare
+//    cmpi       IntCompare
+//    cmpl       IntCompare
+//    cmpli      IntCompare
+//    cntlzd     IntRotateD
+//    cntlzw     IntGeneral
+//    crand      BrCR
+//    crandc     BrCR
+//    creqv      BrCR
+//    crnand     BrCR
+//    crnor      BrCR
+//    cror       BrCR
+//    crorc      BrCR
+//    crxor      BrCR
+//    dcba       LdStDCBA
+//    dcbf       LdStDCBF
+//    dcbi       LdStDCBI
+//    dcbst      LdStDCBF
+//    dcbt       LdStGeneral
+//    dcbtst     LdStGeneral
+//    dcbz       LdStDCBF
+//    divd       IntDivD
+//    divdu      IntDivD
+//    divw       IntDivW
+//    divwu      IntDivW
+//    dss        LdStDSS
+//    dst        LdStDSS
+//    dstst      LdStDSS
+//    eciwx      LdStGeneral
+//    ecowx      LdStGeneral
+//    eieio      LdStGeneral
+//    eqv        IntGeneral
+//    extsb      IntGeneral
+//    extsh      IntGeneral
+//    extsw      IntRotateD
+//    fabs       FPGeneral
+//    fadd       FPGeneral
+//    fadds      FPGeneral
+//    fcfid      FPGeneral
+//    fcmpo      FPCompare
+//    fcmpu      FPCompare
+//    fctid      FPGeneral
+//    fctidz     FPGeneral
+//    fctiw      FPGeneral
+//    fctiwz     FPGeneral
+//    fdiv       FPDivD
+//    fdivs      FPDivS
+//    fmadd      FPFused
+//    fmadds     FPGeneral
+//    fmr        FPGeneral
+//    fmsub      FPFused
+//    fmsubs     FPGeneral
+//    fmul       FPFused
+//    fmuls      FPGeneral
+//    fnabs      FPGeneral
+//    fneg       FPGeneral
+//    fnmadd     FPFused
+//    fnmadds    FPGeneral
+//    fnmsub     FPFused
+//    fnmsubs    FPGeneral
+//    fres       FPRes
+//    frsp       FPGeneral
+//    frsqrte    FPGeneral
+//    fsel       FPGeneral
+//    fsqrt      FPSqrt
+//    fsqrts     FPSqrt
+//    fsub       FPGeneral
+//    fsubs      FPGeneral
+//    icbi       LdStICBI
+//    isync      SprISYNC
+//    lbz        LdStGeneral
+//    lbzu       LdStGeneral
+//    lbzux      LdStUX
+//    lbzx       LdStGeneral
+//    ld         LdStLD
+//    ldarx      LdStLDARX
+//    ldu        LdStLD
+//    ldux       LdStLD
+//    ldx        LdStLD
+//    lfd        LdStLFD
+//    lfdu       LdStLFDU
+//    lfdux      LdStLFDU
+//    lfdx       LdStLFDU
+//    lfs        LdStLFDU
+//    lfsu       LdStLFDU
+//    lfsux      LdStLFDU
+//    lfsx       LdStLFDU
+//    lha        LdStLHA
+//    lhau       LdStLHA
+//    lhaux      LdStLHA
+//    lhax       LdStLHA
+//    lhbrx      LdStGeneral
+//    lhz        LdStGeneral
+//    lhzu       LdStGeneral
+//    lhzux      LdStUX
+//    lhzx       LdStGeneral
+//    lmw        LdStLMW
+//    lswi       LdStLMW
+//    lswx       LdStLMW
+//    lvebx      LdStLVecX
+//    lvehx      LdStLVecX
+//    lvewx      LdStLVecX
+//    lvsl       LdStLVecX
+//    lvsr       LdStLVecX
+//    lvx        LdStLVecX
+//    lvxl       LdStLVecX
+//    lwa        LdStLWA
+//    lwarx      LdStLWARX
+//    lwaux      LdStLHA
+//    lwax       LdStLHA
+//    lwbrx      LdStGeneral
+//    lwz        LdStGeneral
+//    lwzu       LdStGeneral
+//    lwzux      LdStUX
+//    lwzx       LdStGeneral
+//    mcrf       BrMCR
+//    mcrfs      FPGeneral
+//    mcrxr      BrMCRX
+//    mfcr       SprMFCR
+//    mffs       IntMFFS
+//    mfmsr      SprMFMSR
+//    mfspr      SprMFSPR
+//    mfsr       SprMFSR
+//    mfsrin     SprMFSR
+//    mftb       SprMFTB
+//    mfvscr     IntMFVSCR
+//    mtcrf      BrMCRX
+//    mtfsb0     IntMTFSB0
+//    mtfsb1     IntMTFSB0
+//    mtfsf      IntMTFSB0
+//    mtfsfi     IntMTFSB0
+//    mtmsr      SprMTMSR
+//    mtmsrd     LdStLD
+//    mtspr      SprMTSPR
+//    mtsr       SprMTSR
+//    mtsrd      IntMTSRD
+//    mtsrdin    IntMTSRD
+//    mtsrin     SprMTSRIN
+//    mtvscr     IntMFVSCR
+//    mulhd      IntMulHD
+//    mulhdu     IntMulHD
+//    mulhw      IntMulHW
+//    mulhwu     IntMulHWU
+//    mulld      IntMulHD
+//    mulli      IntMulLI
+//    mullw      IntMulHW
+//    nand       IntGeneral
+//    neg        IntGeneral
+//    nor        IntGeneral
+//    or         IntGeneral
+//    orc        IntGeneral
+//    ori        IntGeneral
+//    oris       IntGeneral
+//    rfi        SprRFI
+//    rfid       IntRFID
+//    rldcl      IntRotateD
+//    rldcr      IntRotateD
+//    rldic      IntRotateD
+//    rldicl     IntRotateD
+//    rldicr     IntRotateD
+//    rldimi     IntRotateD
+//    rlwimi     IntRotate
+//    rlwinm     IntGeneral
+//    rlwnm      IntGeneral
+//    sc         SprSC
+//    slbia      LdStSLBIA
+//    slbie      LdStSLBIE
+//    sld        IntRotateD
+//    slw        IntGeneral
+//    srad       IntRotateD
+//    sradi      IntRotateD
+//    sraw       IntShift
+//    srawi      IntShift
+//    srd        IntRotateD
+//    srw        IntGeneral
+//    stb        LdStGeneral
+//    stbu       LdStGeneral
+//    stbux      LdStGeneral
+//    stbx       LdStGeneral
+//    std        LdStSTD
+//    stdcx.     LdStSTDCX
+//    stdu       LdStSTD
+//    stdux      LdStSTD
+//    stdx       LdStSTD
+//    stfd       LdStUX
+//    stfdu      LdStUX
+//    stfdux     LdStUX
+//    stfdx      LdStUX
+//    stfiwx     LdStUX
+//    stfs       LdStUX
+//    stfsu      LdStUX
+//    stfsux     LdStUX
+//    stfsx      LdStUX
+//    sth        LdStGeneral
+//    sthbrx     LdStGeneral
+//    sthu       LdStGeneral
+//    sthux      LdStGeneral
+//    sthx       LdStGeneral
+//    stmw       LdStLMW
+//    stswi      LdStLMW
+//    stswx      LdStLMW
+//    stvebx     LdStSTVEBX
+//    stvehx     LdStSTVEBX
+//    stvewx     LdStSTVEBX
+//    stvx       LdStSTVEBX
+//    stvxl      LdStSTVEBX
+//    stw        LdStGeneral
+//    stwbrx     LdStGeneral
+//    stwcx.     LdStSTWCX
+//    stwu       LdStGeneral
+//    stwux      LdStGeneral
+//    stwx       LdStGeneral
+//    subf       IntGeneral
+//    subfc      IntGeneral
+//    subfe      IntGeneral
+//    subfic     IntGeneral
+//    subfme     IntGeneral
+//    subfze     IntGeneral
+//    sync       LdStSync
+//    td         IntTrapD
+//    tdi        IntTrapD
+//    tlbia      LdStSLBIA
+//    tlbie      LdStDCBF
+//    tlbsync    SprTLBSYNC
+//    tw         IntTrapW
+//    twi        IntTrapW
+//    vaddcuw    VecGeneral
+//    vaddfp     VecFP
+//    vaddsbs    VecGeneral
+//    vaddshs    VecGeneral
+//    vaddsws    VecGeneral
+//    vaddubm    VecGeneral
+//    vaddubs    VecGeneral
+//    vadduhm    VecGeneral
+//    vadduhs    VecGeneral
+//    vadduwm    VecGeneral
+//    vadduws    VecGeneral
+//    vand       VecGeneral
+//    vandc      VecGeneral
+//    vavgsb     VecGeneral
+//    vavgsh     VecGeneral
+//    vavgsw     VecGeneral
+//    vavgub     VecGeneral
+//    vavguh     VecGeneral
+//    vavguw     VecGeneral
+//    vcfsx      VecFP
+//    vcfux      VecFP
+//    vcmpbfp    VecFPCompare
+//    vcmpeqfp   VecFPCompare
+//    vcmpequb   VecGeneral
+//    vcmpequh   VecGeneral
+//    vcmpequw   VecGeneral
+//    vcmpgefp   VecFPCompare
+//    vcmpgtfp   VecFPCompare
+//    vcmpgtsb   VecGeneral
+//    vcmpgtsh   VecGeneral
+//    vcmpgtsw   VecGeneral
+//    vcmpgtub   VecGeneral
+//    vcmpgtuh   VecGeneral
+//    vcmpgtuw   VecGeneral
+//    vctsxs     VecFP
+//    vctuxs     VecFP
+//    vexptefp   VecFP
+//    vlogefp    VecFP
+//    vmaddfp    VecFP
+//    vmaxfp     VecFPCompare
+//    vmaxsb     VecGeneral
+//    vmaxsh     VecGeneral
+//    vmaxsw     VecGeneral
+//    vmaxub     VecGeneral
+//    vmaxuh     VecGeneral
+//    vmaxuw     VecGeneral
+//    vmhaddshs  VecComplex
+//    vmhraddshs VecComplex
+//    vminfp     VecFPCompare
+//    vminsb     VecGeneral
+//    vminsh     VecGeneral
+//    vminsw     VecGeneral
+//    vminub     VecGeneral
+//    vminuh     VecGeneral
+//    vminuw     VecGeneral
+//    vmladduhm  VecComplex
+//    vmrghb     VecPerm
+//    vmrghh     VecPerm
+//    vmrghw     VecPerm
+//    vmrglb     VecPerm
+//    vmrglh     VecPerm
+//    vmrglw     VecPerm
+//    vmsubfp    VecFP
+//    vmsummbm   VecComplex
+//    vmsumshm   VecComplex
+//    vmsumshs   VecComplex
+//    vmsumubm   VecComplex
+//    vmsumuhm   VecComplex
+//    vmsumuhs   VecComplex
+//    vmulesb    VecComplex
+//    vmulesh    VecComplex
+//    vmuleub    VecComplex
+//    vmuleuh    VecComplex
+//    vmulosb    VecComplex
+//    vmulosh    VecComplex
+//    vmuloub    VecComplex
+//    vmulouh    VecComplex
+//    vnor       VecGeneral
+//    vor        VecGeneral
+//    vperm      VecPerm
+//    vpkpx      VecPerm
+//    vpkshss    VecPerm
+//    vpkshus    VecPerm
+//    vpkswss    VecPerm
+//    vpkswus    VecPerm
+//    vpkuhum    VecPerm
+//    vpkuhus    VecPerm
+//    vpkuwum    VecPerm
+//    vpkuwus    VecPerm
+//    vrefp      VecFPRound
+//    vrfim      VecFPRound
+//    vrfin      VecFPRound
+//    vrfip      VecFPRound
+//    vrfiz      VecFPRound
+//    vrlb       VecGeneral
+//    vrlh       VecGeneral
+//    vrlw       VecGeneral
+//    vrsqrtefp  VecFP
+//    vsel       VecGeneral
+//    vsl        VecVSL
+//    vslb       VecGeneral
+//    vsldoi     VecPerm
+//    vslh       VecGeneral
+//    vslo       VecPerm
+//    vslw       VecGeneral
+//    vspltb     VecPerm
+//    vsplth     VecPerm
+//    vspltisb   VecPerm
+//    vspltish   VecPerm
+//    vspltisw   VecPerm
+//    vspltw     VecPerm
+//    vsr        VecVSR
+//    vsrab      VecGeneral
+//    vsrah      VecGeneral
+//    vsraw      VecGeneral
+//    vsrb       VecGeneral
+//    vsrh       VecGeneral
+//    vsro       VecPerm
+//    vsrw       VecGeneral
+//    vsubcuw    VecGeneral
+//    vsubfp     VecFP
+//    vsubsbs    VecGeneral
+//    vsubshs    VecGeneral
+//    vsubsws    VecGeneral
+//    vsububm    VecGeneral
+//    vsububs    VecGeneral
+//    vsubuhm    VecGeneral
+//    vsubuhs    VecGeneral
+//    vsubuwm    VecGeneral
+//    vsubuws    VecGeneral
+//    vsum2sws   VecComplex
+//    vsum4sbs   VecComplex
+//    vsum4shs   VecComplex
+//    vsum4ubs   VecComplex
+//    vsumsws    VecComplex
+//    vupkhpx    VecPerm
+//    vupkhsb    VecPerm
+//    vupkhsh    VecPerm
+//    vupklpx    VecPerm
+//    vupklsb    VecPerm
+//    vupklsh    VecPerm
+//    vxor       VecGeneral
+//    xor        IntGeneral
+//    xori       IntGeneral
+//    xoris      IntGeneral
+//
diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td
new file mode 100644
index 0000000..fbb9f6f
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleG3.td
@@ -0,0 +1,63 @@
+//===- PPCScheduleG3.td - PPC G3 Scheduling Definitions ----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G3 (750) processor.
+//
+//===----------------------------------------------------------------------===//
+
+
+def G3Itineraries : ProcessorItineraries<[
+  InstrItinData<IntGeneral  , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntCompare  , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntDivW     , [InstrStage<19, [IU1]>]>,
+  InstrItinData<IntMFFS     , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<3, [FPU1]>]>,
+  InstrItinData<IntMulHW    , [InstrStage<5, [IU1]>]>,
+  InstrItinData<IntMulHWU   , [InstrStage<6, [IU1]>]>,
+  InstrItinData<IntMulLI    , [InstrStage<3, [IU1]>]>,
+  InstrItinData<IntRotate   , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntShift    , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntTrapW    , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<BrB         , [InstrStage<1, [BPU]>]>,
+  InstrItinData<BrCR        , [InstrStage<1, [SRU]>]>,
+  InstrItinData<BrMCR       , [InstrStage<1, [SRU]>]>,
+  InstrItinData<BrMCRX      , [InstrStage<1, [SRU]>]>,
+  InstrItinData<LdStDCBA    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStDCBF    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStDCBI    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStGeneral , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStICBI    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStUX      , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLFD     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLFDU    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLHA     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLMW     , [InstrStage<34, [SLU]>]>,
+  InstrItinData<LdStLWARX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTWCX   , [InstrStage<8, [SLU]>]>,
+  InstrItinData<LdStSync    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprISYNC    , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprMFSR     , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMTMSR    , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMTSR     , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMFCR     , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMFMSR    , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMFSPR    , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMFTB     , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMTSPR    , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprMTSRIN   , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprRFI      , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprSC       , [InstrStage<2, [SRU]>]>,
+  InstrItinData<FPGeneral   , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPCompare   , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPDivD      , [InstrStage<31, [FPU1]>]>,
+  InstrItinData<FPDivS      , [InstrStage<17, [FPU1]>]>,
+  InstrItinData<FPFused     , [InstrStage<2, [FPU1]>]>,
+  InstrItinData<FPRes       , [InstrStage<10, [FPU1]>]>
+]>;
diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td
new file mode 100644
index 0000000..d0e4456
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleG4.td
@@ -0,0 +1,73 @@
+//===- PPCScheduleG4.td - PPC G4 Scheduling Definitions ----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G4 (7400) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G4Itineraries : ProcessorItineraries<[
+  InstrItinData<IntGeneral  , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntCompare  , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntDivW     , [InstrStage<19, [IU1]>]>,
+  InstrItinData<IntMFFS     , [InstrStage<3, [FPU1]>]>,
+  InstrItinData<IntMFVSCR   , [InstrStage<1, [VIU1]>]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<3, [FPU1]>]>,
+  InstrItinData<IntMulHW    , [InstrStage<5, [IU1]>]>,
+  InstrItinData<IntMulHWU   , [InstrStage<6, [IU1]>]>,
+  InstrItinData<IntMulLI    , [InstrStage<3, [IU1]>]>,
+  InstrItinData<IntRotate   , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntShift    , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntTrapW    , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<BrB         , [InstrStage<1, [BPU]>]>,
+  InstrItinData<BrCR        , [InstrStage<1, [SRU]>]>,
+  InstrItinData<BrMCR       , [InstrStage<1, [SRU]>]>,
+  InstrItinData<BrMCRX      , [InstrStage<1, [SRU]>]>,
+  InstrItinData<LdStDCBF    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStDCBI    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStGeneral , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStDSS     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStICBI    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStUX      , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLFD     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLFDU    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLHA     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLMW     , [InstrStage<34, [SLU]>]>,
+  InstrItinData<LdStLVecX   , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLWARX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTVEBX  , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTWCX   , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStSync    , [InstrStage<8, [SLU]>]>,
+  InstrItinData<SprISYNC    , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprMFSR     , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMTMSR    , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMTSR     , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<8, [SRU]>]>,
+  InstrItinData<SprMFCR     , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMFMSR    , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMFSPR    , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMFTB     , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMTSPR    , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprMTSRIN   , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprRFI      , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprSC       , [InstrStage<2, [SRU]>]>,
+  InstrItinData<FPGeneral   , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPCompare   , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPDivD      , [InstrStage<31, [FPU1]>]>,
+  InstrItinData<FPDivS      , [InstrStage<17, [FPU1]>]>,
+  InstrItinData<FPFused     , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPRes       , [InstrStage<10, [FPU1]>]>,
+  InstrItinData<VecGeneral  , [InstrStage<1, [VIU1]>]>,
+  InstrItinData<VecFP       , [InstrStage<4, [VFPU]>]>,
+  InstrItinData<VecFPCompare, [InstrStage<1, [VIU1]>]>,
+  InstrItinData<VecComplex  , [InstrStage<3, [VIU2]>]>,
+  InstrItinData<VecPerm     , [InstrStage<1, [VPU]>]>,
+  InstrItinData<VecFPRound  , [InstrStage<4, [VFPU]>]>,
+  InstrItinData<VecVSL      , [InstrStage<1, [VIU1]>]>,
+  InstrItinData<VecVSR      , [InstrStage<1, [VIU1]>]>
+]>;
diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td
new file mode 100644
index 0000000..b40a8a5
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td
@@ -0,0 +1,76 @@
+//===- PPCScheduleG4Plus.td - PPC G4+ Scheduling Defs. -----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G4+ (7450) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G4PlusItineraries : ProcessorItineraries<[
+  InstrItinData<IntGeneral  , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<IntCompare  , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<IntDivW     , [InstrStage<23, [IU2]>]>,
+  InstrItinData<IntMFFS     , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<IntMFVSCR   , [InstrStage<2, [VFPU]>]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<IntMulHW    , [InstrStage<4, [IU2]>]>,
+  InstrItinData<IntMulHWU   , [InstrStage<4, [IU2]>]>,
+  InstrItinData<IntMulLI    , [InstrStage<3, [IU2]>]>,
+  InstrItinData<IntRotate   , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<IntShift    , [InstrStage<2, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<IntTrapW    , [InstrStage<2, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<BrB         , [InstrStage<1, [BPU]>]>,
+  InstrItinData<BrCR        , [InstrStage<2, [IU2]>]>,
+  InstrItinData<BrMCR       , [InstrStage<2, [IU2]>]>,
+  InstrItinData<BrMCRX      , [InstrStage<2, [IU2]>]>,
+  InstrItinData<LdStDCBF    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStDCBI    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStGeneral , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStDSS     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStICBI    , [InstrStage<3, [IU2]>]>,
+  InstrItinData<LdStUX      , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLFD     , [InstrStage<4, [SLU]>]>,
+  InstrItinData<LdStLFDU    , [InstrStage<4, [SLU]>]>,
+  InstrItinData<LdStLHA     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLMW     , [InstrStage<37, [SLU]>]>,
+  InstrItinData<LdStLVecX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLWA     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLWARX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTD     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTDCX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTVEBX  , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTWCX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSync    , [InstrStage<35, [SLU]>]>,
+  InstrItinData<SprISYNC    , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<SprMFSR     , [InstrStage<4, [IU2]>]>,
+  InstrItinData<SprMTMSR    , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprMTSR     , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprMFCR     , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprMFMSR    , [InstrStage<3, [IU2]>]>,
+  InstrItinData<SprMFSPR    , [InstrStage<4, [IU2]>]>,
+  InstrItinData<SprMFTB     , [InstrStage<5, [IU2]>]>,
+  InstrItinData<SprMTSPR    , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprMTSRIN   , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprRFI      , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<SprSC       , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<FPGeneral   , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<FPCompare   , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<FPDivD      , [InstrStage<35, [FPU1]>]>,
+  InstrItinData<FPDivS      , [InstrStage<21, [FPU1]>]>,
+  InstrItinData<FPFused     , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<FPRes       , [InstrStage<14, [FPU1]>]>,
+  InstrItinData<VecGeneral  , [InstrStage<1, [VIU1]>]>,
+  InstrItinData<VecFP       , [InstrStage<4, [VFPU]>]>,
+  InstrItinData<VecFPCompare, [InstrStage<2, [VFPU]>]>,
+  InstrItinData<VecComplex  , [InstrStage<4, [VIU2]>]>,
+  InstrItinData<VecPerm     , [InstrStage<2, [VPU]>]>,
+  InstrItinData<VecFPRound  , [InstrStage<4, [VIU1]>]>,
+  InstrItinData<VecVSL      , [InstrStage<2, [VPU]>]>,
+  InstrItinData<VecVSR      , [InstrStage<2, [VPU]>]>
+]>;
diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td
new file mode 100644
index 0000000..ff4be2c
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleG5.td
@@ -0,0 +1,83 @@
+//===- PPCScheduleG5.td - PPC G5 Scheduling Definitions ----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G5 (970) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G5Itineraries : ProcessorItineraries<[
+  InstrItinData<IntGeneral  , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<IntCompare  , [InstrStage<3, [IU1, IU2]>]>,
+  InstrItinData<IntDivD     , [InstrStage<68, [IU1]>]>,
+  InstrItinData<IntDivW     , [InstrStage<36, [IU1]>]>,
+  InstrItinData<IntMFFS     , [InstrStage<6, [IU2]>]>,
+  InstrItinData<IntMFVSCR   , [InstrStage<1, [VFPU]>]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<6, [FPU1, FPU2]>]>,
+  InstrItinData<IntMulHD    , [InstrStage<7, [IU1, IU2]>]>,
+  InstrItinData<IntMulHW    , [InstrStage<5, [IU1, IU2]>]>,
+  InstrItinData<IntMulHWU   , [InstrStage<5, [IU1, IU2]>]>,
+  InstrItinData<IntMulLI    , [InstrStage<4, [IU1, IU2]>]>,
+  InstrItinData<IntRFID     , [InstrStage<1, [IU2]>]>,
+  InstrItinData<IntRotateD  , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<IntRotate   , [InstrStage<4, [IU1, IU2]>]>,
+  InstrItinData<IntShift    , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<IntTrapD    , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntTrapW    , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<BrB         , [InstrStage<1, [BPU]>]>,
+  InstrItinData<BrCR        , [InstrStage<4, [BPU]>]>,
+  InstrItinData<BrMCR       , [InstrStage<2, [BPU]>]>,
+  InstrItinData<BrMCRX      , [InstrStage<3, [BPU]>]>,
+  InstrItinData<LdStDCBF    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStGeneral , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStDSS     , [InstrStage<10, [SLU]>]>,
+  InstrItinData<LdStICBI    , [InstrStage<40, [SLU]>]>,
+  InstrItinData<LdStUX      , [InstrStage<4, [SLU]>]>,
+  InstrItinData<LdStLD      , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLDARX   , [InstrStage<11, [SLU]>]>,
+  InstrItinData<LdStLFD     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLFDU    , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStLHA     , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStLMW     , [InstrStage<64, [SLU]>]>,
+  InstrItinData<LdStLVecX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLWA     , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStLWARX   , [InstrStage<11, [SLU]>]>,
+  InstrItinData<LdStSLBIA   , [InstrStage<40, [SLU]>]>, // needs work
+  InstrItinData<LdStSLBIE   , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTD     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTDCX   , [InstrStage<11, [SLU]>]>,
+  InstrItinData<LdStSTVEBX  , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStSTWCX   , [InstrStage<11, [SLU]>]>,
+  InstrItinData<LdStSync    , [InstrStage<35, [SLU]>]>,
+  InstrItinData<SprISYNC    , [InstrStage<40, [SLU]>]>, // needs work
+  InstrItinData<SprMFSR     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprMTMSR    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprMTSR     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprMFCR     , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprMFMSR    , [InstrStage<3, [IU2]>]>,
+  InstrItinData<SprMFSPR    , [InstrStage<3, [IU2]>]>,
+  InstrItinData<SprMFTB     , [InstrStage<10, [IU2]>]>,
+  InstrItinData<SprMTSPR    , [InstrStage<8, [IU2]>]>,
+  InstrItinData<SprSC       , [InstrStage<1, [IU2]>]>,
+  InstrItinData<FPGeneral   , [InstrStage<6, [FPU1, FPU2]>]>,
+  InstrItinData<FPCompare   , [InstrStage<8, [FPU1, FPU2]>]>,
+  InstrItinData<FPDivD      , [InstrStage<33, [FPU1, FPU2]>]>,
+  InstrItinData<FPDivS      , [InstrStage<33, [FPU1, FPU2]>]>,
+  InstrItinData<FPFused     , [InstrStage<6, [FPU1, FPU2]>]>,
+  InstrItinData<FPRes       , [InstrStage<6, [FPU1, FPU2]>]>,
+  InstrItinData<FPSqrt      , [InstrStage<40, [FPU1, FPU2]>]>,
+  InstrItinData<VecGeneral  , [InstrStage<2, [VIU1]>]>,
+  InstrItinData<VecFP       , [InstrStage<8, [VFPU]>]>,
+  InstrItinData<VecFPCompare, [InstrStage<2, [VFPU]>]>,
+  InstrItinData<VecComplex  , [InstrStage<5, [VIU2]>]>,
+  InstrItinData<VecPerm     , [InstrStage<3, [VPU]>]>,
+  InstrItinData<VecFPRound  , [InstrStage<8, [VFPU]>]>,
+  InstrItinData<VecVSL      , [InstrStage<2, [VIU1]>]>,
+  InstrItinData<VecVSR      , [InstrStage<3, [VPU]>]>
+]>;
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
new file mode 100644
index 0000000..4419d20
--- /dev/null
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -0,0 +1,141 @@
+//===-- PPCSubtarget.cpp - PPC Subtarget Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Nate Begeman and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPC specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCSubtarget.h"
+#include "PPC.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetMachine.h"
+#include "PPCGenSubtarget.inc"
+using namespace llvm;
+
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#include <mach/mach_host.h>
+#include <mach/host_info.h>
+#include <mach/machine.h>
+
+/// GetCurrentPowerPCCPU - Returns the current host CPU name.
+static const char *GetCurrentPowerPCCPU() {
+  host_basic_info_data_t hostInfo;
+  mach_msg_type_number_t infoCount;
+
+  infoCount = HOST_BASIC_INFO_COUNT;
+  host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, 
+            &infoCount);
+            
+  if (hostInfo.cpu_type != CPU_TYPE_POWERPC) return "generic";
+
+  switch(hostInfo.cpu_subtype) {
+  case CPU_SUBTYPE_POWERPC_601:   return "601";
+  case CPU_SUBTYPE_POWERPC_602:   return "602";
+  case CPU_SUBTYPE_POWERPC_603:   return "603";
+  case CPU_SUBTYPE_POWERPC_603e:  return "603e";
+  case CPU_SUBTYPE_POWERPC_603ev: return "603ev";
+  case CPU_SUBTYPE_POWERPC_604:   return "604";
+  case CPU_SUBTYPE_POWERPC_604e:  return "604e";
+  case CPU_SUBTYPE_POWERPC_620:   return "620";
+  case CPU_SUBTYPE_POWERPC_750:   return "750";
+  case CPU_SUBTYPE_POWERPC_7400:  return "7400";
+  case CPU_SUBTYPE_POWERPC_7450:  return "7450";
+  case CPU_SUBTYPE_POWERPC_970:   return "970";
+  default: ;
+  }
+  
+  return "generic";
+}
+#endif
+
+
+PPCSubtarget::PPCSubtarget(const TargetMachine &tm, const Module &M,
+                           const std::string &FS, bool is64Bit)
+  : TM(tm)
+  , StackAlignment(16)
+  , IsGigaProcessor(false)
+  , Has64BitSupport(false)
+  , Use64BitRegs(false)
+  , IsPPC64(is64Bit)
+  , HasAltivec(false)
+  , HasFSQRT(false)
+  , HasSTFIWX(false)
+  , IsDarwin(false)
+  , HasLazyResolverStubs(false) {
+
+  // Determine default and user specified characteristics
+  std::string CPU = "generic";
+#if defined(__APPLE__)
+  CPU = GetCurrentPowerPCCPU();
+#endif
+
+  // Parse features string.
+  ParseSubtargetFeatures(FS, CPU);
+
+  // If we are generating code for ppc64, verify that options make sense.
+  if (is64Bit) {
+    if (!has64BitSupport()) {
+      cerr << "PPC: Generation of 64-bit code for a 32-bit processor "
+           << "requested.  Ignoring 32-bit processor feature.\n";
+      Has64BitSupport = true;
+    }
+    // Silently force 64-bit register use on ppc64.
+    Use64BitRegs = true;
+  }
+  
+  // If the user requested use of 64-bit regs, but the cpu selected doesn't
+  // support it, warn and ignore.
+  if (use64BitRegs() && !has64BitSupport()) {
+    cerr << "PPC: 64-bit registers requested on CPU without support.  "
+         << "Disabling 64-bit register use.\n";
+    Use64BitRegs = false;
+  }
+  
+  // Set the boolean corresponding to the current target triple, or the default
+  // if one cannot be determined, to true.
+  const std::string& TT = M.getTargetTriple();
+  if (TT.length() > 5) {
+    IsDarwin = TT.find("-darwin") != std::string::npos;
+  } else if (TT.empty()) {
+#if defined(__APPLE__)
+    IsDarwin = true;
+#endif
+  }
+
+  // Set up darwin-specific properties.
+  if (IsDarwin) {
+    HasLazyResolverStubs = true;
+    AsmFlavor = NewMnemonic;
+  } else {
+    AsmFlavor = OldMnemonic;
+  }
+}
+
+/// SetJITMode - This is called to inform the subtarget info that we are
+/// producing code for the JIT.
+void PPCSubtarget::SetJITMode() {
+  // JIT mode doesn't want lazy resolver stubs, it knows exactly where
+  // everything is.  This matters for PPC64, which codegens in PIC mode without
+  // stubs.
+  HasLazyResolverStubs = false;
+}
+
+
+/// hasLazyResolverStub - Return true if accesses to the specified global have
+/// to go through a dyld lazy resolution stub.  This means that an extra load
+/// is required to get the address of the global.
+bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const {
+  // We never have stubs if HasLazyResolverStubs=false or if in static mode.
+  if (!HasLazyResolverStubs || TM.getRelocationModel() == Reloc::Static)
+    return false;
+  
+  return GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+         (GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode());
+}
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
new file mode 100644
index 0000000..d1e135c
--- /dev/null
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -0,0 +1,146 @@
+//=====-- PPCSubtarget.h - Define Subtarget for the PPC -------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Nate Begeman and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPCSUBTARGET_H
+#define POWERPCSUBTARGET_H
+
+#include "llvm/Target/TargetInstrItineraries.h"
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+namespace llvm {
+
+namespace PPC {
+  // -m directive values.
+  enum {
+    DIR_32,
+    DIR_601, 
+    DIR_602, 
+    DIR_603, 
+    DIR_7400,
+    DIR_750, 
+    DIR_970, 
+    DIR_64  
+  };
+}
+
+class Module;
+class GlobalValue;
+class TargetMachine;
+  
+class PPCSubtarget : public TargetSubtarget {
+public:
+  enum AsmWriterFlavorTy {
+    OldMnemonic, NewMnemonic, Unset
+  };
+protected:
+  const TargetMachine &TM;
+  
+  /// StackAlignment - The minimum alignment of the stack frame known to hold on
+  /// entry to the function, and which every function must maintain.
+  unsigned StackAlignment;
+  
+  /// Selected instruction itineraries (one entry per itinerary class.)
+  InstrItineraryData InstrItins;
+  
+  /// Which cpu directive was used.
+  unsigned DarwinDirective;
+
+  /// AsmFlavor - Which PPC asm dialect to use.
+  AsmWriterFlavorTy AsmFlavor;
+
+  /// Used by the ISel to turn on optimizations for POWER4-derived architectures.
+  bool IsGigaProcessor;
+  bool Has64BitSupport;
+  bool Use64BitRegs;
+  bool IsPPC64;
+  bool HasAltivec;
+  bool HasFSQRT;
+  bool HasSTFIWX;
+  bool IsDarwin;
+  bool HasLazyResolverStubs;
+public:
+  /// This constructor initializes the data members to match those
+  /// of the specified module.
+  ///
+  PPCSubtarget(const TargetMachine &TM, const Module &M,
+               const std::string &FS, bool is64Bit);
+  
+  /// ParseSubtargetFeatures - Parses features string setting specified 
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(const std::string &FS, const std::string &CPU);
+  
+  /// SetJITMode - This is called to inform the subtarget info that we are
+  /// producing code for the JIT.
+  void SetJITMode();
+
+  /// getStackAlignment - Returns the minimum stack-frame alignment known to hold
+  /// on entry to the function, which every function for this subtarget must
+  /// maintain.
+  unsigned getStackAlignment() const { return StackAlignment; }
+  
+  /// getDarwinDirective - Returns the -m directive specified for the cpu.
+  ///
+  unsigned getDarwinDirective() const { return DarwinDirective; }
+  
+  /// getInstrItineraryData - Return the instruction itineraries based on the
+  /// subtarget selection.
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+
+  /// getTargetDataString - Return the pointer size and type alignment
+  /// properties of this subtarget.
+  const char *getTargetDataString() const {
+    return isPPC64() ? "E-p:64:64-f64:32:64-i64:32:64"
+                     : "E-p:32:32-f64:32:64-i64:32:64";
+  }
+
+  /// isPPC64 - Return true if we are generating code for 64-bit pointer mode.
+  ///
+  bool isPPC64() const { return IsPPC64; }
+  
+  /// has64BitSupport - Return true if the selected CPU supports 64-bit
+  /// instructions, regardless of whether we are in 32-bit or 64-bit mode.
+  bool has64BitSupport() const { return Has64BitSupport; }
+  
+  /// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit
+  /// registers in 32-bit mode when possible.  This can only be true if
+  /// has64BitSupport() returns true.
+  bool use64BitRegs() const { return Use64BitRegs; }
+  
+  /// hasLazyResolverStub - Return true if accesses to the specified global have
+  /// to go through a dyld lazy resolution stub.  This means that an extra load
+  /// is required to get the address of the global.
+  bool hasLazyResolverStub(const GlobalValue *GV) const;
+  
+  // Specific obvious features.
+  bool hasFSQRT() const { return HasFSQRT; }
+  bool hasSTFIWX() const { return HasSTFIWX; }
+  bool hasAltivec() const { return HasAltivec; }
+  bool isGigaProcessor() const { return IsGigaProcessor; }
+
+  bool isDarwin() const { return IsDarwin; }
+
+  bool isMachoABI() const { return IsDarwin || IsPPC64; }
+  bool isELF32_ABI() const { return !IsDarwin && !IsPPC64; }
+
+  unsigned getAsmFlavor() const {
+    return AsmFlavor != Unset ? unsigned(AsmFlavor) : 0;
+  }
+};
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/PowerPC/PPCTargetAsmInfo.cpp b/lib/Target/PowerPC/PPCTargetAsmInfo.cpp
new file mode 100644
index 0000000..01c78b7
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetAsmInfo.cpp
@@ -0,0 +1,96 @@
+//===-- PPCTargetAsmInfo.cpp - PPC asm properties ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the PPCTargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetAsmInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Function.h"
+using namespace llvm;
+
+PPCTargetAsmInfo::PPCTargetAsmInfo(const PPCTargetMachine &TM) {
+  bool isPPC64 = TM.getSubtargetImpl()->isPPC64();
+  
+  ZeroDirective = "\t.space\t";
+  SetDirective = "\t.set";
+  Data64bitsDirective = isPPC64 ? "\t.quad\t" : 0;  
+  AlignmentIsInBytes = false;
+  LCOMMDirective = "\t.lcomm\t";
+  InlineAsmStart = "# InlineAsm Start";
+  InlineAsmEnd = "# InlineAsm End";
+  AssemblerDialect = TM.getSubtargetImpl()->getAsmFlavor();
+  
+  NeedsSet = true;
+  AddressSize = isPPC64 ? 8 : 4;
+  DwarfAbbrevSection = ".section __DWARF,__debug_abbrev,regular,debug";
+  DwarfInfoSection = ".section __DWARF,__debug_info,regular,debug";
+  DwarfLineSection = ".section __DWARF,__debug_line,regular,debug";
+  DwarfFrameSection = ".section __DWARF,__debug_frame,regular,debug";
+  DwarfPubNamesSection = ".section __DWARF,__debug_pubnames,regular,debug";
+  DwarfPubTypesSection = ".section __DWARF,__debug_pubtypes,regular,debug";
+  DwarfStrSection = ".section __DWARF,__debug_str,regular,debug";
+  DwarfLocSection = ".section __DWARF,__debug_loc,regular,debug";
+  DwarfARangesSection = ".section __DWARF,__debug_aranges,regular,debug";
+  DwarfRangesSection = ".section __DWARF,__debug_ranges,regular,debug";
+  DwarfMacInfoSection = ".section __DWARF,__debug_macinfo,regular,debug";
+  DwarfEHFrameSection =
+  ".section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support";
+  DwarfExceptionSection = ".section __DATA,__gcc_except_tab";
+}
+
+DarwinTargetAsmInfo::DarwinTargetAsmInfo(const PPCTargetMachine &TM)
+: PPCTargetAsmInfo(TM)
+{
+  PCSymbol = ".";
+  CommentString = ";";
+  GlobalPrefix = "_";
+  PrivateGlobalPrefix = "L";
+  ConstantPoolSection = "\t.const\t";
+  JumpTableDataSection = ".const";
+  GlobalDirective = "\t.globl\t";
+  CStringSection = "\t.cstring";
+  FourByteConstantSection = "\t.literal4\n";
+  EightByteConstantSection = "\t.literal8\n";
+  ReadOnlySection = "\t.const\n";
+  if (TM.getRelocationModel() == Reloc::Static) {
+    StaticCtorsSection = ".constructor";
+    StaticDtorsSection = ".destructor";
+  } else {
+    StaticCtorsSection = ".mod_init_func";
+    StaticDtorsSection = ".mod_term_func";
+  }
+  UsedDirective = "\t.no_dead_strip\t";
+  WeakRefDirective = "\t.weak_reference\t";
+  HiddenDirective = "\t.private_extern\t";
+  SupportsExceptionHandling = true;
+  
+  // In non-PIC modes, emit a special label before jump tables so that the
+  // linker can perform more accurate dead code stripping.
+  if (TM.getRelocationModel() != Reloc::PIC_) {
+    // Emit a local label that is preserved until the linker runs.
+    JumpTableSpecialLabelPrefix = "l";
+  }
+}
+
+LinuxTargetAsmInfo::LinuxTargetAsmInfo(const PPCTargetMachine &TM)
+: PPCTargetAsmInfo(TM)
+{
+  CommentString = "#";
+  GlobalPrefix = "";
+  PrivateGlobalPrefix = "";
+  ConstantPoolSection = "\t.section .rodata.cst4\t";
+  JumpTableDataSection = ".section .rodata.cst4";
+  CStringSection = "\t.section\t.rodata";
+  StaticCtorsSection = ".section\t.ctors,\"aw\",@progbits";
+  StaticDtorsSection = ".section\t.dtors,\"aw\",@progbits";
+  UsedDirective = "\t# .no_dead_strip\t";
+  WeakRefDirective = "\t.weak\t";
+}
diff --git a/lib/Target/PowerPC/PPCTargetAsmInfo.h b/lib/Target/PowerPC/PPCTargetAsmInfo.h
new file mode 100644
index 0000000..6a680e2
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetAsmInfo.h
@@ -0,0 +1,38 @@
+//=====-- PPCTargetAsmInfo.h - PPC asm properties -------------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the PPCTargetAsmInfo classes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCTARGETASMINFO_H
+#define PPCTARGETASMINFO_H
+
+#include "llvm/Target/TargetAsmInfo.h"
+
+namespace llvm {
+
+  // Forward declaration.
+  class PPCTargetMachine;
+  
+  struct PPCTargetAsmInfo : public TargetAsmInfo {
+    PPCTargetAsmInfo(const PPCTargetMachine &TM);
+  };
+
+  struct DarwinTargetAsmInfo : public PPCTargetAsmInfo {
+    DarwinTargetAsmInfo(const PPCTargetMachine &TM);
+  };
+
+  struct LinuxTargetAsmInfo : public PPCTargetAsmInfo {
+    LinuxTargetAsmInfo(const PPCTargetMachine &TM);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
new file mode 100644
index 0000000..57c8437
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -0,0 +1,166 @@
+//===-- PPCTargetMachine.cpp - Define TargetMachine for PowerPC -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCTargetAsmInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+using namespace llvm;
+
+namespace {
+  // Register the targets
+  RegisterTarget<PPC32TargetMachine>
+  X("ppc32", "  PowerPC 32");
+  RegisterTarget<PPC64TargetMachine>
+  Y("ppc64", "  PowerPC 64");
+}
+
+const TargetAsmInfo *PPCTargetMachine::createTargetAsmInfo() const {
+  if (Subtarget.isDarwin())
+    return new DarwinTargetAsmInfo(*this);
+  else
+    return new LinuxTargetAsmInfo(*this);
+}
+
+unsigned PPC32TargetMachine::getJITMatchQuality() {
+#if defined(__POWERPC__) || defined (__ppc__) || defined(_POWER) || defined(__PPC__)
+  if (sizeof(void*) == 4)
+    return 10;
+#endif
+  return 0;
+}
+unsigned PPC64TargetMachine::getJITMatchQuality() {
+#if defined(__POWERPC__) || defined (__ppc__) || defined(_POWER) || defined(__PPC__)
+  if (sizeof(void*) == 8)
+    return 10;
+#endif
+  return 0;
+}
+
+unsigned PPC32TargetMachine::getModuleMatchQuality(const Module &M) {
+  // We strongly match "powerpc-*".
+  std::string TT = M.getTargetTriple();
+  if (TT.size() >= 8 && std::string(TT.begin(), TT.begin()+8) == "powerpc-")
+    return 20;
+  
+  // If the target triple is something non-powerpc, we don't match.
+  if (!TT.empty()) return 0;
+  
+  if (M.getEndianness()  == Module::BigEndian &&
+      M.getPointerSize() == Module::Pointer32)
+    return 10;                                   // Weak match
+  else if (M.getEndianness() != Module::AnyEndianness ||
+           M.getPointerSize() != Module::AnyPointerSize)
+    return 0;                                    // Match for some other target
+  
+  return getJITMatchQuality()/2;
+}
+
+unsigned PPC64TargetMachine::getModuleMatchQuality(const Module &M) {
+  // We strongly match "powerpc64-*".
+  std::string TT = M.getTargetTriple();
+  if (TT.size() >= 10 && std::string(TT.begin(), TT.begin()+10) == "powerpc64-")
+    return 20;
+  
+  if (M.getEndianness()  == Module::BigEndian &&
+      M.getPointerSize() == Module::Pointer64)
+    return 10;                                   // Weak match
+  else if (M.getEndianness() != Module::AnyEndianness ||
+           M.getPointerSize() != Module::AnyPointerSize)
+    return 0;                                    // Match for some other target
+  
+  return getJITMatchQuality()/2;
+}
+
+
+PPCTargetMachine::PPCTargetMachine(const Module &M, const std::string &FS,
+                                   bool is64Bit)
+  : Subtarget(*this, M, FS, is64Bit),
+    DataLayout(Subtarget.getTargetDataString()), InstrInfo(*this),
+    FrameInfo(*this, is64Bit), JITInfo(*this, is64Bit), TLInfo(*this),
+    InstrItins(Subtarget.getInstrItineraryData()), MachOWriterInfo(*this) {
+
+  if (getRelocationModel() == Reloc::Default)
+    if (Subtarget.isDarwin())
+      setRelocationModel(Reloc::DynamicNoPIC);
+    else
+      setRelocationModel(Reloc::Static);
+}
+
+/// Override this for PowerPC.  Tail merging happily breaks up instruction issue
+/// groups, which typically degrades performance.
+const bool PPCTargetMachine::getEnableTailMergeDefault() const { return false; }
+
+PPC32TargetMachine::PPC32TargetMachine(const Module &M, const std::string &FS) 
+  : PPCTargetMachine(M, FS, false) {
+}
+
+
+PPC64TargetMachine::PPC64TargetMachine(const Module &M, const std::string &FS)
+  : PPCTargetMachine(M, FS, true) {
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool PPCTargetMachine::addInstSelector(FunctionPassManager &PM, bool Fast) {
+  // Install an instruction selector.
+  PM.add(createPPCISelDag(*this));
+  return false;
+}
+
+bool PPCTargetMachine::addPreEmitPass(FunctionPassManager &PM, bool Fast) {
+  
+  // Must run branch selection immediately preceding the asm printer.
+  PM.add(createPPCBranchSelectionPass());
+  return false;
+}
+
+bool PPCTargetMachine::addAssemblyEmitter(FunctionPassManager &PM, bool Fast, 
+                                          std::ostream &Out) {
+  PM.add(createPPCAsmPrinterPass(Out, *this));
+  return false;
+}
+
+bool PPCTargetMachine::addCodeEmitter(FunctionPassManager &PM, bool Fast,
+                                      MachineCodeEmitter &MCE) {
+  // The JIT should use the static relocation model in ppc32 mode, PIC in ppc64.
+  // FIXME: This should be moved to TargetJITInfo!!
+  if (Subtarget.isPPC64()) {
+    // We use PIC codegen in ppc64 mode, because otherwise we'd have to use many
+    // instructions to materialize arbitrary global variable + function +
+    // constant pool addresses.
+    setRelocationModel(Reloc::PIC_);
+  } else {
+    setRelocationModel(Reloc::Static);
+  }
+  
+  // Inform the subtarget that we are in JIT mode.  FIXME: does this break macho
+  // writing?
+  Subtarget.SetJITMode();
+  
+  // Machine code emitter pass for PowerPC.
+  PM.add(createPPCCodeEmitterPass(*this, MCE));
+  return false;
+}
+
+bool PPCTargetMachine::addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast,
+                                            MachineCodeEmitter &MCE) {
+  // Machine code emitter pass for PowerPC.
+  PM.add(createPPCCodeEmitterPass(*this, MCE));
+  return false;
+}
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
new file mode 100644
index 0000000..10c5b7b
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -0,0 +1,101 @@
+//===-- PPCTargetMachine.h - Define TargetMachine for PowerPC -----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPC_TARGETMACHINE_H
+#define PPC_TARGETMACHINE_H
+
+#include "PPCFrameInfo.h"
+#include "PPCSubtarget.h"
+#include "PPCJITInfo.h"
+#include "PPCInstrInfo.h"
+#include "PPCISelLowering.h"
+#include "PPCMachOWriterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+
+namespace llvm {
+class PassManager;
+class GlobalValue;
+
+/// PPCTargetMachine - Common code between 32-bit and 64-bit PowerPC targets.
+///
+class PPCTargetMachine : public LLVMTargetMachine {
+  PPCSubtarget        Subtarget;
+  const TargetData    DataLayout;       // Calculates type size & alignment
+  PPCInstrInfo        InstrInfo;
+  PPCFrameInfo        FrameInfo;
+  PPCJITInfo          JITInfo;
+  PPCTargetLowering   TLInfo;
+  InstrItineraryData  InstrItins;
+  PPCMachOWriterInfo  MachOWriterInfo;
+
+protected:
+  virtual const TargetAsmInfo *createTargetAsmInfo() const;
+  
+public:
+  PPCTargetMachine(const Module &M, const std::string &FS, bool is64Bit);
+
+  virtual const PPCInstrInfo     *getInstrInfo() const { return &InstrInfo; }
+  virtual const TargetFrameInfo  *getFrameInfo() const { return &FrameInfo; }
+  virtual       TargetJITInfo    *getJITInfo()         { return &JITInfo; }
+  virtual       PPCTargetLowering *getTargetLowering() const { 
+   return const_cast<PPCTargetLowering*>(&TLInfo); 
+  }
+  virtual const MRegisterInfo    *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  
+  virtual const TargetData    *getTargetData() const    { return &DataLayout; }
+  virtual const PPCSubtarget  *getSubtargetImpl() const { return &Subtarget; }
+  virtual const InstrItineraryData getInstrItineraryData() const {  
+    return InstrItins;
+  }
+  virtual const PPCMachOWriterInfo *getMachOWriterInfo() const {
+    return &MachOWriterInfo;
+  }
+  
+  // Pass Pipeline Configuration
+  virtual bool addInstSelector(FunctionPassManager &PM, bool Fast);
+  virtual bool addPreEmitPass(FunctionPassManager &PM, bool Fast);
+  virtual bool addAssemblyEmitter(FunctionPassManager &PM, bool Fast, 
+                                  std::ostream &Out);
+  virtual bool addCodeEmitter(FunctionPassManager &PM, bool Fast,
+                              MachineCodeEmitter &MCE);
+  virtual bool addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast,
+                                    MachineCodeEmitter &MCE);
+  virtual const bool getEnableTailMergeDefault() const;
+};
+
+/// PPC32TargetMachine - PowerPC 32-bit target machine.
+///
+class PPC32TargetMachine : public PPCTargetMachine {
+public:
+  PPC32TargetMachine(const Module &M, const std::string &FS);
+  
+  static unsigned getJITMatchQuality();
+  static unsigned getModuleMatchQuality(const Module &M);
+};
+
+/// PPC64TargetMachine - PowerPC 64-bit target machine.
+///
+class PPC64TargetMachine : public PPCTargetMachine {
+public:
+  PPC64TargetMachine(const Module &M, const std::string &FS);
+  
+  static unsigned getJITMatchQuality();
+  static unsigned getModuleMatchQuality(const Module &M);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt
new file mode 100644
index 0000000..69e60fc
--- /dev/null
+++ b/lib/Target/PowerPC/README.txt
@@ -0,0 +1,664 @@
+//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
+
+TODO:
+* gpr0 allocation
+* implement do-loop -> bdnz transform
+* __builtin_return_address not supported on PPC
+
+===-------------------------------------------------------------------------===
+
+Support 'update' load/store instructions.  These are cracked on the G5, but are
+still a codesize win.
+
+With preinc enabled, this:
+
+long *%test4(long *%X, long *%dest) {
+        %Y = getelementptr long* %X, int 4
+        %A = load long* %Y
+        store long %A, long* %dest
+        ret long* %Y
+}
+
+compiles to:
+
+_test4:
+        mr r2, r3
+        lwzu r5, 32(r2)
+        lwz r3, 36(r3)
+        stw r5, 0(r4)
+        stw r3, 4(r4)
+        mr r3, r2
+        blr 
+
+with -sched=list-burr, I get:
+
+_test4:
+        lwz r2, 36(r3)
+        lwzu r5, 32(r3)
+        stw r2, 4(r4)
+        stw r5, 0(r4)
+        blr 
+
+===-------------------------------------------------------------------------===
+
+We compile the hottest inner loop of viterbi to:
+
+        li r6, 0
+        b LBB1_84       ;bb432.i
+LBB1_83:        ;bb420.i
+        lbzx r8, r5, r7
+        addi r6, r7, 1
+        stbx r8, r4, r7
+LBB1_84:        ;bb432.i
+        mr r7, r6
+        cmplwi cr0, r7, 143
+        bne cr0, LBB1_83        ;bb420.i
+
+The CBE manages to produce:
+
+	li r0, 143
+	mtctr r0
+loop:
+	lbzx r2, r2, r11
+	stbx r0, r2, r9
+	addi r2, r2, 1
+	bdz later
+	b loop
+
+This could be much better (bdnz instead of bdz) but it still beats us.  If we
+produced this with bdnz, the loop would be a single dispatch group.
+
+===-------------------------------------------------------------------------===
+
+Compile:
+
+void foo(int *P) {
+ if (P)  *P = 0;
+}
+
+into:
+
+_foo:
+        cmpwi cr0,r3,0
+        beqlr cr0
+        li r0,0
+        stw r0,0(r3)
+        blr
+
+This is effectively a simple form of predication.
+
+===-------------------------------------------------------------------------===
+
+Lump the constant pool for each function into ONE pic object, and reference
+pieces of it as offsets from the start.  For functions like this (contrived
+to have lots of constants obviously):
+
+double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
+
+We generate:
+
+_X:
+        lis r2, ha16(.CPI_X_0)
+        lfd f0, lo16(.CPI_X_0)(r2)
+        lis r2, ha16(.CPI_X_1)
+        lfd f2, lo16(.CPI_X_1)(r2)
+        fmadd f0, f1, f0, f2
+        lis r2, ha16(.CPI_X_2)
+        lfd f1, lo16(.CPI_X_2)(r2)
+        lis r2, ha16(.CPI_X_3)
+        lfd f2, lo16(.CPI_X_3)(r2)
+        fmadd f1, f0, f1, f2
+        blr
+
+It would be better to materialize .CPI_X into a register, then use immediates
+off of the register to avoid the lis's.  This is even more important in PIC 
+mode.
+
+Note that this (and the static variable version) is discussed here for GCC:
+http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
+
+===-------------------------------------------------------------------------===
+
+PIC Code Gen IPO optimization:
+
+Squish small scalar globals together into a single global struct, allowing the 
+address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
+of the GOT on targets with one).
+
+Note that this is discussed here for GCC:
+http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
+
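+A rough sketch of the idea (hypothetical globals, not tied to any existing
+pass):
+
+/* Before: each global needs its own PIC/GOT access. */
+static int counter, limit, flags;
+
+/* After: one struct whose address can be computed once and CSE'd; each field
+   is then just a constant offset from that base. */
+static struct { int counter, limit, flags; } globals;
+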
+===-------------------------------------------------------------------------===
+
+Implement the Newton-Raphson method for refining estimate instructions to the
+correct accuracy, and implement divide as multiply by reciprocal when the
+reciprocal has more than one use.  Itanium will want this too.
+
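+A sketch of the refinement step for a reciprocal estimate (each iteration
+roughly doubles the number of correct bits; refine_recip is a hypothetical
+helper, and the initial estimate would come from something like fres):
+
+/* One Newton-Raphson step for f(x) = 1/x - a, i.e. x1 = x0 * (2 - a*x0). */
+double refine_recip(double a, double x0) {
+  return x0 * (2.0 - a * x0);
+}
+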
+===-------------------------------------------------------------------------===
+
+Compile this:
+
+int %f1(int %a, int %b) {
+        %tmp.1 = and int %a, 15         ; <int> [#uses=1]
+        %tmp.3 = and int %b, 240                ; <int> [#uses=1]
+        %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
+        ret int %tmp.4
+}
+
+without a copy.  We currently generate:
+
+_f1:
+        rlwinm r2, r4, 0, 24, 27
+        rlwimi r2, r3, 0, 28, 31
+        or r3, r2, r2
+        blr
+
+The two-addr pass or RA needs to learn when it is profitable to commute an
+instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
+currently only commutes to avoid inserting a copy BEFORE the two addr instr.
+
+===-------------------------------------------------------------------------===
+
+Compile offsets from allocas:
+
+int *%test() {
+        %X = alloca { int, int }
+        %Y = getelementptr {int,int}* %X, int 0, uint 1
+        ret int* %Y
+}
+
+into a single add, not two:
+
+_test:
+        addi r2, r1, -8
+        addi r3, r2, 4
+        blr
+
+--> important for C++.
+
+===-------------------------------------------------------------------------===
+
+No loads or stores of the constants should be needed:
+
+struct foo { double X, Y; };
+void xxx(struct foo F);
+void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
+
+===-------------------------------------------------------------------------===
+
+Darwin Stub LICM optimization:
+
+Loops like this:
+  
+  for (...)  bar();
+
+Have to go through an indirect stub if bar is external or linkonce.  It would 
+be better to compile it as:
+
+     fp = &bar;
+     for (...)  fp();
+
+which only computes the address of bar once (instead of each time through the 
+stub).  This is Darwin specific and would have to be done in the code generator.
+Probably not a win on x86.
+
+===-------------------------------------------------------------------------===
+
+Simple IPO for argument passing, change:
+  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
+
+the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
+of arguments get assigned to r3 through r10. That is, if you have a function
+foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
+argument bytes for r4 and r5. The trick then would be to shuffle the argument
+order for functions we can internalize so that the maximum number of 
+integers/pointers get passed in regs before you see any of the fp arguments.
+
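+For instance, applying the rule above (foo_reordered is a hypothetical
+internalized clone):
+
+void foo(int X, double Y, int Z);            /* X -> r3, Y -> f1, Z -> r6 */
+void foo_reordered(int X, int Z, double Y);  /* X -> r3, Z -> r4, Y -> f1 */
+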
+Instead of implementing this, it would actually probably be easier to just 
+implement a PPC fastcc, where we could do whatever we wanted to the CC, 
+including having this work sanely.
+
+===-------------------------------------------------------------------------===
+
+Fix Darwin FP-In-Integer Registers ABI
+
+Darwin passes doubles in structures in integer registers, which is very very 
+bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation 
+that percolates these things out of functions.
+
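+A small illustration of the problem (hypothetical function):
+
+struct P { double x, y; };
+double dot(struct P a, struct P b) { return a.x*b.x + a.y*b.y; }
+
+The by-value structs arrive in GPRs, so each double has to be stored to memory
+and reloaded into an FPR before the multiplies can issue.
+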
+Check out how horrible this is:
+http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
+
+This is an extension of "interprocedural CC unmunging" that can't be done with
+just fastcc.
+
+===-------------------------------------------------------------------------===
+
+Compile this:
+
+int foo(int a) {
+  int b = (a < 8);
+  if (b) {
+    return b * 3;     // ignore the fact that this is always 3.
+  } else {
+    return 2;
+  }
+}
+
+into something not this:
+
+_foo:
+1)      cmpwi cr7, r3, 8
+        mfcr r2, 1
+        rlwinm r2, r2, 29, 31, 31
+1)      cmpwi cr0, r3, 7
+        bgt cr0, LBB1_2 ; UnifiedReturnBlock
+LBB1_1: ; then
+        rlwinm r2, r2, 0, 31, 31
+        mulli r3, r2, 3
+        blr
+LBB1_2: ; UnifiedReturnBlock
+        li r3, 2
+        blr
+
+In particular, the two compares (marked 1) could be shared by reversing one.
+This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
+same operands (but backwards) exists.  In this case, this wouldn't save us 
+anything though, because the compares still wouldn't be shared.
+
+===-------------------------------------------------------------------------===
+
+We should custom expand setcc instead of pretending that we have it.  That
+would allow us to expose the access of the crbit after the mfcr, allowing
+that access to be trivially folded into other ops.  A simple example:
+
+int foo(int a, int b) { return (a < b) << 4; }
+
+compiles into:
+
+_foo:
+        cmpw cr7, r3, r4
+        mfcr r2, 1
+        rlwinm r2, r2, 29, 31, 31
+        slwi r3, r2, 4
+        blr
+
+===-------------------------------------------------------------------------===
+
+Fold add and sub with constant into non-extern, non-weak addresses so this:
+
+static int a;
+void bar(int b) { a = b; }
+void foo(unsigned char *c) {
+  *c = a;
+}
+
+So that 
+
+_foo:
+        lis r2, ha16(_a)
+        la r2, lo16(_a)(r2)
+        lbz r2, 3(r2)
+        stb r2, 0(r3)
+        blr
+
+Becomes
+
+_foo:
+        lis r2, ha16(_a+3)
+        lbz r2, lo16(_a+3)(r2)
+        stb r2, 0(r3)
+        blr
+
+===-------------------------------------------------------------------------===
+
+We generate really bad code for this:
+
+int f(signed char *a, _Bool b, _Bool c) {
+   signed char t = 0;
+  if (b)  t = *a;
+  if (c)  *a = t;
+}
+
+===-------------------------------------------------------------------------===
+
+This:
+int test(unsigned *P) { return *P >> 24; }
+
+Should compile to:
+
+_test:
+        lbz r3,0(r3)
+        blr
+
+not:
+
+_test:
+        lwz r2, 0(r3)
+        srwi r3, r2, 24
+        blr
+
+===-------------------------------------------------------------------------===
+
+On the G5, logical CR operations are more expensive in their three
+address form: ops that read/write the same register are half as expensive as
+those that read from two registers that are different from their destination.
+
+We should model this with two separate instructions.  The isel should generate
+the "two address" form of the instructions.  When the register allocator 
+detects that it needs to insert a copy due to the two-addressness of the CR
+logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
+we can convert to the "three address" instruction, to save code space.
+
+This only matters when we start generating cr logical ops.
+
+===-------------------------------------------------------------------------===
+
+We should compile these two functions to the same thing:
+
+#include <stdlib.h>
+void f(int a, int b, int *P) {
+  *P = (a-b)>=0?(a-b):(b-a);
+}
+void g(int a, int b, int *P) {
+  *P = abs(a-b);
+}
+
+Further, they should compile to something better than:
+
+_g:
+        subf r2, r4, r3
+        subfic r3, r2, 0
+        cmpwi cr0, r2, -1
+        bgt cr0, LBB2_2 ; entry
+LBB2_1: ; entry
+        mr r2, r3
+LBB2_2: ; entry
+        stw r2, 0(r5)
+        blr
+
+GCC produces:
+
+_g:
+        subf r4,r4,r3
+        srawi r2,r4,31
+        xor r0,r2,r4
+        subf r0,r2,r0
+        stw r0,0(r5)
+        blr
+
+... which is much nicer.
+
+This theoretically may help improve twolf slightly (used in dimbox.c:142?).
+
+===-------------------------------------------------------------------------===
+
+int foo(int N, int ***W, int **TK, int X) {
+  int t, i;
+  
+  for (t = 0; t < N; ++t)
+    for (i = 0; i < 4; ++i)
+      W[t / X][i][t % X] = TK[i][t];
+      
+  return 5;
+}
+
+We generate relatively atrocious code for this loop compared to gcc.
+
+We could also strength reduce the rem and the div:
+http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
+
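+A sketch of the strength reduction (assuming X > 0; q and r track the quotient
+and remainder incrementally):
+
+  for (t = 0, q = 0, r = 0; t < N; ++t) {
+    for (i = 0; i < 4; ++i)
+      W[q][i][r] = TK[i][t];           /* q == t / X, r == t % X */
+    if (++r == X) { r = 0; ++q; }
+  }
+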
+===-------------------------------------------------------------------------===
+
+float foo(float X) { return (int)(X); }
+
+Currently produces:
+
+_foo:
+        fctiwz f0, f1
+        stfd f0, -8(r1)
+        lwz r2, -4(r1)
+        extsw r2, r2
+        std r2, -16(r1)
+        lfd f0, -16(r1)
+        fcfid f0, f0
+        frsp f1, f0
+        blr
+
+We could use a target dag combine to turn the lwz/extsw into an lwa when the 
+lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
+win only.
+
+===-------------------------------------------------------------------------===
+
+We generate ugly code for this:
+
+void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
+  unsigned code = 0;
+  if(dx < -dw) code |= 1;
+  if(dx > dw)  code |= 2;
+  if(dy < -dw) code |= 4;
+  if(dy > dw)  code |= 8;
+  if(dz < -dw) code |= 16;
+  if(dz > dw)  code |= 32;
+  *ret = code;
+}
+
+===-------------------------------------------------------------------------===
+
+Complete the signed i32 to FP conversion code using 64-bit registers
+transformation, good for PI.  See PPCISelLowering.cpp, this comment:
+
+     // FIXME: disable this lowered code.  This generates 64-bit register values,
+     // and we don't model the fact that the top part is clobbered by calls.  We
+     // need to flag these together so that the value isn't live across a call.
+     //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+
+Also, if the registers are spilled to the stack, we have to ensure that all
+64-bits of them are save/restored, otherwise we will miscompile the code.  It
+sounds like we need to get the 64-bit register classes going.
+
+===-------------------------------------------------------------------------===
+
+%struct.B = type { i8, [3 x i8] }
+
+define void @bar(%struct.B* %b) {
+entry:
+        %tmp = bitcast %struct.B* %b to i32*              ; <uint*> [#uses=1]
+        %tmp = load i32* %tmp          ; <uint> [#uses=1]
+        %tmp3 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=1]
+        %tmp4 = load i32* %tmp3                ; <uint> [#uses=1]
+        %tmp8 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=2]
+        %tmp9 = load i32* %tmp8                ; <uint> [#uses=1]
+        %tmp4.mask17 = shl i32 %tmp4, i8 1          ; <uint> [#uses=1]
+        %tmp1415 = and i32 %tmp4.mask17, 2147483648            ; <uint> [#uses=1]
+        %tmp.masked = and i32 %tmp, 2147483648         ; <uint> [#uses=1]
+        %tmp11 = or i32 %tmp1415, %tmp.masked          ; <uint> [#uses=1]
+        %tmp12 = and i32 %tmp9, 2147483647             ; <uint> [#uses=1]
+        %tmp13 = or i32 %tmp12, %tmp11         ; <uint> [#uses=1]
+        store i32 %tmp13, i32* %tmp8
+        ret void
+}
+
+We emit:
+
+_foo:
+        lwz r2, 0(r3)
+        slwi r4, r2, 1
+        or r4, r4, r2
+        rlwimi r2, r4, 0, 0, 0
+        stw r2, 0(r3)
+        blr
+
+We could collapse a bunch of those ORs and ANDs and generate the following
+equivalent code:
+
+_foo:
+        lwz r2, 0(r3)
+        rlwinm r4, r2, 1, 0, 0
+        or r2, r2, r4
+        stw r2, 0(r3)
+        blr
+
+===-------------------------------------------------------------------------===
+
+We compile:
+
+unsigned test6(unsigned x) { 
+  return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
+}
+
+into:
+
+_test6:
+        lis r2, 255
+        rlwinm r3, r3, 16, 0, 31
+        ori r2, r2, 255
+        and r3, r3, r2
+        blr
+
+GCC gets it down to:
+
+_test6:
+        rlwinm r0,r3,16,8,15
+        rlwinm r3,r3,16,24,31
+        or r3,r3,r0
+        blr
+
+
+===-------------------------------------------------------------------------===
+
+Consider a function like this:
+
+float foo(float X) { return X + 1234.4123f; }
+
+The FP constant ends up in the constant pool, so we need to get the LR register.
+ This ends up producing code like this:
+
+_foo:
+.LBB_foo_0:     ; entry
+        mflr r11
+***     stw r11, 8(r1)
+        bl "L00000$pb"
+"L00000$pb":
+        mflr r2
+        addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
+        lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
+        fadds f1, f1, f0
+***     lwz r11, 8(r1)
+        mtlr r11
+        blr
+
+This is functional, but there is no reason to spill the LR register all the way
+to the stack (the two marked instrs): spilling it to a GPR is quite enough.
+
+Implementing this will require some codegen improvements.  Nate writes:
+
+"So basically what we need to support the "no stack frame save and restore" is a
+generalization of the LR optimization to "callee-save regs".
+
+Currently, we have LR marked as a callee-save reg.  The register allocator sees
+that it's callee save, and spills it directly to the stack.
+
+Ideally, something like this would happen:
+
+LR would be in a separate register class from the GPRs. The class of LR would be
+marked "unspillable".  When the register allocator came across an unspillable
+reg, it would ask "what is the best class to copy this into that I *can* spill"
+If it gets a class back, which it will in this case (the gprs), it grabs a free
+register of that class.  If it is then later necessary to spill that reg, so be
+it.
+
+===-------------------------------------------------------------------------===
+
+We compile this:
+int test(_Bool X) {
+  return X ? 524288 : 0;
+}
+
+to: 
+_test:
+        cmplwi cr0, r3, 0
+        lis r2, 8
+        li r3, 0
+        beq cr0, LBB1_2 ;entry
+LBB1_1: ;entry
+        mr r3, r2
+LBB1_2: ;entry
+        blr 
+
+instead of:
+_test:
+        addic r2,r3,-1
+        subfe r0,r2,r3
+        slwi r3,r0,19
+        blr
+
+This sort of thing occurs a lot due to globalopt.
+
+===-------------------------------------------------------------------------===
+
+We currently compile 32-bit bswap:
+
+declare i32 @llvm.bswap.i32(i32 %A)
+define i32 @test(i32 %A) {
+        %B = call i32 @llvm.bswap.i32(i32 %A)
+        ret i32 %B
+}
+
+to:
+
+_test:
+        rlwinm r2, r3, 24, 16, 23
+        slwi r4, r3, 24
+        rlwimi r2, r3, 8, 24, 31
+        rlwimi r4, r3, 8, 8, 15
+        rlwimi r4, r2, 0, 16, 31
+        mr r3, r4
+        blr 
+
+it would be more efficient to produce:
+
+_foo:   mr r0,r3
+        rlwinm r3,r3,8,0xffffffff
+        rlwimi r3,r0,24,0,7
+        rlwimi r3,r0,24,16,23
+        blr
+
+===-------------------------------------------------------------------------===
+
+test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
+
+__ZNK4llvm5APInt17countLeadingZerosEv:
+        ld r2, 0(r3)
+        cntlzd r2, r2
+        or r2, r2, r2     <<-- silly.
+        addi r3, r2, -64
+        blr 
+
+The dead or is a 'truncate' from 64- to 32-bits.
+
+===-------------------------------------------------------------------------===
+
+We generate horrible ppc code for this:
+
+#define N  2000000
+double   a[N],c[N];
+void simpleloop() {
+   int j;
+   for (j=0; j<N; j++)
+     c[j] = a[j];
+}
+
+LBB1_1: ;bb
+        lfdx f0, r3, r4
+        addi r5, r5, 1                 ;; Extra IV for the exit value compare.
+        stfdx f0, r2, r4
+        addi r4, r4, 8
+
+        xoris r6, r5, 30               ;; This is due to a large immediate.
+        cmplwi cr0, r6, 33920
+        bne cr0, LBB1_1
+
+===-------------------------------------------------------------------------===
+
diff --git a/lib/Target/PowerPC/README_ALTIVEC.txt b/lib/Target/PowerPC/README_ALTIVEC.txt
new file mode 100644
index 0000000..143804d
--- /dev/null
+++ b/lib/Target/PowerPC/README_ALTIVEC.txt
@@ -0,0 +1,179 @@
+//===- README_ALTIVEC.txt - Notes for improving Altivec code gen ----------===//
+
+Implement PPCInstrInfo::isLoadFromStackSlot/isStoreToStackSlot for vector
+registers, to generate better spill code.
+
+//===----------------------------------------------------------------------===//
+
+The first should be a single lvx from the constant pool, the second should be 
+a xor/stvx:
+
+void foo(void) {
+  int x[8] __attribute__((aligned(128))) = { 1, 1, 1, 17, 1, 1, 1, 1 };
+  bar (x);
+}
+
+#include <string.h>
+void foo(void) {
+  int x[8] __attribute__((aligned(128)));
+  memset (x, 0, sizeof (x));
+  bar (x);
+}
+
+//===----------------------------------------------------------------------===//
+
+Altivec: Codegen'ing MUL with vector FMADD should add -0.0, not 0.0:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=8763
+
+Adding +0.0 turns a -0.0 product into +0.0, whereas adding -0.0 leaves every
+product unchanged.  When -ffast-math is on, we can use 0.0.
+
+//===----------------------------------------------------------------------===//
+
+  Consider this:
+  v4f32 Vector;
+  v4f32 Vector2 = { Vector.X, Vector.X, Vector.X, Vector.X };
+
+Since we know that "Vector" is 16-byte aligned and we know the element offset 
+of ".X", we should change the load into a lve*x instruction, instead of doing
+a load/store/lve*x sequence.
+
+//===----------------------------------------------------------------------===//
+
+For functions that use altivec AND have calls, we are VRSAVE'ing all call
+clobbered regs.
+
+//===----------------------------------------------------------------------===//
+
+Implement passing vectors by value into calls and receiving them as arguments.
+
+//===----------------------------------------------------------------------===//
+
+GCC apparently tries to codegen { C1, C2, Variable, C3 } as a constant pool load
+of C1/C2/C3, then a load and vperm of Variable.
+
+//===----------------------------------------------------------------------===//
+
+We need a way to teach tblgen that some operands of an intrinsic are required to
+be constants.  The verifier should enforce this constraint.
+
+//===----------------------------------------------------------------------===//
+
+We currently codegen SCALAR_TO_VECTOR as a store of the scalar to a 16-byte
+aligned stack slot, followed by a load/vperm.  We should probably just store it
+to a scalar stack slot, then use lvsl/vperm to load it.  If the value is already
+in memory this is a big win.
+
+//===----------------------------------------------------------------------===//
+
+extract_vector_elt of an arbitrary constant vector can be done with the 
+following instructions:
+
+vTemp = vec_splat(v0,2);    // 2 is the element the src is in.
+vec_ste(&destloc,0,vTemp);
+
+We can do an arbitrary non-constant value by using lvsr/perm/ste.
+
+//===----------------------------------------------------------------------===//
+
+If we want to tie instruction selection into the scheduler, we can do some
+constant formation with different instructions.  For example, we can generate
+"vsplti -1" with "vcmpequw R,R" and 1,1,1,1 with "vsubcuw R,R", and 0,0,0,0 with
+"vsplti 0" or "vxor", each of which use different execution units, thus could
+help scheduling.
+
+This is probably only reasonable for a post-pass scheduler.
+
+//===----------------------------------------------------------------------===//
+
+For this function:
+
+void test(vector float *A, vector float *B) {
+  vector float C = (vector float)vec_cmpeq(*A, *B);
+  if (!vec_any_eq(*A, *B))
+    *B = (vector float){0,0,0,0};
+  *A = C;
+}
+
+we get the following basic block:
+
+	...
+        lvx v2, 0, r4
+        lvx v3, 0, r3
+        vcmpeqfp v4, v3, v2
+        vcmpeqfp. v2, v3, v2
+        bne cr6, LBB1_2 ; cond_next
+
+The vcmpeqfp/vcmpeqfp. instructions currently cannot be merged when the
+vcmpeqfp. result is used by a branch.  This can be improved.
+
+//===----------------------------------------------------------------------===//
+
+The code generated for this is truly awful:
+
+vector float test(float a, float b) {
+ return (vector float){ 0.0, a, 0.0, 0.0}; 
+}
+
+LCPI1_0:                                        ;  float
+        .space  4
+        .text
+        .globl  _test
+        .align  4
+_test:
+        mfspr r2, 256
+        oris r3, r2, 4096
+        mtspr 256, r3
+        lis r3, ha16(LCPI1_0)
+        addi r4, r1, -32
+        stfs f1, -16(r1)
+        addi r5, r1, -16
+        lfs f0, lo16(LCPI1_0)(r3)
+        stfs f0, -32(r1)
+        lvx v2, 0, r4
+        lvx v3, 0, r5
+        vmrghw v3, v3, v2
+        vspltw v2, v2, 0
+        vmrghw v2, v2, v3
+        mtspr 256, r2
+        blr
+
+//===----------------------------------------------------------------------===//
+
+int foo(vector float *x, vector float *y) {
+        if (vec_all_eq(*x,*y)) return 3245; 
+        else return 12;
+}
+
+A predicate compare being used in a select_cc should have the same peephole
+applied to it as a predicate compare used by a br_cc.  There should be no
+mfcr here:
+
+_foo:
+        mfspr r2, 256
+        oris r5, r2, 12288
+        mtspr 256, r5
+        li r5, 12
+        li r6, 3245
+        lvx v2, 0, r4
+        lvx v3, 0, r3
+        vcmpeqfp. v2, v3, v2
+        mfcr r3, 2
+        rlwinm r3, r3, 25, 31, 31
+        cmpwi cr0, r3, 0
+        bne cr0, LBB1_2 ; entry
+LBB1_1: ; entry
+        mr r6, r5
+LBB1_2: ; entry
+        mr r3, r6
+        mtspr 256, r2
+        blr
+
+//===----------------------------------------------------------------------===//
+
+CodeGen/PowerPC/vec_constants.ll has an and operation that should be
+codegen'd to andc.  The issue is that the 'all ones' build vector is
+SelectNodeTo'd to a VSPLTISB instruction node before the and/xor is selected,
+which prevents the vnot pattern from matching.
+
+
+//===----------------------------------------------------------------------===//
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
new file mode 100644
index 0000000..37b671f
--- /dev/null
+++ b/lib/Target/README.txt
@@ -0,0 +1,451 @@
+Target Independent Opportunities:
+
+//===---------------------------------------------------------------------===//
+
+With the recent changes to make the implicit def/use set explicit in
+machineinstrs, we should change the target descriptions for 'call' instructions
+so that the .td files don't list all the call-clobbered registers as implicit
+defs.  Instead, these should be added by the code generator (e.g. on the dag).
+
+This has a number of uses:
+
+1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions
+   for their different impdef sets.
+2. Targets with multiple calling convs (e.g. x86) which have different clobber
+   sets don't need copies of call instructions.
+3. 'Interprocedural register allocation' can be done to reduce the clobber sets
+   of calls.
+
+//===---------------------------------------------------------------------===//
+
+Make the PPC branch selector target independent
+
+//===---------------------------------------------------------------------===//
+
+Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
+precision don't matter (-ffast-math).  Misc/mandel will like this. :)
+
+//===---------------------------------------------------------------------===//
+
+Solve this DAG isel folding deficiency:
+
+int X, Y;
+
+void fn1(void)
+{
+  X = X | (Y << 3);
+}
+
+compiles to
+
+fn1:
+	movl Y, %eax
+	shll $3, %eax
+	orl X, %eax
+	movl %eax, X
+	ret
+
+The problem is the store's chain operand is not the load X but rather
+a TokenFactor of the load X and load Y, which prevents the folding.
+
+There are two ways to fix this:
+
+1. The dag combiner can start using alias analysis to realize that y/x
+   don't alias, making the store to X not dependent on the load from Y.
+2. The generated isel could be made smarter in the case it can't
+   disambiguate the pointers.
+
+Number 1 is the preferred solution.
+
+This has been "fixed" by a TableGen hack. But that is a short term workaround
+which will be removed once the proper fix is made.
+
+//===---------------------------------------------------------------------===//
+
+On targets with an expensive 64-bit shift, we could LSR this:
+
+for (i = ...; ++i) {
+   x = 1ULL << i;
+
+into:
+ long long tmp = 1;
+ for (i = ...; ++i, tmp+=tmp)
+   x = tmp;
+
+This would be a win on ppc32, but not x86 or ppc64.
+
+//===---------------------------------------------------------------------===//
+
+Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0)
+
+//===---------------------------------------------------------------------===//
+
+Reassociate should turn: X*X*X*X -> t=(X*X) (t*t) to eliminate a multiply.
+
+//===---------------------------------------------------------------------===//
+
+Interesting? testcase for add/shift/mul reassoc:
+
+int bar(int x, int y) {
+  return x*x*x+y+x*x*x*x*x*y*y*y*y;
+}
+int foo(int z, int n) {
+  return bar(z, n) + bar(2*z, 2*n);
+}
+
+Reassociate should handle the example in GCC PR16157.
+
+//===---------------------------------------------------------------------===//
+
+These two functions should generate the same code on big-endian systems:
+
+int g(int *j,int *l)  {  return memcmp(j,l,4);  }
+int h(int *j, int *l) {  return *j - *l; }
+
+this could be done in SelectionDAGISel.cpp, along with other special cases,
+for 1,2,4,8 bytes.
+
+//===---------------------------------------------------------------------===//
+
+It would be nice to revert this patch:
+http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
+
+And teach the dag combiner enough to simplify the code expanded before 
+legalize.  It seems plausible that this knowledge would let it simplify other
+stuff too.
+
+//===---------------------------------------------------------------------===//
+
+For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal
+to the type size. It works but can be overly conservative as the alignment of
+specific vector types is target dependent.
+
+//===---------------------------------------------------------------------===//
+
+We should add 'unaligned load/store' nodes, and produce them from code like
+this:
+
+v4sf example(float *P) {
+  return (v4sf){P[0], P[1], P[2], P[3] };
+}
+
+//===---------------------------------------------------------------------===//
+
+We should constant fold vector type casts at the LLVM level, regardless of the
+cast.  Currently we cannot fold some casts because we don't have TargetData
+information in the constant folder, so we don't know the endianness of the 
+target!
+
+//===---------------------------------------------------------------------===//
+
+Add support for conditional increments, and other related patterns.  Instead
+of:
+
+	movl 136(%esp), %eax
+	cmpl $0, %eax
+	je LBB16_2	#cond_next
+LBB16_1:	#cond_true
+	incl _foo
+LBB16_2:	#cond_next
+
+emit:
+	movl	_foo, %eax
+	cmpl	$1, %edi
+	sbbl	$-1, %eax
+	movl	%eax, _foo
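+
+For reference, a sketch of the C source that typically produces this pattern
+(maybe_inc and foo are illustrative names):
+
+int foo;
+void maybe_inc(int x) {
+  if (x)        /* the branchy form shown first                  */
+    ++foo;      /* the cmp/sbb form adds (x != 0) branchlessly   */
+}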
+
+//===---------------------------------------------------------------------===//
+
+Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
+
+Expand these to calls of sin/cos and stores:
+      double sincos(double x, double *sin, double *cos);
+      float sincosf(float x, float *sin, float *cos);
+      long double sincosl(long double x, long double *sin, long double *cos);
+
+Doing so could allow SROA of the destination pointers.  See also:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
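+
+A sketch of the intended rewrite (sincos is a GNU libc extension; before and
+after_ are illustrative names):
+
+#define _GNU_SOURCE        /* sincos() is a GNU extension */
+#include <math.h>
+void before(double x, double *s, double *c) { *s = sin(x); *c = cos(x); }
+void after_(double x, double *s, double *c) { sincos(x, s, c); }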
+
+//===---------------------------------------------------------------------===//
+
+Scalar Repl cannot currently promote this testcase to 'ret long cst':
+
+        %struct.X = type { i32, i32 }
+        %struct.Y = type { %struct.X }
+
+define i64 @bar() {
+        %retval = alloca %struct.Y, align 8
+        %tmp12 = getelementptr %struct.Y* %retval, i32 0, i32 0, i32 0
+        store i32 0, i32* %tmp12
+        %tmp15 = getelementptr %struct.Y* %retval, i32 0, i32 0, i32 1
+        store i32 1, i32* %tmp15
+        %retval.upgrd.1 = bitcast %struct.Y* %retval to i64*
+        %retval.upgrd.2 = load i64* %retval.upgrd.1
+        ret i64 %retval.upgrd.2
+}
+
+it should be extended to do so.
+
+//===---------------------------------------------------------------------===//
+
+-scalarrepl should promote this to be a vector scalar.
+
+        %struct..0anon = type { <4 x float> }
+
+define void @test1(<4 x float> %V, float* %P) {
+        %u = alloca %struct..0anon, align 16
+        %tmp = getelementptr %struct..0anon* %u, i32 0, i32 0
+        store <4 x float> %V, <4 x float>* %tmp
+        %tmp1 = bitcast %struct..0anon* %u to [4 x float]*
+        %tmp.upgrd.1 = getelementptr [4 x float]* %tmp1, i32 0, i32 1
+        %tmp.upgrd.2 = load float* %tmp.upgrd.1
+        %tmp3 = mul float %tmp.upgrd.2, 2.000000e+00
+        store float %tmp3, float* %P
+        ret void
+}
+
+//===---------------------------------------------------------------------===//
+
+Turn this into a single byte store with no load (the other 3 bytes are
+unmodified):
+
+void %test(uint* %P) {
+	%tmp = load uint* %P
+        %tmp14 = or uint %tmp, 3305111552
+        %tmp15 = and uint %tmp14, 3321888767
+        store uint %tmp15, uint* %P
+        ret void
+}
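+
+In C terms (a sketch using a 32-bit unsigned), the or/and pair above only ever
+changes the most significant byte, forcing it to 0xC5:
+
+void test_c(unsigned *P) {
+  *P = (*P & 0x00FFFFFFu) | 0xC5000000u;   /* i.e. store the byte 0xC5 into the MSB */
+}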
+
+//===---------------------------------------------------------------------===//
+
+dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x.
+
+Compile:
+
+int bar(int x)
+{
+  int t = __builtin_clz(x);
+  return -(t>>5);
+}
+
+to:
+
+_bar:   addic r3,r3,-1
+        subfe r3,r3,r3
+        blr
+
+//===---------------------------------------------------------------------===//
+
+Legalize should lower cttz like this:
+  cttz(x) = popcnt((x-1) & ~x)
+
+on targets that have popcnt but not cttz (ctlz can likewise be built from
+popcnt by smearing the leading one bit down first).  itanium, what else?
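+
+A sketch of the identity in C, assuming x != 0 and using __builtin_popcount to
+stand in for the target's popcnt:
+
+unsigned cttz32(unsigned x) {
+  /* (x-1) & ~x has ones in exactly the trailing-zero positions of x */
+  return (unsigned)__builtin_popcount((x - 1) & ~x);
+}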
+
+//===---------------------------------------------------------------------===//
+
+quantum_sigma_x in 462.libquantum contains the following loop:
+
+      for(i=0; i<reg->size; i++)
+	{
+	  /* Flip the target bit of each basis state */
+	  reg->node[i].state ^= ((MAX_UNSIGNED) 1 << target);
+	} 
+
+Where MAX_UNSIGNED/state is a 64-bit int.  On a 32-bit platform it would be just
+so cool to turn it into something like:
+
+   long long Res = ((MAX_UNSIGNED) 1 << target);
+   if (target < 32) {
+     for(i=0; i<reg->size; i++)
+       reg->node[i].state ^= Res & 0xFFFFFFFFULL;
+   } else {
+     for(i=0; i<reg->size; i++)
+       reg->node[i].state ^= Res & 0xFFFFFFFF00000000ULL;
+   }
+   
+... which would only do one 32-bit XOR per loop iteration instead of two.
+
+It would also be nice to recognize that reg->size doesn't alias reg->node[i], but
+alas...
+
+//===---------------------------------------------------------------------===//
+
+This isn't recognized as bswap by instcombine:
+
+unsigned int swap_32(unsigned int v) {
+  v = ((v & 0x00ff00ffU) << 8)  | ((v & 0xff00ff00U) >> 8);
+  v = ((v & 0x0000ffffU) << 16) | ((v & 0xffff0000U) >> 16);
+  return v;
+}
+
+Nor is this (yes, it really is bswap):
+
+unsigned long reverse(unsigned v) {
+    unsigned t;
+    t = v ^ ((v << 16) | (v >> 16));
+    t &= ~0xff0000;
+    v = (v << 24) | (v >> 8);
+    return v ^ (t >> 8);
+}
+
+//===---------------------------------------------------------------------===//
+
+These should turn into single 16-bit (unaligned?) loads on little/big endian
+processors.
+
+unsigned short read_16_le(const unsigned char *adr) {
+  return adr[0] | (adr[1] << 8);
+}
+unsigned short read_16_be(const unsigned char *adr) {
+  return (adr[0] << 8) | adr[1];
+}
+
+//===---------------------------------------------------------------------===//
+
+-instcombine should handle this transform:
+   icmp pred (sdiv X / C1 ), C2
+when X, C1, and C2 are unsigned.  Similarly for udiv and signed operands. 
+
+Currently InstCombine avoids this transform but will do it when the signs of
+the operands and the sign of the divide match. See the FIXME in 
+InstructionCombining.cpp in the visitSetCondInst method after the switch case 
+for Instruction::UDiv (around line 4447) for more details.
+
+The SingleSource/Benchmarks/Shootout-C++/hash and hash2 tests have examples of
+this construct. 
+
+//===---------------------------------------------------------------------===//
+
+Instcombine misses several of these cases (see the testcase in the patch):
+http://gcc.gnu.org/ml/gcc-patches/2006-10/msg01519.html
+
+//===---------------------------------------------------------------------===//
+
+viterbi speeds up *significantly* if the various "history" related copy loops
+are turned into memcpy calls at the source level.  We need a "loops to memcpy"
+pass.
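+
+A sketch of the kind of copy loop such a pass would need to catch (copy_history
+is an illustrative name, not taken from viterbi itself):
+
+void copy_history(short *dst, const short *src, int n) {
+  int i;
+  for (i = 0; i < n; ++i)
+    dst[i] = src[i];    /* should become memcpy(dst, src, n * sizeof(short)) */
+}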
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+
+typedef unsigned U32;
+typedef unsigned long long U64;
+int test (U32 *inst, U64 *regs) {
+    U64 effective_addr2;
+    U32 temp = *inst;
+    int r1 = (temp >> 20) & 0xf;
+    int b2 = (temp >> 16) & 0xf;
+    effective_addr2 = temp & 0xfff;
+    if (b2) effective_addr2 += regs[b2];
+    b2 = (temp >> 12) & 0xf;
+    if (b2) effective_addr2 += regs[b2];
+    effective_addr2 &= regs[4];
+     if ((effective_addr2 & 3) == 0)
+        return 1;
+    return 0;
+}
+
+Note that only the low 2 bits of effective_addr2 are used.  On 32-bit systems,
+we don't eliminate the computation of the top half of effective_addr2 because
+we don't have whole-function selection dags.  On x86, this means we use one
+extra register for the function when effective_addr2 is declared as U64 than
+when it is declared U32.
+
+//===---------------------------------------------------------------------===//
+
+Promote for i32 bswap can use i64 bswap + shr.  Useful on targets with 64-bit
+regs and bswap, like itanium.
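+
+A sketch of that promotion, assuming a 64-bit byte-swap primitive such as
+GCC/Clang's __builtin_bswap64 is available:
+
+#include <stdint.h>
+uint32_t bswap32_via_64(uint32_t x) {
+  return (uint32_t)(__builtin_bswap64((uint64_t)x) >> 32);
+}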
+
+//===---------------------------------------------------------------------===//
+
+LSR should know what GPR types a target has.  This code:
+
+volatile short X, Y; // globals
+
+void foo(int N) {
+  int i;
+  for (i = 0; i < N; i++) { X = i; Y = i*4; }
+}
+
+produces two identical IV's (after promotion) on PPC/ARM:
+
+LBB1_1: @bb.preheader
+        mov r3, #0
+        mov r2, r3
+        mov r1, r3
+LBB1_2: @bb
+        ldr r12, LCPI1_0
+        ldr r12, [r12]
+        strh r2, [r12]
+        ldr r12, LCPI1_1
+        ldr r12, [r12]
+        strh r3, [r12]
+        add r1, r1, #1    <- [0,+,1]
+        add r3, r3, #4
+        add r2, r2, #1    <- [0,+,1]
+        cmp r1, r0
+        bne LBB1_2      @bb
+
+
+//===---------------------------------------------------------------------===//
+
+Tail call elim should be more aggressive, checking to see if the call is
+followed by an uncond branch to an exit block.
+
+; This testcase is due to tail-duplication not wanting to copy the return
+; instruction into the terminating blocks because there was other code
+; optimized out of the function after the taildup happened.
+;RUN: llvm-upgrade < %s | llvm-as | opt -tailcallelim | llvm-dis | not grep call
+
+int %t4(int %a) {
+entry:
+        %tmp.1 = and int %a, 1
+        %tmp.2 = cast int %tmp.1 to bool
+        br bool %tmp.2, label %then.0, label %else.0
+
+then.0:
+        %tmp.5 = add int %a, -1
+        %tmp.3 = call int %t4( int %tmp.5 )
+        br label %return
+
+else.0:
+        %tmp.7 = setne int %a, 0
+        br bool %tmp.7, label %then.1, label %return
+
+then.1:
+        %tmp.11 = add int %a, -2
+        %tmp.9 = call int %t4( int %tmp.11 )
+        br label %return
+
+return:
+        %result.0 = phi int [ 0, %else.0 ], [ %tmp.3, %then.0 ],
+                            [ %tmp.9, %then.1 ]
+        ret int %result.0
+}
+
+//===---------------------------------------------------------------------===//
+
+Argument promotion should promote arguments for recursive functions, like 
+this:
+
+; RUN: llvm-upgrade < %s | llvm-as | opt -argpromotion | llvm-dis | grep x.val
+
+implementation   ; Functions:
+
+internal int %foo(int* %x) {
+entry:
+        %tmp = load int* %x
+        %tmp.foo = call int %foo(int *%x)
+        ret int %tmp.foo
+}
+
+int %bar(int* %x) {
+entry:
+        %tmp3 = call int %foo( int* %x)                ; <int>[#uses=1]
+        ret int %tmp3
+}
+
+
+
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
new file mode 100644
index 0000000..784f1bd
--- /dev/null
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -0,0 +1,76 @@
+//===-- DelaySlotFiller.cpp - SPARC delay slot filler ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a simple local pass that fills delay slots with NOPs.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "delayslotfiller"
+#include "Sparc.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(FilledSlots, "Number of delay slots filled");
+
+namespace {
+  struct Filler : public MachineFunctionPass {
+    /// Target machine description which we query for reg. names, data
+    /// layout, etc.
+    ///
+    TargetMachine &TM;
+    const TargetInstrInfo *TII;
+
+    static char ID;
+    Filler(TargetMachine &tm) 
+      : MachineFunctionPass((intptr_t)&ID), TM(tm), TII(tm.getInstrInfo()) { }
+
+    virtual const char *getPassName() const {
+      return "SPARC Delay Slot Filler";
+    }
+
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+    bool runOnMachineFunction(MachineFunction &F) {
+      bool Changed = false;
+      for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+           FI != FE; ++FI)
+        Changed |= runOnMachineBasicBlock(*FI);
+      return Changed;
+    }
+
+  };
+  char Filler::ID = 0;
+} // end of anonymous namespace
+
+/// createSparcDelaySlotFillerPass - Returns a pass that fills in delay
+/// slots in Sparc MachineFunctions
+///
+FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) {
+  return new Filler(tm);
+}
+
+/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+/// Currently, we fill delay slots with NOPs. We assume there is only one
+/// delay slot per delayed instruction.
+///
+bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I)
+    if (TII->hasDelaySlot(I->getOpcode())) {
+      MachineBasicBlock::iterator J = I;
+      ++J;
+      BuildMI(MBB, J, TII->get(SP::NOP));
+      ++FilledSlots;
+      Changed = true;
+    }
+  return Changed;
+}
diff --git a/lib/Target/Sparc/FPMover.cpp b/lib/Target/Sparc/FPMover.cpp
new file mode 100644
index 0000000..e1c9966
--- /dev/null
+++ b/lib/Target/Sparc/FPMover.cpp
@@ -0,0 +1,138 @@
+//===-- FPMover.cpp - Sparc double-precision floating point move fixer ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Expand FpMOVD/FpABSD/FpNEGD instructions into their single-precision pieces.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "fpmover"
+#include "Sparc.h"
+#include "SparcSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+STATISTIC(NumFpDs , "Number of instructions translated");
+STATISTIC(NoopFpDs, "Number of noop instructions removed");
+
+namespace {
+  struct FPMover : public MachineFunctionPass {
+    /// Target machine description which we query for reg. names, data
+    /// layout, etc.
+    ///
+    TargetMachine &TM;
+    
+    static char ID;
+    FPMover(TargetMachine &tm) 
+      : MachineFunctionPass((intptr_t)&ID), TM(tm) { }
+
+    virtual const char *getPassName() const {
+      return "Sparc Double-FP Move Fixer";
+    }
+
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+    bool runOnMachineFunction(MachineFunction &F);
+  };
+  char FPMover::ID = 0;
+} // end of anonymous namespace
+
+/// createSparcFPMoverPass - Returns a pass that expands FpMOVD/FpABSD/FpNEGD
+/// pseudo instructions into their single-precision pieces.
+///
+FunctionPass *llvm::createSparcFPMoverPass(TargetMachine &tm) {
+  return new FPMover(tm);
+}
+
+/// getDoubleRegPair - Given a DFP register, return the even and odd FP
+/// registers that correspond to it.
+static void getDoubleRegPair(unsigned DoubleReg, unsigned &EvenReg,
+                             unsigned &OddReg) {
+  static const unsigned EvenHalvesOfPairs[] = {
+    SP::F0, SP::F2, SP::F4, SP::F6, SP::F8, SP::F10, SP::F12, SP::F14,
+    SP::F16, SP::F18, SP::F20, SP::F22, SP::F24, SP::F26, SP::F28, SP::F30
+  };
+  static const unsigned OddHalvesOfPairs[] = {
+    SP::F1, SP::F3, SP::F5, SP::F7, SP::F9, SP::F11, SP::F13, SP::F15,
+    SP::F17, SP::F19, SP::F21, SP::F23, SP::F25, SP::F27, SP::F29, SP::F31
+  };
+  static const unsigned DoubleRegsInOrder[] = {
+    SP::D0, SP::D1, SP::D2, SP::D3, SP::D4, SP::D5, SP::D6, SP::D7, SP::D8,
+    SP::D9, SP::D10, SP::D11, SP::D12, SP::D13, SP::D14, SP::D15
+  };
+  for (unsigned i = 0; i < sizeof(DoubleRegsInOrder)/sizeof(unsigned); ++i)
+    if (DoubleRegsInOrder[i] == DoubleReg) {
+      EvenReg = EvenHalvesOfPairs[i];
+      OddReg = OddHalvesOfPairs[i];
+      return;
+    }
+  assert(0 && "Can't find reg");
+}
+
+/// runOnMachineBasicBlock - Fixup FpMOVD instructions in this MBB.
+///
+bool FPMover::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+    MachineInstr *MI = I++;
+    if (MI->getOpcode() == SP::FpMOVD || MI->getOpcode() == SP::FpABSD ||
+        MI->getOpcode() == SP::FpNEGD) {
+      Changed = true;
+      unsigned DestDReg = MI->getOperand(0).getReg();
+      unsigned SrcDReg  = MI->getOperand(1).getReg();
+      if (DestDReg == SrcDReg && MI->getOpcode() == SP::FpMOVD) {
+        MBB.erase(MI);   // Eliminate the noop copy.
+        ++NoopFpDs;
+        continue;
+      }
+      
+      unsigned EvenSrcReg = 0, OddSrcReg = 0, EvenDestReg = 0, OddDestReg = 0;
+      getDoubleRegPair(DestDReg, EvenDestReg, OddDestReg);
+      getDoubleRegPair(SrcDReg, EvenSrcReg, OddSrcReg);
+
+      const TargetInstrInfo *TII = TM.getInstrInfo();
+      if (MI->getOpcode() == SP::FpMOVD)
+        MI->setInstrDescriptor(TII->get(SP::FMOVS));
+      else if (MI->getOpcode() == SP::FpNEGD)
+        MI->setInstrDescriptor(TII->get(SP::FNEGS));
+      else if (MI->getOpcode() == SP::FpABSD)
+        MI->setInstrDescriptor(TII->get(SP::FABSS));
+      else
+        assert(0 && "Unknown opcode!");
+        
+      MI->getOperand(0).setReg(EvenDestReg);
+      MI->getOperand(1).setReg(EvenSrcReg);
+      DOUT << "FPMover: the modified instr is: " << *MI;
+      // Insert copy for the other half of the double.
+      if (DestDReg != SrcDReg) {
+        MI = BuildMI(MBB, I, TM.getInstrInfo()->get(SP::FMOVS), OddDestReg)
+          .addReg(OddSrcReg);
+        DOUT << "FPMover: the inserted instr is: " << *MI;
+      }
+      ++NumFpDs;
+    }
+  }
+  return Changed;
+}
+
+bool FPMover::runOnMachineFunction(MachineFunction &F) {
+  // If the target has V9 instructions, the fp-mover pseudos will never be
+  // emitted.  Avoid a scan of the instructions to improve compile time.
+  if (TM.getSubtarget<SparcSubtarget>().isV9())
+    return false;
+  
+  bool Changed = false;
+  for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+       FI != FE; ++FI)
+    Changed |= runOnMachineBasicBlock(*FI);
+  return Changed;
+}
diff --git a/lib/Target/Sparc/Makefile b/lib/Target/Sparc/Makefile
new file mode 100644
index 0000000..8cc4add
--- /dev/null
+++ b/lib/Target/Sparc/Makefile
@@ -0,0 +1,20 @@
+##===- lib/Target/Sparc/Makefile ---------------------------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file was developed by the LLVM research group and is distributed under
+# the University of Illinois Open Source License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMSparc
+TARGET = Sparc
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = SparcGenRegisterInfo.h.inc SparcGenRegisterNames.inc \
+                SparcGenRegisterInfo.inc SparcGenInstrNames.inc \
+                SparcGenInstrInfo.inc SparcGenAsmWriter.inc \
+                SparcGenDAGISel.inc SparcGenSubtarget.inc
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/Sparc/README.txt b/lib/Target/Sparc/README.txt
new file mode 100644
index 0000000..f7cb9b8
--- /dev/null
+++ b/lib/Target/Sparc/README.txt
@@ -0,0 +1,57 @@
+
+To-do
+-----
+
+* Keep the address of the constant pool in a register instead of forming its
+  address all of the time.
+* We can fold small constant offsets into the %hi/%lo references to constant
+  pool addresses as well.
+* When in V9 mode, register allocate %icc[0-3].
+* Emit the 'Branch on Integer Register with Prediction' instructions.  It's
+  not clear how to write a pattern for this though:
+
+float %t1(int %a, int* %p) {
+        %C = seteq int %a, 0
+        br bool %C, label %T, label %F
+T:
+        store int 123, int* %p
+        br label %F
+F:
+        ret float undef
+}
+
+codegens to this:
+
+t1:
+        save -96, %o6, %o6
+1)      subcc %i0, 0, %l0
+1)      bne .LBBt1_2    ! F
+        nop
+.LBBt1_1:       ! T
+        or %g0, 123, %l0
+        st %l0, [%i1]
+.LBBt1_2:       ! F
+        restore %g0, %g0, %g0
+        retl
+        nop
+
+1) should be replaced with a brz in V9 mode.
+
+* Same as above, but emit conditional move on register zero (p192) in V9 
+  mode.  Testcase:
+
+int %t1(int %a, int %b) {
+        %C = seteq int %a, 0
+        %D = select bool %C, int %a, int %b
+        ret int %D
+}
+
+* Emit MULX/[SU]DIVX instructions in V9 mode instead of fiddling 
+  with the Y register, if they are faster.
+
+* Codegen bswap(load)/store(bswap) -> load/store ASI
+
+* Implement frame pointer elimination, e.g. eliminate save/restore for 
+  leaf fns.
+* Fill delay slots
+
diff --git a/lib/Target/Sparc/Sparc.h b/lib/Target/Sparc/Sparc.h
new file mode 100644
index 0000000..8936afa
--- /dev/null
+++ b/lib/Target/Sparc/Sparc.h
@@ -0,0 +1,116 @@
+//===-- Sparc.h - Top-level interface for Sparc representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Sparc back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_SPARC_H
+#define TARGET_SPARC_H
+
+#include <iosfwd>
+#include <cassert>
+
+namespace llvm {
+  class FunctionPass;
+  class TargetMachine;
+
+  FunctionPass *createSparcISelDag(TargetMachine &TM);
+  FunctionPass *createSparcCodePrinterPass(std::ostream &OS, TargetMachine &TM);
+  FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM);
+  FunctionPass *createSparcFPMoverPass(TargetMachine &TM);
+} // end namespace llvm;
+
+// Defines symbolic names for Sparc registers.  This defines a mapping from
+// register name to register number.
+//
+#include "SparcGenRegisterNames.inc"
+
+// Defines symbolic names for the Sparc instructions.
+//
+#include "SparcGenInstrNames.inc"
+
+
+namespace llvm {
+  // Enums corresponding to Sparc condition codes, both icc's and fcc's.  These
+  // values must be kept in sync with the ones in the .td file.
+  namespace SPCC {
+    enum CondCodes {
+      //ICC_A   =  8   ,  // Always
+      //ICC_N   =  0   ,  // Never
+      ICC_NE  =  9   ,  // Not Equal
+      ICC_E   =  1   ,  // Equal
+      ICC_G   = 10   ,  // Greater
+      ICC_LE  =  2   ,  // Less or Equal
+      ICC_GE  = 11   ,  // Greater or Equal
+      ICC_L   =  3   ,  // Less
+      ICC_GU  = 12   ,  // Greater Unsigned
+      ICC_LEU =  4   ,  // Less or Equal Unsigned
+      ICC_CC  = 13   ,  // Carry Clear/Greater or Equal Unsigned
+      ICC_CS  =  5   ,  // Carry Set/Less Unsigned
+      ICC_POS = 14   ,  // Positive
+      ICC_NEG =  6   ,  // Negative
+      ICC_VC  = 15   ,  // Overflow Clear
+      ICC_VS  =  7   ,  // Overflow Set
+      
+      //FCC_A   =  8+16,  // Always
+      //FCC_N   =  0+16,  // Never
+      FCC_U   =  7+16,  // Unordered
+      FCC_G   =  6+16,  // Greater
+      FCC_UG  =  5+16,  // Unordered or Greater
+      FCC_L   =  4+16,  // Less
+      FCC_UL  =  3+16,  // Unordered or Less
+      FCC_LG  =  2+16,  // Less or Greater
+      FCC_NE  =  1+16,  // Not Equal
+      FCC_E   =  9+16,  // Equal
+      FCC_UE  = 10+16,  // Unordered or Equal
+      FCC_GE  = 11+16,  // Greater or Equal
+      FCC_UGE = 12+16,  // Unordered or Greater or Equal
+      FCC_LE  = 13+16,  // Less or Equal
+      FCC_ULE = 14+16,  // Unordered or Less or Equal
+      FCC_O   = 15+16   // Ordered
+    };
+  }
+  
+  inline static const char *SPARCCondCodeToString(SPCC::CondCodes CC) {
+    switch (CC) {
+    default: assert(0 && "Unknown condition code");
+    case SPCC::ICC_NE:  return "ne";
+    case SPCC::ICC_E:   return "e";
+    case SPCC::ICC_G:   return "g";
+    case SPCC::ICC_LE:  return "le";
+    case SPCC::ICC_GE:  return "ge";
+    case SPCC::ICC_L:   return "l";
+    case SPCC::ICC_GU:  return "gu";
+    case SPCC::ICC_LEU: return "leu";
+    case SPCC::ICC_CC:  return "cc";
+    case SPCC::ICC_CS:  return "cs";
+    case SPCC::ICC_POS: return "pos";
+    case SPCC::ICC_NEG: return "neg";
+    case SPCC::ICC_VC:  return "vc";
+    case SPCC::ICC_VS:  return "vs";
+    case SPCC::FCC_U:   return "u";
+    case SPCC::FCC_G:   return "g";
+    case SPCC::FCC_UG:  return "ug";
+    case SPCC::FCC_L:   return "l";
+    case SPCC::FCC_UL:  return "ul";
+    case SPCC::FCC_LG:  return "lg";
+    case SPCC::FCC_NE:  return "ne";
+    case SPCC::FCC_E:   return "e";
+    case SPCC::FCC_UE:  return "ue";
+    case SPCC::FCC_GE:  return "ge";
+    case SPCC::FCC_UGE: return "uge";
+    case SPCC::FCC_LE:  return "le";
+    case SPCC::FCC_ULE: return "ule";
+    case SPCC::FCC_O:   return "o";
+    }       
+  }
+}  // end namespace llvm
+#endif
diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td
new file mode 100644
index 0000000..1646e0e
--- /dev/null
+++ b/lib/Target/Sparc/Sparc.td
@@ -0,0 +1,80 @@
+//===- Sparc.td - Describe the Sparc Target Machine -------------*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "../Target.td"
+
+//===----------------------------------------------------------------------===//
+// SPARC Subtarget features.
+//
+ 
+def FeatureV9
+  : SubtargetFeature<"v9", "IsV9", "true",
+                     "Enable SPARC-V9 instructions">;
+def FeatureV8Deprecated
+  : SubtargetFeature<"deprecated-v8", "V8DeprecatedInsts", "true",
+                     "Enable deprecated V8 instructions in V9 mode">;
+def FeatureVIS
+  : SubtargetFeature<"vis", "IsVIS", "true",
+                     "Enable UltraSPARC Visual Instruction Set extensions">;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "SparcRegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "SparcInstrInfo.td"
+
+def SparcInstrInfo : InstrInfo {
+  // Define how we want to layout our target-specific information field.
+  let TSFlagsFields = [];
+  let TSFlagsShifts = [];
+}
+
+//===----------------------------------------------------------------------===//
+// SPARC processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic",         []>;
+def : Proc<"v8",              []>;
+def : Proc<"supersparc",      []>;
+def : Proc<"sparclite",       []>;
+def : Proc<"f934",            []>;
+def : Proc<"hypersparc",      []>;
+def : Proc<"sparclite86x",    []>;
+def : Proc<"sparclet",        []>;
+def : Proc<"tsc701",          []>;
+def : Proc<"v9",              [FeatureV9]>;
+def : Proc<"ultrasparc",      [FeatureV9, FeatureV8Deprecated]>;
+def : Proc<"ultrasparc3",     [FeatureV9, FeatureV8Deprecated]>;
+def : Proc<"ultrasparc3-vis", [FeatureV9, FeatureV8Deprecated, FeatureVIS]>;
+
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def Sparc : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = SparcInstrInfo;
+}
diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp
new file mode 100644
index 0000000..1f82326
--- /dev/null
+++ b/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -0,0 +1,291 @@
+//===-- SparcAsmPrinter.cpp - Sparc LLVM assembly writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format SPARC assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "Sparc.h"
+#include "SparcInstrInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include <cctype>
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+namespace {
+  struct VISIBILITY_HIDDEN SparcAsmPrinter : public AsmPrinter {
+    SparcAsmPrinter(std::ostream &O, TargetMachine &TM, const TargetAsmInfo *T)
+      : AsmPrinter(O, TM, T) {
+    }
+
+    /// We name each basic block in a Function with a unique number, so
+    /// that we can consistently refer to them later. This is cleared
+    /// at the beginning of each call to runOnMachineFunction().
+    ///
+    typedef std::map<const Value *, unsigned> ValueMapTy;
+    ValueMapTy NumberForBB;
+
+    virtual const char *getPassName() const {
+      return "Sparc Assembly Printer";
+    }
+
+    void printOperand(const MachineInstr *MI, int opNum);
+    void printMemOperand(const MachineInstr *MI, int opNum,
+                         const char *Modifier = 0);
+    void printCCOperand(const MachineInstr *MI, int opNum);
+
+    bool printInstruction(const MachineInstr *MI);  // autogenerated.
+    bool runOnMachineFunction(MachineFunction &F);
+    bool doInitialization(Module &M);
+    bool doFinalization(Module &M);
+  };
+} // end of anonymous namespace
+
+#include "SparcGenAsmWriter.inc"
+
+/// createSparcCodePrinterPass - Returns a pass that prints the SPARC
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description.  This should work
+/// regardless of whether the function is in SSA form.
+///
+FunctionPass *llvm::createSparcCodePrinterPass(std::ostream &o,
+                                               TargetMachine &tm) {
+  return new SparcAsmPrinter(o, tm, tm.getTargetAsmInfo());
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool SparcAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  SetupMachineFunction(MF);
+
+  // Print out constants referenced by the function
+  EmitConstantPool(MF.getConstantPool());
+
+  // BBNumber is used here so that a given Printer will never give two
+  // BBs the same name. (If you have a better way, please let me know!)
+  static unsigned BBNumber = 0;
+
+  O << "\n\n";
+  // What's my mangled name?
+  CurrentFnName = Mang->getValueName(MF.getFunction());
+
+  // Print out the label for the function.
+  const Function *F = MF.getFunction();
+  SwitchToTextSection(getSectionForFunction(*F).c_str(), F);
+  EmitAlignment(4, F);
+  O << "\t.globl\t" << CurrentFnName << "\n";
+  O << "\t.type\t" << CurrentFnName << ", #function\n";
+  O << CurrentFnName << ":\n";
+
+  // Number each basic block so that we can consistently refer to them
+  // in PC-relative references.
+  // FIXME: Why not use the MBB numbers?
+  NumberForBB.clear();
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    NumberForBB[I->getBasicBlock()] = BBNumber++;
+  }
+
+  // Print out code for the function.
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    // Print a label for the basic block.
+    if (I != MF.begin()) {
+      printBasicBlockLabel(I, true);
+      O << '\n';
+    }
+    for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+         II != E; ++II) {
+      // Print the assembly for the instruction.
+      O << "\t";
+      printInstruction(II);
+      ++EmittedInsts;
+    }
+  }
+
+  // We didn't modify anything.
+  return false;
+}
+
+void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum) {
+  const MachineOperand &MO = MI->getOperand (opNum);
+  const MRegisterInfo &RI = *TM.getRegisterInfo();
+  bool CloseParen = false;
+  if (MI->getOpcode() == SP::SETHIi && !MO.isRegister() && !MO.isImmediate()) {
+    O << "%hi(";
+    CloseParen = true;
+  } else if ((MI->getOpcode() == SP::ORri || MI->getOpcode() == SP::ADDri)
+             && !MO.isRegister() && !MO.isImmediate()) {
+    O << "%lo(";
+    CloseParen = true;
+  }
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    if (MRegisterInfo::isPhysicalRegister(MO.getReg()))
+      O << "%" << LowercaseString (RI.get(MO.getReg()).Name);
+    else
+      O << "%reg" << MO.getReg();
+    break;
+
+  case MachineOperand::MO_Immediate:
+    O << (int)MO.getImmedValue();
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    printBasicBlockLabel(MO.getMachineBasicBlock());
+    return;
+  case MachineOperand::MO_GlobalAddress:
+    O << Mang->getValueName(MO.getGlobal());
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    O << MO.getSymbolName();
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+      << MO.getConstantPoolIndex();
+    break;
+  default:
+    O << "<unknown operand type>"; abort (); break;
+  }
+  if (CloseParen) O << ")";
+}
+
+void SparcAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+                                      const char *Modifier) {
+  printOperand(MI, opNum);
+  
+  // If this is an ADD operand, emit it like normal operands.
+  if (Modifier && !strcmp(Modifier, "arith")) {
+    O << ", ";
+    printOperand(MI, opNum+1);
+    return;
+  }
+  
+  if (MI->getOperand(opNum+1).isRegister() &&
+      MI->getOperand(opNum+1).getReg() == SP::G0)
+    return;   // don't print "+%g0"
+  if (MI->getOperand(opNum+1).isImmediate() &&
+      MI->getOperand(opNum+1).getImmedValue() == 0)
+    return;   // don't print "+0"
+  
+  O << "+";
+  if (MI->getOperand(opNum+1).isGlobalAddress() ||
+      MI->getOperand(opNum+1).isConstantPoolIndex()) {
+    O << "%lo(";
+    printOperand(MI, opNum+1);
+    O << ")";
+  } else {
+    printOperand(MI, opNum+1);
+  }
+}
+
+void SparcAsmPrinter::printCCOperand(const MachineInstr *MI, int opNum) {
+  int CC = (int)MI->getOperand(opNum).getImmedValue();
+  O << SPARCCondCodeToString((SPCC::CondCodes)CC);
+}
+
+
+
+bool SparcAsmPrinter::doInitialization(Module &M) {
+  Mang = new Mangler(M);
+  return false; // success
+}
+
+bool SparcAsmPrinter::doFinalization(Module &M) {
+  const TargetData *TD = TM.getTargetData();
+
+  // Print out module-level global variables here.
+  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I)
+    if (I->hasInitializer()) {   // External globals require no code
+      // Check to see if this is a special global used by LLVM, if so, emit it.
+      if (EmitSpecialLLVMGlobal(I))
+        continue;
+      
+      O << "\n\n";
+      std::string name = Mang->getValueName(I);
+      Constant *C = I->getInitializer();
+      unsigned Size = TD->getTypeSize(C->getType());
+      unsigned Align = TD->getPrefTypeAlignment(C->getType());
+
+      if (C->isNullValue() &&
+          (I->hasLinkOnceLinkage() || I->hasInternalLinkage() ||
+           I->hasWeakLinkage() /* FIXME: Verify correct */)) {
+        SwitchToDataSection(".data", I);
+        if (I->hasInternalLinkage())
+          O << "\t.local " << name << "\n";
+
+        O << "\t.comm " << name << "," << TD->getTypeSize(C->getType())
+          << "," << Align;
+        O << "\n";
+      } else {
+        switch (I->getLinkage()) {
+        case GlobalValue::LinkOnceLinkage:
+        case GlobalValue::WeakLinkage:   // FIXME: Verify correct for weak.
+          // Nonnull linkonce -> weak
+          O << "\t.weak " << name << "\n";
+          SwitchToDataSection("", I);
+          O << "\t.section\t\".llvm.linkonce.d." << name
+            << "\",\"aw\",@progbits\n";
+          break;
+
+        case GlobalValue::AppendingLinkage:
+          // FIXME: appending linkage variables should go into a section of
+          // their name or something.  For now, just emit them as external.
+        case GlobalValue::ExternalLinkage:
+          // If external or appending, declare as a global symbol
+          O << "\t.globl " << name << "\n";
+          // FALL THROUGH
+        case GlobalValue::InternalLinkage:
+          if (C->isNullValue())
+            SwitchToDataSection(".bss", I);
+          else
+            SwitchToDataSection(".data", I);
+          break;
+        case GlobalValue::GhostLinkage:
+          cerr << "Should not have any unmaterialized functions!\n";
+          abort();
+        case GlobalValue::DLLImportLinkage:
+          cerr << "DLLImport linkage is not supported by this target!\n";
+          abort();
+        case GlobalValue::DLLExportLinkage:
+          cerr << "DLLExport linkage is not supported by this target!\n";
+          abort();
+        default:
+          assert(0 && "Unknown linkage type!");          
+        }
+
+        O << "\t.align " << Align << "\n";
+        O << "\t.type " << name << ",#object\n";
+        O << "\t.size " << name << "," << Size << "\n";
+        O << name << ":\n";
+        EmitGlobalConstant(C);
+      }
+    }
+
+  AsmPrinter::doFinalization(M);
+  return false; // success
+}
diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
new file mode 100644
index 0000000..8c8b3f8
--- /dev/null
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -0,0 +1,1140 @@
+//===-- SparcISelDAGToDAG.cpp - A dag to dag inst selector for Sparc ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the SPARC target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "SparcTargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Support/Debug.h"
+#include <queue>
+#include <set>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===----------------------------------------------------------------------===//
+
+namespace SPISD {
+  enum {
+    FIRST_NUMBER = ISD::BUILTIN_OP_END+SP::INSTRUCTION_LIST_END,
+    CMPICC,      // Compare two GPR operands, set icc.
+    CMPFCC,      // Compare two FP operands, set fcc.
+    BRICC,       // Branch to dest on icc condition
+    BRFCC,       // Branch to dest on fcc condition
+    SELECT_ICC,  // Select between two values using the current ICC flags.
+    SELECT_FCC,  // Select between two values using the current FCC flags.
+    
+    Hi, Lo,      // Hi/Lo operations, typically on a global address.
+    
+    FTOI,        // FP to Int within a FP register.
+    ITOF,        // Int to FP within a FP register.
+
+    CALL,        // A call instruction.
+    RET_FLAG     // Return with a flag operand.
+  };
+}
+
+/// IntCondCCodeToICC - Convert a DAG integer condition code to a SPARC ICC
+/// condition.
+static SPCC::CondCodes IntCondCCodeToICC(ISD::CondCode CC) {
+  switch (CC) {
+  default: assert(0 && "Unknown integer condition code!");
+  case ISD::SETEQ:  return SPCC::ICC_E;
+  case ISD::SETNE:  return SPCC::ICC_NE;
+  case ISD::SETLT:  return SPCC::ICC_L;
+  case ISD::SETGT:  return SPCC::ICC_G;
+  case ISD::SETLE:  return SPCC::ICC_LE;
+  case ISD::SETGE:  return SPCC::ICC_GE;
+  case ISD::SETULT: return SPCC::ICC_CS;
+  case ISD::SETULE: return SPCC::ICC_LEU;
+  case ISD::SETUGT: return SPCC::ICC_GU;
+  case ISD::SETUGE: return SPCC::ICC_CC;
+  }
+}
+
+/// FPCondCCodeToFCC - Convert a DAG floating point condition code to a SPARC
+/// FCC condition.
+static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
+  switch (CC) {
+  default: assert(0 && "Unknown fp condition code!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ: return SPCC::FCC_E;
+  case ISD::SETNE:
+  case ISD::SETUNE: return SPCC::FCC_NE;
+  case ISD::SETLT:
+  case ISD::SETOLT: return SPCC::FCC_L;
+  case ISD::SETGT:
+  case ISD::SETOGT: return SPCC::FCC_G;
+  case ISD::SETLE:
+  case ISD::SETOLE: return SPCC::FCC_LE;
+  case ISD::SETGE:
+  case ISD::SETOGE: return SPCC::FCC_GE;
+  case ISD::SETULT: return SPCC::FCC_UL;
+  case ISD::SETULE: return SPCC::FCC_ULE;
+  case ISD::SETUGT: return SPCC::FCC_UG;
+  case ISD::SETUGE: return SPCC::FCC_UGE;
+  case ISD::SETUO:  return SPCC::FCC_U;
+  case ISD::SETO:   return SPCC::FCC_O;
+  case ISD::SETONE: return SPCC::FCC_LG;
+  case ISD::SETUEQ: return SPCC::FCC_UE;
+  }
+}
+
+namespace {
+  class SparcTargetLowering : public TargetLowering {
+    int VarArgsFrameOffset;   // Frame offset to start of varargs area.
+  public:
+    SparcTargetLowering(TargetMachine &TM);
+    virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
+    
+    /// computeMaskedBitsForTargetNode - Determine which of the bits specified 
+    /// in Mask are known to be either zero or one and return them in the 
+    /// KnownZero/KnownOne bitsets.
+    virtual void computeMaskedBitsForTargetNode(const SDOperand Op,
+                                                uint64_t Mask,
+                                                uint64_t &KnownZero, 
+                                                uint64_t &KnownOne,
+                                                const SelectionDAG &DAG,
+                                                unsigned Depth = 0) const;
+    
+    virtual std::vector<SDOperand>
+      LowerArguments(Function &F, SelectionDAG &DAG);
+    virtual std::pair<SDOperand, SDOperand>
+      LowerCallTo(SDOperand Chain, const Type *RetTy, bool RetTyIsSigned, 
+                  bool isVarArg, unsigned CC, bool isTailCall, SDOperand Callee,
+                  ArgListTy &Args, SelectionDAG &DAG);
+    virtual MachineBasicBlock *InsertAtEndOfBasicBlock(MachineInstr *MI,
+                                                       MachineBasicBlock *MBB);
+    
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+  };
+}
+
+SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
+  : TargetLowering(TM) {
+  
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, SP::IntRegsRegisterClass);
+  addRegisterClass(MVT::f32, SP::FPRegsRegisterClass);
+  addRegisterClass(MVT::f64, SP::DFPRegsRegisterClass);
+
+  // Turn FP extload into load/fextend
+  setLoadXAction(ISD::EXTLOAD, MVT::f32, Expand);
+  
+  // Custom legalize GlobalAddress nodes into LO/HI parts.
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+  setOperationAction(ISD::ConstantPool , MVT::i32, Custom);
+  
+  // Sparc doesn't have sext_inreg, replace them with shl/sra
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+
+  // Sparc has no REM operation.
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+
+  // Custom expand fp<->sint
+  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+
+  // Expand fp<->uint
+  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+  
+  setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
+  setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
+  
+  // Sparc has no select or setcc: expand to SELECT_CC.
+  setOperationAction(ISD::SELECT, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f64, Expand);
+  setOperationAction(ISD::SETCC, MVT::i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f64, Expand);
+  
+  // Sparc doesn't have BRCOND either, it has BR_CC.
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+  
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+  
+  // SPARC has no intrinsics for these particular operations.
+  setOperationAction(ISD::MEMMOVE, MVT::Other, Expand);
+  setOperationAction(ISD::MEMSET, MVT::Other, Expand);
+  setOperationAction(ISD::MEMCPY, MVT::Other, Expand);
+  
+  setOperationAction(ISD::FSIN , MVT::f64, Expand);
+  setOperationAction(ISD::FCOS , MVT::f64, Expand);
+  setOperationAction(ISD::FREM , MVT::f64, Expand);
+  setOperationAction(ISD::FSIN , MVT::f32, Expand);
+  setOperationAction(ISD::FCOS , MVT::f32, Expand);
+  setOperationAction(ISD::FREM , MVT::f32, Expand);
+  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ , MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ , MVT::i32, Expand);
+  setOperationAction(ISD::ROTL , MVT::i32, Expand);
+  setOperationAction(ISD::ROTR , MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+  // We don't have line number support yet.
+  setOperationAction(ISD::LOCATION, MVT::Other, Expand);
+  setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+  setOperationAction(ISD::LABEL, MVT::Other, Expand);
+
+  // RET must be custom lowered, to meet ABI requirements
+  setOperationAction(ISD::RET               , MVT::Other, Custom);
+  
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+  // VAARG needs to be lowered to not do unaligned accesses for doubles.
+  setOperationAction(ISD::VAARG             , MVT::Other, Custom);
+  
+  // Use the default implementation.
+  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand); 
+  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
+
+  setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+  setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+  
+  setStackPointerRegisterToSaveRestore(SP::O6);
+
+  if (TM.getSubtarget<SparcSubtarget>().isV9()) {
+    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+  }
+  
+  computeRegisterProperties();
+}
+
+const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return 0;
+  case SPISD::CMPICC:     return "SPISD::CMPICC";
+  case SPISD::CMPFCC:     return "SPISD::CMPFCC";
+  case SPISD::BRICC:      return "SPISD::BRICC";
+  case SPISD::BRFCC:      return "SPISD::BRFCC";
+  case SPISD::SELECT_ICC: return "SPISD::SELECT_ICC";
+  case SPISD::SELECT_FCC: return "SPISD::SELECT_FCC";
+  case SPISD::Hi:         return "SPISD::Hi";
+  case SPISD::Lo:         return "SPISD::Lo";
+  case SPISD::FTOI:       return "SPISD::FTOI";
+  case SPISD::ITOF:       return "SPISD::ITOF";
+  case SPISD::CALL:       return "SPISD::CALL";
+  case SPISD::RET_FLAG:   return "SPISD::RET_FLAG";
+  }
+}
+
+/// computeMaskedBitsForTargetNode - Determine which of the bits specified in
+/// Mask are known to be either zero or one for Op, returning them in the
+/// KnownZero/KnownOne bitsets.  Op is expected to be a target-specific node.
+/// Used by the DAG combiner.
+void SparcTargetLowering::computeMaskedBitsForTargetNode(const SDOperand Op,
+                                                         uint64_t Mask,
+                                                         uint64_t &KnownZero, 
+                                                         uint64_t &KnownOne,
+                                                         const SelectionDAG &DAG,
+                                                         unsigned Depth) const {
+  uint64_t KnownZero2, KnownOne2;
+  KnownZero = KnownOne = 0;   // Don't know anything.
+  
+  switch (Op.getOpcode()) {
+  default: break;
+  case SPISD::SELECT_ICC:
+  case SPISD::SELECT_FCC:
+    DAG.ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne,
+                          Depth+1);
+    DAG.ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero2, KnownOne2,
+                          Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); 
+    
+    // Only known if known in both the LHS and RHS.
+    KnownOne &= KnownOne2;
+    KnownZero &= KnownZero2;
+    break;
+  }
+}
+
+/// LowerArguments - V8 uses a very simple ABI, where all values are passed in
+/// either one or two GPRs, including FP values.  TODO: we should pass FP values
+/// in FP registers for fastcc functions.
+std::vector<SDOperand>
+SparcTargetLowering::LowerArguments(Function &F, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SSARegMap *RegMap = MF.getSSARegMap();
+  std::vector<SDOperand> ArgValues;
+  
+  static const unsigned ArgRegs[] = {
+    SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
+  };
+  
+  const unsigned *CurArgReg = ArgRegs, *ArgRegEnd = ArgRegs+6;
+  unsigned ArgOffset = 68;
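+  // 68 is the offset of the first stack-passed argument word in the standard
+  // SPARC V8 frame: 64 bytes of register-window save area plus the 4-byte
+  // struct-return slot precede it.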
+  
+  SDOperand Root = DAG.getRoot();
+  std::vector<SDOperand> OutChains;
+
+  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
+    MVT::ValueType ObjectVT = getValueType(I->getType());
+    
+    switch (ObjectVT) {
+    default: assert(0 && "Unhandled argument type!");
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      if (I->use_empty()) {                // Argument is dead.
+        if (CurArgReg < ArgRegEnd) ++CurArgReg;
+        ArgValues.push_back(DAG.getNode(ISD::UNDEF, ObjectVT));
+      } else if (CurArgReg < ArgRegEnd) {  // Lives in an incoming GPR
+        unsigned VReg = RegMap->createVirtualRegister(&SP::IntRegsRegClass);
+        MF.addLiveIn(*CurArgReg++, VReg);
+        SDOperand Arg = DAG.getCopyFromReg(Root, VReg, MVT::i32);
+        if (ObjectVT != MVT::i32) {
+          unsigned AssertOp = ISD::AssertSext;
+          Arg = DAG.getNode(AssertOp, MVT::i32, Arg, 
+                            DAG.getValueType(ObjectVT));
+          Arg = DAG.getNode(ISD::TRUNCATE, ObjectVT, Arg);
+        }
+        ArgValues.push_back(Arg);
+      } else {
+        int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset);
+        SDOperand FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+        SDOperand Load;
+        if (ObjectVT == MVT::i32) {
+          Load = DAG.getLoad(MVT::i32, Root, FIPtr, NULL, 0);
+        } else {
+          ISD::LoadExtType LoadOp = ISD::SEXTLOAD;
+
+          // Sparc is big endian, so add an offset based on the ObjectVT.
+          unsigned Offset = 4-std::max(1U, MVT::getSizeInBits(ObjectVT)/8);
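+          // e.g. an i8 lives in the last byte of its 4-byte slot (offset 3),
+          // an i16 in the last halfword (offset 2).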
+          FIPtr = DAG.getNode(ISD::ADD, MVT::i32, FIPtr,
+                              DAG.getConstant(Offset, MVT::i32));
+          Load = DAG.getExtLoad(LoadOp, MVT::i32, Root, FIPtr,
+                                NULL, 0, ObjectVT);
+          Load = DAG.getNode(ISD::TRUNCATE, ObjectVT, Load);
+        }
+        ArgValues.push_back(Load);
+      }
+      
+      ArgOffset += 4;
+      break;
+    case MVT::f32:
+      if (I->use_empty()) {                // Argument is dead.
+        if (CurArgReg < ArgRegEnd) ++CurArgReg;
+        ArgValues.push_back(DAG.getNode(ISD::UNDEF, ObjectVT));
+      } else if (CurArgReg < ArgRegEnd) {  // Lives in an incoming GPR
+        // FP value is passed in an integer register.
+        unsigned VReg = RegMap->createVirtualRegister(&SP::IntRegsRegClass);
+        MF.addLiveIn(*CurArgReg++, VReg);
+        SDOperand Arg = DAG.getCopyFromReg(Root, VReg, MVT::i32);
+
+        Arg = DAG.getNode(ISD::BIT_CONVERT, MVT::f32, Arg);
+        ArgValues.push_back(Arg);
+      } else {
+        int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset);
+        SDOperand FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+        SDOperand Load = DAG.getLoad(MVT::f32, Root, FIPtr, NULL, 0);
+        ArgValues.push_back(Load);
+      }
+      ArgOffset += 4;
+      break;
+
+    case MVT::i64:
+    case MVT::f64:
+      if (I->use_empty()) {                // Argument is dead.
+        if (CurArgReg < ArgRegEnd) ++CurArgReg;
+        if (CurArgReg < ArgRegEnd) ++CurArgReg;
+        ArgValues.push_back(DAG.getNode(ISD::UNDEF, ObjectVT));
+      } else if (/* FIXME: Apparently this isn't safe?? */
+                 0 && CurArgReg == ArgRegEnd && ObjectVT == MVT::f64 &&
+                 ((CurArgReg-ArgRegs) & 1) == 0) {
+        // If this is a double argument and the whole thing lives on the stack,
+        // and the argument is aligned, load the double straight from the stack.
+        // We can't do a load in cases like void foo([6ints], int,double),
+        // because the double wouldn't be aligned!
+        int FrameIdx = MF.getFrameInfo()->CreateFixedObject(8, ArgOffset);
+        SDOperand FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+        ArgValues.push_back(DAG.getLoad(MVT::f64, Root, FIPtr, NULL, 0));
+      } else {
+        SDOperand HiVal;
+        if (CurArgReg < ArgRegEnd) {  // Lives in an incoming GPR
+          unsigned VRegHi = RegMap->createVirtualRegister(&SP::IntRegsRegClass);
+          MF.addLiveIn(*CurArgReg++, VRegHi);
+          HiVal = DAG.getCopyFromReg(Root, VRegHi, MVT::i32);
+        } else {
+          int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset);
+          SDOperand FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+          HiVal = DAG.getLoad(MVT::i32, Root, FIPtr, NULL, 0);
+        }
+        
+        SDOperand LoVal;
+        if (CurArgReg < ArgRegEnd) {  // Lives in an incoming GPR
+          unsigned VRegLo = RegMap->createVirtualRegister(&SP::IntRegsRegClass);
+          MF.addLiveIn(*CurArgReg++, VRegLo);
+          LoVal = DAG.getCopyFromReg(Root, VRegLo, MVT::i32);
+        } else {
+          int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset+4);
+          SDOperand FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+          LoVal = DAG.getLoad(MVT::i32, Root, FIPtr, NULL, 0);
+        }
+        
+        // Compose the two halves together into an i64 unit.
+        SDOperand WholeValue = 
+          DAG.getNode(ISD::BUILD_PAIR, MVT::i64, LoVal, HiVal);
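+        // BUILD_PAIR takes (low, high); HiVal came from the first register or
+        // stack word because SPARC is big-endian.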
+        
+        // If we want a double, do a bit convert.
+        if (ObjectVT == MVT::f64)
+          WholeValue = DAG.getNode(ISD::BIT_CONVERT, MVT::f64, WholeValue);
+        
+        ArgValues.push_back(WholeValue);
+      }
+      ArgOffset += 8;
+      break;
+    }
+  }
+  
+  // Store remaining ArgRegs to the stack if this is a varargs function.
+  if (F.getFunctionType()->isVarArg()) {
+    // Remember the vararg offset for the va_start implementation.
+    VarArgsFrameOffset = ArgOffset;
+    
+    for (; CurArgReg != ArgRegEnd; ++CurArgReg) {
+      unsigned VReg = RegMap->createVirtualRegister(&SP::IntRegsRegClass);
+      MF.addLiveIn(*CurArgReg, VReg);
+      SDOperand Arg = DAG.getCopyFromReg(DAG.getRoot(), VReg, MVT::i32);
+
+      int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset);
+      SDOperand FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+
+      OutChains.push_back(DAG.getStore(DAG.getRoot(), Arg, FIPtr, NULL, 0));
+      ArgOffset += 4;
+    }
+  }
+  
+  if (!OutChains.empty())
+    DAG.setRoot(DAG.getNode(ISD::TokenFactor, MVT::Other,
+                            &OutChains[0], OutChains.size()));
+  
+  // Finally, inform the code generator which regs we return values in.
+  switch (getValueType(F.getReturnType())) {
+  default: assert(0 && "Unknown type!");
+  case MVT::isVoid: break;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    MF.addLiveOut(SP::I0);
+    break;
+  case MVT::i64:
+    MF.addLiveOut(SP::I0);
+    MF.addLiveOut(SP::I1);
+    break;
+  case MVT::f32:
+    MF.addLiveOut(SP::F0);
+    break;
+  case MVT::f64:
+    MF.addLiveOut(SP::D0);
+    break;
+  }
+  
+  return ArgValues;
+}
+
+std::pair<SDOperand, SDOperand>
+SparcTargetLowering::LowerCallTo(SDOperand Chain, const Type *RetTy,
+                                 bool RetTyIsSigned, bool isVarArg, unsigned CC,
+                                 bool isTailCall, SDOperand Callee, 
+                                 ArgListTy &Args, SelectionDAG &DAG) {
+  // Count the size of the outgoing arguments.
+  unsigned ArgsSize = 0;
+  for (unsigned i = 0, e = Args.size(); i != e; ++i) {
+    switch (getValueType(Args[i].Ty)) {
+    default: assert(0 && "Unknown value type!");
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+    case MVT::f32:
+      ArgsSize += 4;
+      break;
+    case MVT::i64:
+    case MVT::f64:
+      ArgsSize += 8;
+      break;
+    }
+  }
+  if (ArgsSize > 4*6)
+    ArgsSize -= 4*6;    // Space for the first 6 arguments is already reserved.
+  else
+    ArgsSize = 0;
+
+  // Keep stack frames 8-byte aligned.
+  ArgsSize = (ArgsSize+7) & ~7;
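+  // e.g. 8 word-sized arguments give ArgsSize = 32, minus the 24 reserved
+  // bytes leaves 8, which is already 8-byte aligned.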
+
+  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(ArgsSize, getPointerTy()));
+  
+  SDOperand StackPtr;
+  std::vector<SDOperand> Stores;
+  std::vector<SDOperand> RegValuesToPass;
+  unsigned ArgOffset = 68;
+  for (unsigned i = 0, e = Args.size(); i != e; ++i) {
+    SDOperand Val = Args[i].Node;
+    MVT::ValueType ObjectVT = Val.getValueType();
+    SDOperand ValToStore(0, 0);
+    unsigned ObjSize;
+    switch (ObjectVT) {
+    default: assert(0 && "Unhandled argument type!");
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16: {
+      // Promote the integer to 32-bits.  If the input type is signed, use a
+      // sign extend, otherwise use a zero extend.
+      ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+      if (Args[i].isSExt)
+        ExtendKind = ISD::SIGN_EXTEND;
+      else if (Args[i].isZExt)
+        ExtendKind = ISD::ZERO_EXTEND;
+      Val = DAG.getNode(ExtendKind, MVT::i32, Val);
+      // FALL THROUGH
+    }
+    case MVT::i32:
+      ObjSize = 4;
+
+      if (RegValuesToPass.size() >= 6) {
+        ValToStore = Val;
+      } else {
+        RegValuesToPass.push_back(Val);
+      }
+      break;
+    case MVT::f32:
+      ObjSize = 4;
+      if (RegValuesToPass.size() >= 6) {
+        ValToStore = Val;
+      } else {
+        // Convert this to a FP value in an int reg.
+        Val = DAG.getNode(ISD::BIT_CONVERT, MVT::i32, Val);
+        RegValuesToPass.push_back(Val);
+      }
+      break;
+    case MVT::f64:
+      ObjSize = 8;
+      // If we can store this directly into the outgoing slot, do so.  We can
+      // do this when all ArgRegs are used and if the outgoing slot is aligned.
+      // FIXME: McGill/misr fails with this.
+      if (0 && RegValuesToPass.size() >= 6 && ((ArgOffset-68) & 7) == 0) {
+        ValToStore = Val;
+        break;
+      }
+      
+      // Otherwise, convert this to a FP value in int regs.
+      Val = DAG.getNode(ISD::BIT_CONVERT, MVT::i64, Val);
+      // FALL THROUGH
+    case MVT::i64:
+      ObjSize = 8;
+      if (RegValuesToPass.size() >= 6) {
+        ValToStore = Val;    // Whole thing is passed in memory.
+        break;
+      }
+      
+      // Split the value into top and bottom part.  Top part goes in a reg.
+      SDOperand Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, getPointerTy(), Val, 
+                                 DAG.getConstant(1, MVT::i32));
+      SDOperand Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, getPointerTy(), Val,
+                                 DAG.getConstant(0, MVT::i32));
+      RegValuesToPass.push_back(Hi);
+      
+      if (RegValuesToPass.size() >= 6) {
+        ValToStore = Lo;
+        ArgOffset += 4;
+        ObjSize = 4;
+      } else {
+        RegValuesToPass.push_back(Lo);
+      }
+      break;
+    }
+    
+    if (ValToStore.Val) {
+      if (!StackPtr.Val) {
+        StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+      }
+      SDOperand PtrOff = DAG.getConstant(ArgOffset, getPointerTy());
+      PtrOff = DAG.getNode(ISD::ADD, MVT::i32, StackPtr, PtrOff);
+      Stores.push_back(DAG.getStore(Chain, ValToStore, PtrOff, NULL, 0));
+    }
+    ArgOffset += ObjSize;
+  }
+  
+  // Emit all stores, making sure they occur before any copies into physregs.
+  if (!Stores.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &Stores[0],Stores.size());
+  
+  static const unsigned ArgRegs[] = {
+    SP::O0, SP::O1, SP::O2, SP::O3, SP::O4, SP::O5
+  };
+  
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into O[0-5].
+  SDOperand InFlag;
+  for (unsigned i = 0, e = RegValuesToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, ArgRegs[i], RegValuesToPass[i], InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i32);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32);
+
+  std::vector<MVT::ValueType> NodeTys;
+  NodeTys.push_back(MVT::Other);   // Returns a chain
+  NodeTys.push_back(MVT::Flag);    // Returns a flag for retval copy to use.
+  SDOperand Ops[] = { Chain, Callee, InFlag };
+  Chain = DAG.getNode(SPISD::CALL, NodeTys, Ops, InFlag.Val ? 3 : 2);
+  InFlag = Chain.getValue(1);
+  
+  MVT::ValueType RetTyVT = getValueType(RetTy);
+  SDOperand RetVal;
+  if (RetTyVT != MVT::isVoid) {
+    switch (RetTyVT) {
+    default: assert(0 && "Unknown value type to return!");
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16: {
+      RetVal = DAG.getCopyFromReg(Chain, SP::O0, MVT::i32, InFlag);
+      Chain = RetVal.getValue(1);
+      
+      // Add a note to keep track of whether it is sign or zero extended.
+      ISD::NodeType AssertKind = ISD::AssertZext;
+      if (RetTyIsSigned)
+        AssertKind = ISD::AssertSext;
+      RetVal = DAG.getNode(AssertKind, MVT::i32, RetVal, 
+                           DAG.getValueType(RetTyVT));
+      RetVal = DAG.getNode(ISD::TRUNCATE, RetTyVT, RetVal);
+      break;
+    }
+    case MVT::i32:
+      RetVal = DAG.getCopyFromReg(Chain, SP::O0, MVT::i32, InFlag);
+      Chain = RetVal.getValue(1);
+      break;
+    case MVT::f32:
+      RetVal = DAG.getCopyFromReg(Chain, SP::F0, MVT::f32, InFlag);
+      Chain = RetVal.getValue(1);
+      break;
+    case MVT::f64:
+      RetVal = DAG.getCopyFromReg(Chain, SP::D0, MVT::f64, InFlag);
+      Chain = RetVal.getValue(1);
+      break;
+    case MVT::i64:
+      SDOperand Lo = DAG.getCopyFromReg(Chain, SP::O1, MVT::i32, InFlag);
+      SDOperand Hi = DAG.getCopyFromReg(Lo.getValue(1), SP::O0, MVT::i32, 
+                                        Lo.getValue(2));
+      RetVal = DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Lo, Hi);
+      Chain = Hi.getValue(1);
+      break;
+    }
+  }
+  
+  Chain = DAG.getNode(ISD::CALLSEQ_END, MVT::Other, Chain,
+                      DAG.getConstant(ArgsSize, getPointerTy()));
+  
+  return std::make_pair(RetVal, Chain);
+}
+
+// Look at LHS/RHS/CC and see if they are a lowered setcc instruction.  If so,
+// set LHS/RHS to the LHS/RHS of the setcc and SPCC to the condition.
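+//
+// e.g. "(select_icc 1, 0, CC, (cmpicc a, b)) != 0" really tests "a CC b", so
+// the branch or select can use the original compare directly.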
+static void LookThroughSetCC(SDOperand &LHS, SDOperand &RHS,
+                             ISD::CondCode CC, unsigned &SPCC) {
+  if (isa<ConstantSDNode>(RHS) && cast<ConstantSDNode>(RHS)->getValue() == 0 &&
+      CC == ISD::SETNE && 
+      ((LHS.getOpcode() == SPISD::SELECT_ICC &&
+        LHS.getOperand(3).getOpcode() == SPISD::CMPICC) ||
+       (LHS.getOpcode() == SPISD::SELECT_FCC &&
+        LHS.getOperand(3).getOpcode() == SPISD::CMPFCC)) &&
+      isa<ConstantSDNode>(LHS.getOperand(0)) &&
+      isa<ConstantSDNode>(LHS.getOperand(1)) &&
+      cast<ConstantSDNode>(LHS.getOperand(0))->getValue() == 1 &&
+      cast<ConstantSDNode>(LHS.getOperand(1))->getValue() == 0) {
+    SDOperand CMPCC = LHS.getOperand(3);
+    SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getValue();
+    LHS = CMPCC.getOperand(0);
+    RHS = CMPCC.getOperand(1);
+  }
+}
+
+
+SDOperand SparcTargetLowering::
+LowerOperation(SDOperand Op, SelectionDAG &DAG) {
+  switch (Op.getOpcode()) {
+  default: assert(0 && "Should not custom lower this!");
+  case ISD::GlobalTLSAddress:
+    assert(0 && "TLS not implemented for Sparc.");
+  case ISD::GlobalAddress: {
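+    // An absolute address is materialized as a %hi/%lo pair, i.e. roughly
+    // "sethi %hi(sym), %tmp; add %tmp, %lo(sym), %result" after selection.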
+    GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+    SDOperand GA = DAG.getTargetGlobalAddress(GV, MVT::i32);
+    SDOperand Hi = DAG.getNode(SPISD::Hi, MVT::i32, GA);
+    SDOperand Lo = DAG.getNode(SPISD::Lo, MVT::i32, GA);
+    return DAG.getNode(ISD::ADD, MVT::i32, Lo, Hi);
+  }
+  case ISD::ConstantPool: {
+    Constant *C = cast<ConstantPoolSDNode>(Op)->getConstVal();
+    SDOperand CP = DAG.getTargetConstantPool(C, MVT::i32,
+                                  cast<ConstantPoolSDNode>(Op)->getAlignment());
+    SDOperand Hi = DAG.getNode(SPISD::Hi, MVT::i32, CP);
+    SDOperand Lo = DAG.getNode(SPISD::Lo, MVT::i32, CP);
+    return DAG.getNode(ISD::ADD, MVT::i32, Lo, Hi);
+  }
+  case ISD::FP_TO_SINT:
+    // Convert the fp value to integer in an FP register.
+    assert(Op.getValueType() == MVT::i32);
+    Op = DAG.getNode(SPISD::FTOI, MVT::f32, Op.getOperand(0));
+    return DAG.getNode(ISD::BIT_CONVERT, MVT::i32, Op);
+  case ISD::SINT_TO_FP: {
+    assert(Op.getOperand(0).getValueType() == MVT::i32);
+    SDOperand Tmp = DAG.getNode(ISD::BIT_CONVERT, MVT::f32, Op.getOperand(0));
+    // Convert the int value to FP in an FP register.
+    return DAG.getNode(SPISD::ITOF, Op.getValueType(), Tmp);
+  }
+  case ISD::BR_CC: {
+    SDOperand Chain = Op.getOperand(0);
+    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+    SDOperand LHS = Op.getOperand(2);
+    SDOperand RHS = Op.getOperand(3);
+    SDOperand Dest = Op.getOperand(4);
+    unsigned Opc, SPCC = ~0U;
+    
+    // If this is a br_cc of a "setcc", and if the setcc got lowered into
+    // a CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+    LookThroughSetCC(LHS, RHS, CC, SPCC);
+    
+    // Get the condition flag.
+    SDOperand CompareFlag;
+    if (LHS.getValueType() == MVT::i32) {
+      std::vector<MVT::ValueType> VTs;
+      VTs.push_back(MVT::i32);
+      VTs.push_back(MVT::Flag);
+      SDOperand Ops[2] = { LHS, RHS };
+      CompareFlag = DAG.getNode(SPISD::CMPICC, VTs, Ops, 2).getValue(1);
+      if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
+      Opc = SPISD::BRICC;
+    } else {
+      CompareFlag = DAG.getNode(SPISD::CMPFCC, MVT::Flag, LHS, RHS);
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+      Opc = SPISD::BRFCC;
+    }
+    return DAG.getNode(Opc, MVT::Other, Chain, Dest,
+                       DAG.getConstant(SPCC, MVT::i32), CompareFlag);
+  }
+  case ISD::SELECT_CC: {
+    SDOperand LHS = Op.getOperand(0);
+    SDOperand RHS = Op.getOperand(1);
+    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+    SDOperand TrueVal = Op.getOperand(2);
+    SDOperand FalseVal = Op.getOperand(3);
+    unsigned Opc, SPCC = ~0U;
+
+    // If this is a select_cc of a "setcc", and if the setcc got lowered into
+    // a CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+    LookThroughSetCC(LHS, RHS, CC, SPCC);
+    
+    SDOperand CompareFlag;
+    if (LHS.getValueType() == MVT::i32) {
+      std::vector<MVT::ValueType> VTs;
+      VTs.push_back(LHS.getValueType());   // subcc returns a value
+      VTs.push_back(MVT::Flag);
+      SDOperand Ops[2] = { LHS, RHS };
+      CompareFlag = DAG.getNode(SPISD::CMPICC, VTs, Ops, 2).getValue(1);
+      Opc = SPISD::SELECT_ICC;
+      if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
+    } else {
+      CompareFlag = DAG.getNode(SPISD::CMPFCC, MVT::Flag, LHS, RHS);
+      Opc = SPISD::SELECT_FCC;
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+    }
+    return DAG.getNode(Opc, TrueVal.getValueType(), TrueVal, FalseVal, 
+                       DAG.getConstant(SPCC, MVT::i32), CompareFlag);
+  }
+  case ISD::VASTART: {
+    // vastart just stores the address of the VarArgsFrameIndex slot into the
+    // memory location argument.
+    SDOperand Offset = DAG.getNode(ISD::ADD, MVT::i32,
+                                   DAG.getRegister(SP::I6, MVT::i32),
+                                DAG.getConstant(VarArgsFrameOffset, MVT::i32));
+    SrcValueSDNode *SV = cast<SrcValueSDNode>(Op.getOperand(2));
+    return DAG.getStore(Op.getOperand(0), Offset, 
+                        Op.getOperand(1), SV->getValue(), SV->getOffset());
+  }
+  case ISD::VAARG: {
+    SDNode *Node = Op.Val;
+    MVT::ValueType VT = Node->getValueType(0);
+    SDOperand InChain = Node->getOperand(0);
+    SDOperand VAListPtr = Node->getOperand(1);
+    SrcValueSDNode *SV = cast<SrcValueSDNode>(Node->getOperand(2));
+    SDOperand VAList = DAG.getLoad(getPointerTy(), InChain, VAListPtr,
+                                   SV->getValue(), SV->getOffset());
+    // Increment the pointer, VAList, to the next vaarg
+    SDOperand NextPtr = DAG.getNode(ISD::ADD, getPointerTy(), VAList, 
+                                    DAG.getConstant(MVT::getSizeInBits(VT)/8, 
+                                                    getPointerTy()));
+    // Store the incremented VAList to the legalized pointer
+    InChain = DAG.getStore(VAList.getValue(1), NextPtr,
+                           VAListPtr, SV->getValue(), SV->getOffset());
+    // Load the actual argument out of the pointer VAList, unless this is an
+    // f64 load.
+    if (VT != MVT::f64) {
+      return DAG.getLoad(VT, InChain, VAList, NULL, 0);
+    } else {
+      // Otherwise, load it as i64, then do a bitconvert.
+      SDOperand V = DAG.getLoad(MVT::i64, InChain, VAList, NULL, 0);
+      std::vector<MVT::ValueType> Tys;
+      Tys.push_back(MVT::f64);
+      Tys.push_back(MVT::Other);
+      // Bit-Convert the value to f64.
+      SDOperand Ops[2] = { DAG.getNode(ISD::BIT_CONVERT, MVT::f64, V),
+                           V.getValue(1) };
+      return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops, 2);
+    }
+  }
+  case ISD::DYNAMIC_STACKALLOC: {
+    SDOperand Chain = Op.getOperand(0);  // Legalize the chain.
+    SDOperand Size  = Op.getOperand(1);  // Legalize the size.
+    
+    unsigned SPReg = SP::O6;
+    SDOperand SP = DAG.getCopyFromReg(Chain, SPReg, MVT::i32);
+    SDOperand NewSP = DAG.getNode(ISD::SUB, MVT::i32, SP, Size);    // Value
+    Chain = DAG.getCopyToReg(SP.getValue(1), SPReg, NewSP);      // Output chain
+
+    // The resultant pointer is actually 96 bytes above the new stack pointer,
+    // skipping the 16-word register window spill area (plus the struct-return
+    // and outgoing-argument slots) that every SPARC frame reserves.
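+    // e.g. an alloca of 40 bytes yields NewSP = SP - 40 and hands the program
+    // back NewSP + 96.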
+    SDOperand NewVal = DAG.getNode(ISD::ADD, MVT::i32, NewSP,
+                                   DAG.getConstant(96, MVT::i32));
+    std::vector<MVT::ValueType> Tys;
+    Tys.push_back(MVT::i32);
+    Tys.push_back(MVT::Other);
+    SDOperand Ops[2] = { NewVal, Chain };
+    return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops, 2);
+  }
+  case ISD::RET: {
+    SDOperand Copy;
+    
+    switch(Op.getNumOperands()) {
+    default:
+      assert(0 && "Do not know how to return this many arguments!");
+      abort();
+    case 1: 
+      return SDOperand(); // ret void is legal
+    case 3: {
+      unsigned ArgReg;
+      switch(Op.getOperand(1).getValueType()) {
+      default: assert(0 && "Unknown type to return!");
+      case MVT::i32: ArgReg = SP::I0; break;
+      case MVT::f32: ArgReg = SP::F0; break;
+      case MVT::f64: ArgReg = SP::D0; break;
+      }
+      Copy = DAG.getCopyToReg(Op.getOperand(0), ArgReg, Op.getOperand(1),
+                              SDOperand());
+      break;
+    }
+    case 5:
+      Copy = DAG.getCopyToReg(Op.getOperand(0), SP::I0, Op.getOperand(3), 
+                              SDOperand());
+      Copy = DAG.getCopyToReg(Copy, SP::I1, Op.getOperand(1), Copy.getValue(1));
+      break;
+    }
+    return DAG.getNode(SPISD::RET_FLAG, MVT::Other, Copy, Copy.getValue(1));
+  }
+  // Frame & Return address.  Currently unimplemented
+  case ISD::RETURNADDR:         break;
+  case ISD::FRAMEADDR:          break;
+  }
+  return SDOperand();
+}
+
+MachineBasicBlock *
+SparcTargetLowering::InsertAtEndOfBasicBlock(MachineInstr *MI,
+                                             MachineBasicBlock *BB) {
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  unsigned BROpcode;
+  unsigned CC;
+  // Figure out the conditional branch opcode to use for this select_cc.
+  switch (MI->getOpcode()) {
+  default: assert(0 && "Unknown SELECT_CC!");
+  case SP::SELECT_CC_Int_ICC:
+  case SP::SELECT_CC_FP_ICC:
+  case SP::SELECT_CC_DFP_ICC:
+    BROpcode = SP::BCOND;
+    break;
+  case SP::SELECT_CC_Int_FCC:
+  case SP::SELECT_CC_FP_FCC:
+  case SP::SELECT_CC_DFP_FCC:
+    BROpcode = SP::FBCOND;
+    break;
+  }
+
+  CC = (SPCC::CondCodes)MI->getOperand(3).getImmedValue();
+  
+  // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  ilist<MachineBasicBlock>::iterator It = BB;
+  ++It;
+  
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   [f]bCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineBasicBlock *copy0MBB = new MachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = new MachineBasicBlock(LLVM_BB);
+  BuildMI(BB, TII.get(BROpcode)).addMBB(sinkMBB).addImm(CC);
+  MachineFunction *F = BB->getParent();
+  F->getBasicBlockList().insert(It, copy0MBB);
+  F->getBasicBlockList().insert(It, sinkMBB);
+  // Update machine-CFG edges by first adding all successors of the current
+  // block to the new block which will contain the Phi node for the select.
+  for(MachineBasicBlock::succ_iterator i = BB->succ_begin(), 
+      e = BB->succ_end(); i != e; ++i)
+    sinkMBB->addSuccessor(*i);
+  // Next, remove all successors of the current block, and add the true
+  // and fallthrough blocks as its successors.
+  while(!BB->succ_empty())
+    BB->removeSuccessor(BB->succ_begin());
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+  
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+  
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+  
+  //  sinkMBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BB = sinkMBB;
+  BuildMI(BB, TII.get(SP::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB);
+  
+  delete MI;   // The pseudo instruction is gone now.
+  return BB;
+}
+  
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===--------------------------------------------------------------------===//
+/// SparcDAGToDAGISel - SPARC specific code to select SPARC machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+class SparcDAGToDAGISel : public SelectionDAGISel {
+  SparcTargetLowering Lowering;
+
+  /// Subtarget - Keep a pointer to the Sparc Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const SparcSubtarget &Subtarget;
+public:
+  SparcDAGToDAGISel(TargetMachine &TM)
+    : SelectionDAGISel(Lowering), Lowering(TM),
+      Subtarget(TM.getSubtarget<SparcSubtarget>()) {
+  }
+
+  SDNode *Select(SDOperand Op);
+
+  // Complex Pattern Selectors.
+  bool SelectADDRrr(SDOperand Op, SDOperand N, SDOperand &R1, SDOperand &R2);
+  bool SelectADDRri(SDOperand Op, SDOperand N, SDOperand &Base,
+                    SDOperand &Offset);
+  
+  /// InstructionSelectBasicBlock - This callback is invoked by
+  /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+  virtual void InstructionSelectBasicBlock(SelectionDAG &DAG);
+  
+  virtual const char *getPassName() const {
+    return "SPARC DAG->DAG Pattern Instruction Selection";
+  } 
+  
+  // Include the pieces autogenerated from the target description.
+#include "SparcGenDAGISel.inc"
+};
+}  // end anonymous namespace
+
+/// InstructionSelectBasicBlock - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void SparcDAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
+  DEBUG(BB->dump());
+  
+  // Select target instructions for the DAG.
+  DAG.setRoot(SelectRoot(DAG.getRoot()));
+  DAG.RemoveDeadNodes();
+  
+  // Emit machine code to BB. 
+  ScheduleAndEmitDAG(DAG);
+}
+
+bool SparcDAGToDAGISel::SelectADDRri(SDOperand Op, SDOperand Addr,
+                                     SDOperand &Base, SDOperand &Offset) {
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // direct calls.
+  
+  if (Addr.getOpcode() == ISD::ADD) {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+      if (Predicate_simm13(CN)) {
+        if (FrameIndexSDNode *FIN = 
+                dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
+          // Constant offset from frame ref.
+          Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+        } else {
+          Base = Addr.getOperand(0);
+        }
+        Offset = CurDAG->getTargetConstant(CN->getValue(), MVT::i32);
+        return true;
+      }
+    }
+    if (Addr.getOperand(0).getOpcode() == SPISD::Lo) {
+      Base = Addr.getOperand(1);
+      Offset = Addr.getOperand(0).getOperand(0);
+      return true;
+    }
+    if (Addr.getOperand(1).getOpcode() == SPISD::Lo) {
+      Base = Addr.getOperand(0);
+      Offset = Addr.getOperand(1).getOperand(0);
+      return true;
+    }
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+bool SparcDAGToDAGISel::SelectADDRrr(SDOperand Op, SDOperand Addr,
+                                     SDOperand &R1,  SDOperand &R2) {
+  if (Addr.getOpcode() == ISD::FrameIndex) return false;
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // direct calls.
+  
+  if (Addr.getOpcode() == ISD::ADD) {
+    if (isa<ConstantSDNode>(Addr.getOperand(1)) &&
+        Predicate_simm13(Addr.getOperand(1).Val))
+      return false;  // Let the reg+imm pattern catch this!
+    if (Addr.getOperand(0).getOpcode() == SPISD::Lo ||
+        Addr.getOperand(1).getOpcode() == SPISD::Lo)
+      return false;  // Let the reg+imm pattern catch this!
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+    return true;
+  }
+
+  R1 = Addr;
+  R2 = CurDAG->getRegister(SP::G0, MVT::i32);
+  return true;
+}
+
+SDNode *SparcDAGToDAGISel::Select(SDOperand Op) {
+  SDNode *N = Op.Val;
+  if (N->getOpcode() >= ISD::BUILTIN_OP_END &&
+      N->getOpcode() < SPISD::FIRST_NUMBER)
+    return NULL;   // Already selected.
+
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::SDIV:
+  case ISD::UDIV: {
+    // FIXME: should use a custom expander to expose the SRA to the dag.
+    SDOperand DivLHS = N->getOperand(0);
+    SDOperand DivRHS = N->getOperand(1);
+    AddToISelQueue(DivLHS);
+    AddToISelQueue(DivRHS);
+    
+    // Set the Y register to the high-part.
+    SDOperand TopPart;
+    if (N->getOpcode() == ISD::SDIV) {
+      TopPart = SDOperand(CurDAG->getTargetNode(SP::SRAri, MVT::i32, DivLHS,
+                                   CurDAG->getTargetConstant(31, MVT::i32)), 0);
+    } else {
+      TopPart = CurDAG->getRegister(SP::G0, MVT::i32);
+    }
+    TopPart = SDOperand(CurDAG->getTargetNode(SP::WRYrr, MVT::Flag, TopPart,
+                                     CurDAG->getRegister(SP::G0, MVT::i32)), 0);
+
+    // FIXME: Handle div by immediate.
+    unsigned Opcode = N->getOpcode() == ISD::SDIV ? SP::SDIVrr : SP::UDIVrr;
+    return CurDAG->SelectNodeTo(N, Opcode, MVT::i32, DivLHS, DivRHS,
+                                TopPart);
+  }    
+  case ISD::MULHU:
+  case ISD::MULHS: {
+    // FIXME: Handle mul by immediate.
+    SDOperand MulLHS = N->getOperand(0);
+    SDOperand MulRHS = N->getOperand(1);
+    AddToISelQueue(MulLHS);
+    AddToISelQueue(MulRHS);
+    unsigned Opcode = N->getOpcode() == ISD::MULHU ? SP::UMULrr : SP::SMULrr;
+    SDNode *Mul = CurDAG->getTargetNode(Opcode, MVT::i32, MVT::Flag,
+                                        MulLHS, MulRHS);
+    // The high part is in the Y register.
+    return CurDAG->SelectNodeTo(N, SP::RDY, MVT::i32, SDOperand(Mul, 1));
+  }
+  }
+  
+  return SelectCode(Op);
+}
+
+
+/// createSparcISelDag - This pass converts a legalized DAG into a 
+/// SPARC-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createSparcISelDag(TargetMachine &TM) {
+  return new SparcDAGToDAGISel(TM);
+}
diff --git a/lib/Target/Sparc/SparcInstrFormats.td b/lib/Target/Sparc/SparcInstrFormats.td
new file mode 100644
index 0000000..f463ab8
--- /dev/null
+++ b/lib/Target/Sparc/SparcInstrFormats.td
@@ -0,0 +1,113 @@
+//===- SparcInstrFormats.td - Sparc Instruction Formats ----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+class InstSP<dag ops, string asmstr, list<dag> pattern> : Instruction {
+  field bits<32> Inst;
+
+  let Namespace = "SP";
+
+  bits<2> op;
+  let Inst{31-30} = op;               // Top two bits are the 'op' field
+  
+  dag OperandList = ops;
+  let AsmString   = asmstr;
+  let Pattern = pattern;
+}
+
+//===----------------------------------------------------------------------===//
+// Format #2 instruction classes in the SPARC architecture
+//===----------------------------------------------------------------------===//
+
+// Format 2 instructions
+class F2<dag ops, string asmstr, list<dag> pattern>
+   : InstSP<ops, asmstr, pattern> {
+  bits<3>  op2;
+  bits<22> imm22;
+  let op          = 0;    // op = 0
+  let Inst{24-22} = op2;
+  let Inst{21-0}  = imm22;
+}
+
+// Specific F2 classes: SparcV8 manual, page 44
+//
+class F2_1<bits<3> op2Val, dag ops, string asmstr, list<dag> pattern>
+   : F2<ops, asmstr, pattern> {
+  bits<5>  rd;
+
+  let op2         = op2Val;
+
+  let Inst{29-25} = rd;
+}
+
+class F2_2<bits<4> condVal, bits<3> op2Val, dag ops, string asmstr, 
+           list<dag> pattern> : F2<ops, asmstr, pattern> {
+  bits<4>   cond;
+  bit       annul = 0;     // currently unused
+
+  let cond        = condVal;
+  let op2         = op2Val;
+
+  let Inst{29}    = annul;
+  let Inst{28-25} = cond;
+}
+
+//===----------------------------------------------------------------------===//
+// Format #3 instruction classes in the SPARC architecture
+//===----------------------------------------------------------------------===//
+
+class F3<dag ops, string asmstr, list<dag> pattern>
+    : InstSP<ops, asmstr, pattern> {
+  bits<5> rd;
+  bits<6> op3;
+  bits<5> rs1;
+  let op{1} = 1;   // Op = 2 or 3
+  let Inst{29-25} = rd;
+  let Inst{24-19} = op3;
+  let Inst{18-14} = rs1;
+}
+
+// Specific F3 classes: SparcV8 manual, page 44
+//
+class F3_1<bits<2> opVal, bits<6> op3val, dag ops,
+           string asmstr, list<dag> pattern> : F3<ops, asmstr, pattern> {
+  bits<8> asi = 0; // asi not currently used
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13}   = 0;     // i field = 0
+  let Inst{12-5} = asi;   // address space identifier
+  let Inst{4-0}  = rs2;
+}
+
+class F3_2<bits<2> opVal, bits<6> op3val, dag ops, 
+           string asmstr, list<dag> pattern> : F3<ops, asmstr, pattern> {
+  bits<13> simm13;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13}   = 1;     // i field = 1
+  let Inst{12-0} = simm13;
+}
+
+// floating-point
+class F3_3<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag ops,
+           string asmstr, list<dag> pattern> : F3<ops, asmstr, pattern> {
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13-5} = opfval;   // fp opcode
+  let Inst{4-0}  = rs2;
+}
+
+
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
new file mode 100644
index 0000000..a8c822a
--- /dev/null
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -0,0 +1,108 @@
+//===- SparcInstrInfo.cpp - Sparc Instruction Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcInstrInfo.h"
+#include "Sparc.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "SparcGenInstrInfo.inc"
+using namespace llvm;
+
+SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
+  : TargetInstrInfo(SparcInsts, sizeof(SparcInsts)/sizeof(SparcInsts[0])),
+    RI(ST, *this) {
+}
+
+static bool isZeroImm(const MachineOperand &op) {
+  return op.isImmediate() && op.getImmedValue() == 0;
+}
+
+/// Return true if the instruction is a register to register move and
+/// leave the source and dest operands in the passed parameters.
+///
+bool SparcInstrInfo::isMoveInstr(const MachineInstr &MI,
+                                 unsigned &SrcReg, unsigned &DstReg) const {
+  // We look for 3 kinds of patterns here:
+  // or with G0 or 0
+  // add with G0 or 0
+  // fmovs or FpMOVD (pseudo double move).
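+  // e.g. "or %g0, %i0, %o0" (the expansion of the synthetic "mov %i0, %o0")
+  // is recognized as a move from %i0 to %o0.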
+  if (MI.getOpcode() == SP::ORrr || MI.getOpcode() == SP::ADDrr) {
+    if (MI.getOperand(1).getReg() == SP::G0) {
+      DstReg = MI.getOperand(0).getReg();
+      SrcReg = MI.getOperand(2).getReg();
+      return true;
+    } else if (MI.getOperand(2).getReg() == SP::G0) {
+      DstReg = MI.getOperand(0).getReg();
+      SrcReg = MI.getOperand(1).getReg();
+      return true;
+    }
+  } else if ((MI.getOpcode() == SP::ORri || MI.getOpcode() == SP::ADDri) &&
+             isZeroImm(MI.getOperand(2)) && MI.getOperand(1).isRegister()) {
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    return true;
+  } else if (MI.getOpcode() == SP::FMOVS || MI.getOpcode() == SP::FpMOVD ||
+             MI.getOpcode() == SP::FMOVD) {
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    return true;
+  }
+  return false;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned SparcInstrInfo::isLoadFromStackSlot(MachineInstr *MI,
+                                             int &FrameIndex) const {
+  if (MI->getOpcode() == SP::LDri ||
+      MI->getOpcode() == SP::LDFri ||
+      MI->getOpcode() == SP::LDDFri) {
+    if (MI->getOperand(1).isFrameIndex() && MI->getOperand(2).isImmediate() &&
+        MI->getOperand(2).getImmedValue() == 0) {
+      FrameIndex = MI->getOperand(1).getFrameIndex();
+      return MI->getOperand(0).getReg();
+    }
+  }
+  return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the stack slot stored to.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned SparcInstrInfo::isStoreToStackSlot(MachineInstr *MI,
+                                            int &FrameIndex) const {
+  if (MI->getOpcode() == SP::STri ||
+      MI->getOpcode() == SP::STFri ||
+      MI->getOpcode() == SP::STDFri) {
+    if (MI->getOperand(0).isFrameIndex() && MI->getOperand(1).isImmediate() &&
+        MI->getOperand(1).getImmedValue() == 0) {
+      FrameIndex = MI->getOperand(0).getFrameIndex();
+      return MI->getOperand(2).getReg();
+    }
+  }
+  return 0;
+}
+
+unsigned
+SparcInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
+                             MachineBasicBlock *FBB,
+                             const std::vector<MachineOperand> &Cond)const{
+  // Can only insert uncond branches so far.
+  assert(Cond.empty() && !FBB && TBB && "Can only handle uncond branches!");
+  BuildMI(&MBB, get(SP::BA)).addMBB(TBB);
+  return 1;
+}
diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h
new file mode 100644
index 0000000..3fb50ff
--- /dev/null
+++ b/lib/Target/Sparc/SparcInstrInfo.h
@@ -0,0 +1,73 @@
+//===- SparcInstrInfo.h - Sparc Instruction Information ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCINSTRUCTIONINFO_H
+#define SPARCINSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "SparcRegisterInfo.h"
+
+namespace llvm {
+
+/// SPII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace SPII {
+  enum {
+    Pseudo = (1<<0),
+    Load = (1<<1),
+    Store = (1<<2),
+    DelaySlot = (1<<3)
+  };
+}
+
+class SparcInstrInfo : public TargetInstrInfo {
+  const SparcRegisterInfo RI;
+public:
+  SparcInstrInfo(SparcSubtarget &ST);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegisterInfo.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const MRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// Return true if the instruction is a register to register move and
+  /// leave the source and dest operands in the passed parameters.
+  ///
+  virtual bool isMoveInstr(const MachineInstr &MI,
+                           unsigned &SrcReg, unsigned &DstReg) const;
+  
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  virtual unsigned isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const;
+  
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the stack slot stored to.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  virtual unsigned isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const;
+  
+  
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                                const std::vector<MachineOperand> &Cond) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
new file mode 100644
index 0000000..434e8d7
--- /dev/null
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -0,0 +1,776 @@
+//===- SparcInstrInfo.td - Target Description for Sparc Target ------------===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Sparc instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "SparcInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Feature predicates.
+//===----------------------------------------------------------------------===//
+
+// HasV9 - This predicate is true when the target processor supports V9
+// instructions.  Note that the machine may be running in 32-bit mode.
+def HasV9   : Predicate<"Subtarget.isV9()">;
+
+// HasNoV9 - This predicate is true when the target doesn't have V9
+// instructions.  Use of this is just a hack for the isel not having proper
+// costs for V8 instructions that are more expensive than their V9 ones.
+def HasNoV9 : Predicate<"!Subtarget.isV9()">;
+
+// HasVIS - This is true when the target processor has VIS extensions.
+def HasVIS : Predicate<"Subtarget.isVIS()">;
+
+// UseDeprecatedInsts - This predicate is true when the target processor is a
+// V8, or when it is V9 but the V8 deprecated instructions are efficient enough
+// to use when appropriate.  In either of these cases, the instruction selector
+// will pick deprecated instructions.
+def UseDeprecatedInsts : Predicate<"Subtarget.useDeprecatedV8Instructions()">;
+
+//===----------------------------------------------------------------------===//
+// Instruction Pattern Stuff
+//===----------------------------------------------------------------------===//
+
+def simm11  : PatLeaf<(imm), [{
+  // simm11 predicate - True if the imm fits in a 11-bit sign extended field.
+  return (((int)N->getValue() << (32-11)) >> (32-11)) == (int)N->getValue();
+}]>;
+
+def simm13  : PatLeaf<(imm), [{
+  // simm13 predicate - True if the imm fits in a 13-bit sign extended field.
+  return (((int)N->getValue() << (32-13)) >> (32-13)) == (int)N->getValue();
+}]>;
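+// e.g. 4095 and -4096 satisfy simm13, but 4096 does not.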
+
+def LO10 : SDNodeXForm<imm, [{
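+  // Transformation function: get the low 10 bits.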
+  return CurDAG->getTargetConstant((unsigned)N->getValue() & 1023, MVT::i32);
+}]>;
+
+def HI22 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return CurDAG->getTargetConstant((unsigned)N->getValue() >> 10, MVT::i32);
+}]>;
+
+def SETHIimm : PatLeaf<(imm), [{
+  return (((unsigned)N->getValue() >> 10) << 10) == (unsigned)N->getValue();
+}], HI22>;
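+
+// A 32-bit constant is typically materialized as "sethi %hi(val), reg" (upper
+// 22 bits) followed by "or reg, %lo(val), reg" (low 10 bits); HI22 and LO10
+// compute those two fields.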
+
+// Addressing modes.
+def ADDRrr : ComplexPattern<i32, 2, "SelectADDRrr", [], []>;
+def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], []>;
+
+// Address operands
+def MEMrr : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops IntRegs, IntRegs);
+}
+def MEMri : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops IntRegs, i32imm);
+}
+
+// Branch targets have OtherVT type.
+def brtarget : Operand<OtherVT>;
+def calltarget : Operand<i32>;
+
+// Operand for printing out a condition code.
+let PrintMethod = "printCCOperand" in
+  def CCOp : Operand<i32>;
+
+def SDTSPcmpfcc : 
+SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>]>;
+def SDTSPbrcc : 
+SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
+def SDTSPselectcc :
+SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>]>;
+def SDTSPFTOI :
+SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
+def SDTSPITOF :
+SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
+
+def SPcmpicc : SDNode<"SPISD::CMPICC", SDTIntBinOp, [SDNPOutFlag]>;
+def SPcmpfcc : SDNode<"SPISD::CMPFCC", SDTSPcmpfcc, [SDNPOutFlag]>;
+def SPbricc : SDNode<"SPISD::BRICC", SDTSPbrcc, [SDNPHasChain, SDNPInFlag]>;
+def SPbrfcc : SDNode<"SPISD::BRFCC", SDTSPbrcc, [SDNPHasChain, SDNPInFlag]>;
+
+def SPhi    : SDNode<"SPISD::Hi", SDTIntUnaryOp>;
+def SPlo    : SDNode<"SPISD::Lo", SDTIntUnaryOp>;
+
+def SPftoi  : SDNode<"SPISD::FTOI", SDTSPFTOI>;
+def SPitof  : SDNode<"SPISD::ITOF", SDTSPITOF>;
+
+def SPselecticc : SDNode<"SPISD::SELECT_ICC", SDTSPselectcc, [SDNPInFlag]>;
+def SPselectfcc : SDNode<"SPISD::SELECT_FCC", SDTSPselectcc, [SDNPInFlag]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_SPCallSeq : SDTypeProfile<0, 1, [ SDTCisVT<0, i32> ]>;
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPCallSeq,
+                           [SDNPHasChain, SDNPOutFlag]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_SPCallSeq,
+                           [SDNPHasChain, SDNPOutFlag]>;
+
+def SDT_SPCall    : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def call          : SDNode<"SPISD::CALL", SDT_SPCall,
+	                   [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def SDT_SPRetFlag : SDTypeProfile<0, 0, []>;
+def retflag       : SDNode<"SPISD::RET_FLAG", SDT_SPRetFlag,
+	                   [SDNPHasChain, SDNPOptInFlag]>;
+
+//===----------------------------------------------------------------------===//
+// SPARC Flag Conditions
+//===----------------------------------------------------------------------===//
+
+// Note that these values must be kept in sync with the CCOp::CondCode enum
+// values.
+class ICC_VAL<int N> : PatLeaf<(i32 N)>;
+def ICC_NE  : ICC_VAL< 9>;  // Not Equal
+def ICC_E   : ICC_VAL< 1>;  // Equal
+def ICC_G   : ICC_VAL<10>;  // Greater
+def ICC_LE  : ICC_VAL< 2>;  // Less or Equal
+def ICC_GE  : ICC_VAL<11>;  // Greater or Equal
+def ICC_L   : ICC_VAL< 3>;  // Less
+def ICC_GU  : ICC_VAL<12>;  // Greater Unsigned
+def ICC_LEU : ICC_VAL< 4>;  // Less or Equal Unsigned
+def ICC_CC  : ICC_VAL<13>;  // Carry Clear/Greater or Equal Unsigned
+def ICC_CS  : ICC_VAL< 5>;  // Carry Set/Less Unsigned
+def ICC_POS : ICC_VAL<14>;  // Positive
+def ICC_NEG : ICC_VAL< 6>;  // Negative
+def ICC_VC  : ICC_VAL<15>;  // Overflow Clear
+def ICC_VS  : ICC_VAL< 7>;  // Overflow Set
+
+class FCC_VAL<int N> : PatLeaf<(i32 N)>;
+def FCC_U   : FCC_VAL<23>;  // Unordered
+def FCC_G   : FCC_VAL<22>;  // Greater
+def FCC_UG  : FCC_VAL<21>;  // Unordered or Greater
+def FCC_L   : FCC_VAL<20>;  // Less
+def FCC_UL  : FCC_VAL<19>;  // Unordered or Less
+def FCC_LG  : FCC_VAL<18>;  // Less or Greater
+def FCC_NE  : FCC_VAL<17>;  // Not Equal
+def FCC_E   : FCC_VAL<25>;  // Equal
+def FCC_UE  : FCC_VAL<24>;  // Unordered or Equal
+def FCC_GE  : FCC_VAL<25>;  // Greater or Equal
+def FCC_UGE : FCC_VAL<26>;  // Unordered or Greater or Equal
+def FCC_LE  : FCC_VAL<27>;  // Less or Equal
+def FCC_ULE : FCC_VAL<28>;  // Unordered or Less or Equal
+def FCC_O   : FCC_VAL<29>;  // Ordered
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+/// F3_12 multiclass - Define a normal F3_1/F3_2 pattern in one shot.
+multiclass F3_12<string OpcStr, bits<6> Op3Val, SDNode OpNode> {
+  def rr  : F3_1<2, Op3Val, 
+                 (ops IntRegs:$dst, IntRegs:$b, IntRegs:$c),
+                 !strconcat(OpcStr, " $b, $c, $dst"),
+                 [(set IntRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>;
+  def ri  : F3_2<2, Op3Val,
+                 (ops IntRegs:$dst, IntRegs:$b, i32imm:$c),
+                 !strconcat(OpcStr, " $b, $c, $dst"),
+                 [(set IntRegs:$dst, (OpNode IntRegs:$b, simm13:$c))]>;
+}
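+// For instance, a later "defm ADD : F3_12<"add", 0b000000, add>;" would expand
+// into both an ADDrr and an ADDri instruction with the matching patterns.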
+
+/// F3_12np multiclass - Define a normal F3_1/F3_2 pattern in one shot, with no
+/// pattern.
+multiclass F3_12np<string OpcStr, bits<6> Op3Val> {
+  def rr  : F3_1<2, Op3Val, 
+                 (ops IntRegs:$dst, IntRegs:$b, IntRegs:$c),
+                 !strconcat(OpcStr, " $b, $c, $dst"), []>;
+  def ri  : F3_2<2, Op3Val,
+                 (ops IntRegs:$dst, IntRegs:$b, i32imm:$c),
+                 !strconcat(OpcStr, " $b, $c, $dst"), []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+// Pseudo instructions.
+class Pseudo<dag ops, string asmstr, list<dag> pattern>
+   : InstSP<ops, asmstr, pattern>;
+
+def ADJCALLSTACKDOWN : Pseudo<(ops i32imm:$amt),
+                               "!ADJCALLSTACKDOWN $amt",
+                               [(callseq_start imm:$amt)]>, Imp<[O6],[O6]>;
+def ADJCALLSTACKUP : Pseudo<(ops i32imm:$amt),
+                            "!ADJCALLSTACKUP $amt",
+                            [(callseq_end imm:$amt)]>, Imp<[O6],[O6]>;
+def IMPLICIT_DEF_Int : Pseudo<(ops IntRegs:$dst),
+                              "!IMPLICIT_DEF $dst",
+                              [(set IntRegs:$dst, (undef))]>;
+def IMPLICIT_DEF_FP  : Pseudo<(ops FPRegs:$dst), "!IMPLICIT_DEF $dst",
+                              [(set FPRegs:$dst, (undef))]>;
+def IMPLICIT_DEF_DFP : Pseudo<(ops DFPRegs:$dst), "!IMPLICIT_DEF $dst",
+                              [(set DFPRegs:$dst, (undef))]>;
+                              
+// FpMOVD/FpNEGD/FpABSD - These are lowered to single-precision ops by the 
+// fpmover pass.
+let Predicates = [HasNoV9] in {  // Only emit these in V8 mode.
+  def FpMOVD : Pseudo<(ops DFPRegs:$dst, DFPRegs:$src),
+                      "!FpMOVD $src, $dst", []>;
+  def FpNEGD : Pseudo<(ops DFPRegs:$dst, DFPRegs:$src),
+                      "!FpNEGD $src, $dst",
+                      [(set DFPRegs:$dst, (fneg DFPRegs:$src))]>;
+  def FpABSD : Pseudo<(ops DFPRegs:$dst, DFPRegs:$src),
+                      "!FpABSD $src, $dst",
+                      [(set DFPRegs:$dst, (fabs DFPRegs:$src))]>;
+}
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded by the
+// scheduler into a branch sequence.  This has to handle all permutations of
+// selection between i32/f32/f64 on ICC and FCC.
+let usesCustomDAGSchedInserter = 1 in {   // Expanded by the scheduler.
+  def SELECT_CC_Int_ICC
+   : Pseudo<(ops IntRegs:$dst, IntRegs:$T, IntRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_Int_ICC PSEUDO!",
+            [(set IntRegs:$dst, (SPselecticc IntRegs:$T, IntRegs:$F,
+                                             imm:$Cond))]>;
+  def SELECT_CC_Int_FCC
+   : Pseudo<(ops IntRegs:$dst, IntRegs:$T, IntRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_Int_FCC PSEUDO!",
+            [(set IntRegs:$dst, (SPselectfcc IntRegs:$T, IntRegs:$F,
+                                             imm:$Cond))]>;
+  def SELECT_CC_FP_ICC
+   : Pseudo<(ops FPRegs:$dst, FPRegs:$T, FPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_FP_ICC PSEUDO!",
+            [(set FPRegs:$dst, (SPselecticc FPRegs:$T, FPRegs:$F,
+                                            imm:$Cond))]>;
+  def SELECT_CC_FP_FCC
+   : Pseudo<(ops FPRegs:$dst, FPRegs:$T, FPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_FP_FCC PSEUDO!",
+            [(set FPRegs:$dst, (SPselectfcc FPRegs:$T, FPRegs:$F,
+                                            imm:$Cond))]>;
+  def SELECT_CC_DFP_ICC
+   : Pseudo<(ops DFPRegs:$dst, DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_DFP_ICC PSEUDO!",
+            [(set DFPRegs:$dst, (SPselecticc DFPRegs:$T, DFPRegs:$F,
+                                             imm:$Cond))]>;
+  def SELECT_CC_DFP_FCC
+   : Pseudo<(ops DFPRegs:$dst, DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_DFP_FCC PSEUDO!",
+            [(set DFPRegs:$dst, (SPselectfcc DFPRegs:$T, DFPRegs:$F,
+                                             imm:$Cond))]>;
+}
+
+
+// Section A.3 - Synthetic Instructions, p. 85
+// special cases of JMPL:
+let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, noResults = 1 in {
+  let rd = O7.Num, rs1 = G0.Num, simm13 = 8 in
+    def RETL: F3_2<2, 0b111000, (ops), "retl", [(retflag)]>;
+}
+
+// Section B.1 - Load Integer Instructions, p. 90
+def LDSBrr : F3_1<3, 0b001001,
+                  (ops IntRegs:$dst, MEMrr:$addr),
+                  "ldsb [$addr], $dst",
+                  [(set IntRegs:$dst, (sextloadi8 ADDRrr:$addr))]>;
+def LDSBri : F3_2<3, 0b001001,
+                  (ops IntRegs:$dst, MEMri:$addr),
+                  "ldsb [$addr], $dst",
+                  [(set IntRegs:$dst, (sextloadi8 ADDRri:$addr))]>;
+def LDSHrr : F3_1<3, 0b001010,
+                  (ops IntRegs:$dst, MEMrr:$addr),
+                  "ldsh [$addr], $dst",
+                  [(set IntRegs:$dst, (sextloadi16 ADDRrr:$addr))]>;
+def LDSHri : F3_2<3, 0b001010,
+                  (ops IntRegs:$dst, MEMri:$addr),
+                  "ldsh [$addr], $dst",
+                  [(set IntRegs:$dst, (sextloadi16 ADDRri:$addr))]>;
+def LDUBrr : F3_1<3, 0b000001,
+                  (ops IntRegs:$dst, MEMrr:$addr),
+                  "ldub [$addr], $dst",
+                  [(set IntRegs:$dst, (zextloadi8 ADDRrr:$addr))]>;
+def LDUBri : F3_2<3, 0b000001,
+                  (ops IntRegs:$dst, MEMri:$addr),
+                  "ldub [$addr], $dst",
+                  [(set IntRegs:$dst, (zextloadi8 ADDRri:$addr))]>;
+def LDUHrr : F3_1<3, 0b000010,
+                  (ops IntRegs:$dst, MEMrr:$addr),
+                  "lduh [$addr], $dst",
+                  [(set IntRegs:$dst, (zextloadi16 ADDRrr:$addr))]>;
+def LDUHri : F3_2<3, 0b000010,
+                  (ops IntRegs:$dst, MEMri:$addr),
+                  "lduh [$addr], $dst",
+                  [(set IntRegs:$dst, (zextloadi16 ADDRri:$addr))]>;
+def LDrr   : F3_1<3, 0b000000,
+                  (ops IntRegs:$dst, MEMrr:$addr),
+                  "ld [$addr], $dst",
+                  [(set IntRegs:$dst, (load ADDRrr:$addr))]>;
+def LDri   : F3_2<3, 0b000000,
+                  (ops IntRegs:$dst, MEMri:$addr),
+                  "ld [$addr], $dst",
+                  [(set IntRegs:$dst, (load ADDRri:$addr))]>;
+
+// Section B.2 - Load Floating-point Instructions, p. 92
+def LDFrr  : F3_1<3, 0b100000,
+                  (ops FPRegs:$dst, MEMrr:$addr),
+                  "ld [$addr], $dst",
+                  [(set FPRegs:$dst, (load ADDRrr:$addr))]>;
+def LDFri  : F3_2<3, 0b100000,
+                  (ops FPRegs:$dst, MEMri:$addr),
+                  "ld [$addr], $dst",
+                  [(set FPRegs:$dst, (load ADDRri:$addr))]>;
+def LDDFrr : F3_1<3, 0b100011,
+                  (ops DFPRegs:$dst, MEMrr:$addr),
+                  "ldd [$addr], $dst",
+                  [(set DFPRegs:$dst, (load ADDRrr:$addr))]>;
+def LDDFri : F3_2<3, 0b100011,
+                  (ops DFPRegs:$dst, MEMri:$addr),
+                  "ldd [$addr], $dst",
+                  [(set DFPRegs:$dst, (load ADDRri:$addr))]>;
+
+// Section B.4 - Store Integer Instructions, p. 95
+def STBrr : F3_1<3, 0b000101,
+                 (ops MEMrr:$addr, IntRegs:$src),
+                 "stb $src, [$addr]",
+                 [(truncstorei8 IntRegs:$src, ADDRrr:$addr)]>;
+def STBri : F3_2<3, 0b000101,
+                 (ops MEMri:$addr, IntRegs:$src),
+                 "stb $src, [$addr]",
+                 [(truncstorei8 IntRegs:$src, ADDRri:$addr)]>;
+def STHrr : F3_1<3, 0b000110,
+                 (ops MEMrr:$addr, IntRegs:$src),
+                 "sth $src, [$addr]",
+                 [(truncstorei16 IntRegs:$src, ADDRrr:$addr)]>;
+def STHri : F3_2<3, 0b000110,
+                 (ops MEMri:$addr, IntRegs:$src),
+                 "sth $src, [$addr]",
+                 [(truncstorei16 IntRegs:$src, ADDRri:$addr)]>;
+def STrr  : F3_1<3, 0b000100,
+                 (ops MEMrr:$addr, IntRegs:$src),
+                 "st $src, [$addr]",
+                 [(store IntRegs:$src, ADDRrr:$addr)]>;
+def STri  : F3_2<3, 0b000100,
+                 (ops MEMri:$addr, IntRegs:$src),
+                 "st $src, [$addr]",
+                 [(store IntRegs:$src, ADDRri:$addr)]>;
+
+// Section B.5 - Store Floating-point Instructions, p. 97
+def STFrr   : F3_1<3, 0b100100,
+                   (ops MEMrr:$addr, FPRegs:$src),
+                   "st $src, [$addr]",
+                   [(store FPRegs:$src, ADDRrr:$addr)]>;
+def STFri   : F3_2<3, 0b100100,
+                   (ops MEMri:$addr, FPRegs:$src),
+                   "st $src, [$addr]",
+                   [(store FPRegs:$src, ADDRri:$addr)]>;
+def STDFrr  : F3_1<3, 0b100111,
+                   (ops MEMrr:$addr, DFPRegs:$src),
+                   "std $src, [$addr]",
+                   [(store DFPRegs:$src, ADDRrr:$addr)]>;
+def STDFri  : F3_2<3, 0b100111,
+                   (ops MEMri:$addr, DFPRegs:$src),
+                   "std $src, [$addr]",
+                   [(store DFPRegs:$src, ADDRri:$addr)]>;
+
+// Section B.9 - SETHI Instruction, p. 104
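+// sethi places its 22-bit immediate into bits 31-10 of $dst and zeroes the
+// low 10 bits; combined with an "or" of the low 10 bits it can build any
+// 32-bit constant (see the immediate patterns later in this file).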
+def SETHIi: F2_1<0b100,
+                 (ops IntRegs:$dst, i32imm:$src),
+                 "sethi $src, $dst",
+                 [(set IntRegs:$dst, SETHIimm:$src)]>;
+
+// Section B.10 - NOP Instruction, p. 105
+// (It's a special case of SETHI)
+let rd = 0, imm22 = 0 in
+  def NOP : F2_1<0b100, (ops), "nop", []>;
+
+// Section B.11 - Logical Instructions, p. 106
+defm AND    : F3_12<"and", 0b000001, and>;
+
+def ANDNrr  : F3_1<2, 0b000101,
+                   (ops IntRegs:$dst, IntRegs:$b, IntRegs:$c),
+                   "andn $b, $c, $dst",
+                   [(set IntRegs:$dst, (and IntRegs:$b, (not IntRegs:$c)))]>;
+def ANDNri  : F3_2<2, 0b000101,
+                   (ops IntRegs:$dst, IntRegs:$b, i32imm:$c),
+                   "andn $b, $c, $dst", []>;
+
+defm OR     : F3_12<"or", 0b000010, or>;
+
+def ORNrr   : F3_1<2, 0b000110,
+                   (ops IntRegs:$dst, IntRegs:$b, IntRegs:$c),
+                   "orn $b, $c, $dst",
+                   [(set IntRegs:$dst, (or IntRegs:$b, (not IntRegs:$c)))]>;
+def ORNri   : F3_2<2, 0b000110,
+                   (ops IntRegs:$dst, IntRegs:$b, i32imm:$c),
+                   "orn $b, $c, $dst", []>;
+defm XOR    : F3_12<"xor", 0b000011, xor>;
+
+def XNORrr  : F3_1<2, 0b000111,
+                   (ops IntRegs:$dst, IntRegs:$b, IntRegs:$c),
+                   "xnor $b, $c, $dst",
+                   [(set IntRegs:$dst, (not (xor IntRegs:$b, IntRegs:$c)))]>;
+def XNORri  : F3_2<2, 0b000111,
+                   (ops IntRegs:$dst, IntRegs:$b, i32imm:$c),
+                   "xnor $b, $c, $dst", []>;
+
+// Section B.12 - Shift Instructions, p. 107
+defm SLL : F3_12<"sll", 0b100101, shl>;
+defm SRL : F3_12<"srl", 0b100110, srl>;
+defm SRA : F3_12<"sra", 0b100111, sra>;
+
+// Section B.13 - Add Instructions, p. 108
+defm ADD   : F3_12<"add", 0b000000, add>;
+
+// "LEA" forms of add (patterns to make tblgen happy)
+def LEA_ADDri   : F3_2<2, 0b000000,
+                   (ops IntRegs:$dst, MEMri:$addr),
+                   "add ${addr:arith}, $dst",
+                   [(set IntRegs:$dst, ADDRri:$addr)]>;
+                   
+defm ADDCC  : F3_12<"addcc", 0b010000, addc>;
+defm ADDX  : F3_12<"addx", 0b001000, adde>;
+
+// Section B.15 - Subtract Instructions, p. 110
+defm SUB    : F3_12  <"sub"  , 0b000100, sub>;
+defm SUBX   : F3_12  <"subx" , 0b001100, sube>;
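+// Note: subcc also serves as the integer compare; SPARC compares by
+// subtracting and setting the integer condition codes, which is why the
+// SPcmpicc node selects to it.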
+defm SUBCC  : F3_12  <"subcc", 0b010100, SPcmpicc>;
+
+def SUBXCCrr: F3_1<2, 0b011100, 
+                   (ops IntRegs:$dst, IntRegs:$b, IntRegs:$c),
+                   "subxcc $b, $c, $dst", []>;
+
+// Section B.18 - Multiply Instructions, p. 113
+defm UMUL : F3_12np<"umul", 0b001010>;
+defm SMUL : F3_12  <"smul", 0b001011, mul>;
+
+
+// Section B.19 - Divide Instructions, p. 115
+defm UDIV : F3_12np<"udiv", 0b001110>;
+defm SDIV : F3_12np<"sdiv", 0b001111>;
+
+// Section B.20 - SAVE and RESTORE, p. 117
+defm SAVE    : F3_12np<"save"   , 0b111100>;
+defm RESTORE : F3_12np<"restore", 0b111101>;
+
+// Section B.21 - Branch on Integer Condition Codes Instructions, p. 119
+
+// conditional branch class:
+class BranchSP<bits<4> cc, dag ops, string asmstr, list<dag> pattern>
+ : F2_2<cc, 0b010, ops, asmstr, pattern> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+  let noResults = 1;
+}
+
+let isBarrier = 1 in
+  def BA   : BranchSP<0b1000, (ops brtarget:$dst),
+                      "ba $dst",
+                      [(br bb:$dst)]>;
+                      
+// FIXME: the encoding for the JIT should look at the condition field.
+def BCOND : BranchSP<0, (ops brtarget:$dst, CCOp:$cc),
+                     "b$cc $dst",
+                     [(SPbricc bb:$dst, imm:$cc)]>;
+
+
+// Section B.22 - Branch on Floating-point Condition Codes Instructions, p. 121
+
+// floating-point conditional branch class:
+class FPBranchSP<bits<4> cc, dag ops, string asmstr, list<dag> pattern>
+ : F2_2<cc, 0b110, ops, asmstr, pattern> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+  let noResults = 1;
+}
+
+// FIXME: the encoding for the JIT should look at the condition field.
+def FBCOND  : FPBranchSP<0, (ops brtarget:$dst, CCOp:$cc),
+                      "fb$cc $dst",
+                      [(SPbrfcc bb:$dst, imm:$cc)]>;
+
+
+// Section B.24 - Call and Link Instruction, p. 125
+// This is the only Format 1 instruction
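+// The 30-bit displacement is a word offset from the call site; %o7 receives
+// the return address, and the caller-saved registers listed below are
+// clobbered across the call.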
+let Uses = [O0, O1, O2, O3, O4, O5],
+    hasDelaySlot = 1, isCall = 1, noResults = 1,
+    Defs = [O0, O1, O2, O3, O4, O5, O7, G1, G2, G3, G4, G5, G6, G7,
+    D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15] in { 
+  def CALL : InstSP<(ops calltarget:$dst),
+                    "call $dst", []> {
+    bits<30> disp;
+    let op = 1;
+    let Inst{29-0} = disp;
+  }
+  
+  // indirect calls
+  def JMPLrr : F3_1<2, 0b111000,
+                    (ops MEMrr:$ptr),
+                    "call $ptr",
+                    [(call  ADDRrr:$ptr)]>;
+  def JMPLri : F3_2<2, 0b111000,
+                    (ops MEMri:$ptr),
+                    "call $ptr",
+                    [(call  ADDRri:$ptr)]>;
+}
+
+// Section B.28 - Read State Register Instructions
+def RDY : F3_1<2, 0b101000,
+               (ops IntRegs:$dst),
+               "rd %y, $dst", []>;
+
+// Section B.29 - Write State Register Instructions
+def WRYrr : F3_1<2, 0b110000,
+                 (ops IntRegs:$b, IntRegs:$c),
+                 "wr $b, $c, %y", []>;
+def WRYri : F3_2<2, 0b110000,
+                 (ops IntRegs:$b, i32imm:$c),
+                 "wr $b, $c, %y", []>;
+
+// Convert Integer to Floating-point Instructions, p. 141
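+// (the integer operand is read from an FP register; SPARC V8 has no direct
+// move between the integer and FP register files, so the value reaches the
+// FP registers via memory)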
+def FITOS : F3_3<2, 0b110100, 0b011000100,
+                 (ops FPRegs:$dst, FPRegs:$src),
+                 "fitos $src, $dst",
+                 [(set FPRegs:$dst, (SPitof FPRegs:$src))]>;
+def FITOD : F3_3<2, 0b110100, 0b011001000, 
+                 (ops DFPRegs:$dst, FPRegs:$src),
+                 "fitod $src, $dst",
+                 [(set DFPRegs:$dst, (SPitof FPRegs:$src))]>;
+
+// Convert Floating-point to Integer Instructions, p. 142
+def FSTOI : F3_3<2, 0b110100, 0b011010001,
+                 (ops FPRegs:$dst, FPRegs:$src),
+                 "fstoi $src, $dst",
+                 [(set FPRegs:$dst, (SPftoi FPRegs:$src))]>;
+def FDTOI : F3_3<2, 0b110100, 0b011010010,
+                 (ops FPRegs:$dst, DFPRegs:$src),
+                 "fdtoi $src, $dst",
+                 [(set FPRegs:$dst, (SPftoi DFPRegs:$src))]>;
+
+// Convert between Floating-point Formats Instructions, p. 143
+def FSTOD : F3_3<2, 0b110100, 0b011001001, 
+                 (ops DFPRegs:$dst, FPRegs:$src),
+                 "fstod $src, $dst",
+                 [(set DFPRegs:$dst, (fextend FPRegs:$src))]>;
+def FDTOS : F3_3<2, 0b110100, 0b011000110,
+                 (ops FPRegs:$dst, DFPRegs:$src),
+                 "fdtos $src, $dst",
+                 [(set FPRegs:$dst, (fround DFPRegs:$src))]>;
+
+// Floating-point Move Instructions, p. 144
+def FMOVS : F3_3<2, 0b110100, 0b000000001,
+                 (ops FPRegs:$dst, FPRegs:$src),
+                 "fmovs $src, $dst", []>;
+def FNEGS : F3_3<2, 0b110100, 0b000000101, 
+                 (ops FPRegs:$dst, FPRegs:$src),
+                 "fnegs $src, $dst",
+                 [(set FPRegs:$dst, (fneg FPRegs:$src))]>;
+def FABSS : F3_3<2, 0b110100, 0b000001001, 
+                 (ops FPRegs:$dst, FPRegs:$src),
+                 "fabss $src, $dst",
+                 [(set FPRegs:$dst, (fabs FPRegs:$src))]>;
+
+
+// Floating-point Square Root Instructions, p.145
+def FSQRTS : F3_3<2, 0b110100, 0b000101001, 
+                  (ops FPRegs:$dst, FPRegs:$src),
+                  "fsqrts $src, $dst",
+                  [(set FPRegs:$dst, (fsqrt FPRegs:$src))]>;
+def FSQRTD : F3_3<2, 0b110100, 0b000101010, 
+                  (ops DFPRegs:$dst, DFPRegs:$src),
+                  "fsqrtd $src, $dst",
+                  [(set DFPRegs:$dst, (fsqrt DFPRegs:$src))]>;
+
+
+
+// Floating-point Add and Subtract Instructions, p. 146
+def FADDS  : F3_3<2, 0b110100, 0b001000001,
+                  (ops FPRegs:$dst, FPRegs:$src1, FPRegs:$src2),
+                  "fadds $src1, $src2, $dst",
+                  [(set FPRegs:$dst, (fadd FPRegs:$src1, FPRegs:$src2))]>;
+def FADDD  : F3_3<2, 0b110100, 0b001000010,
+                  (ops DFPRegs:$dst, DFPRegs:$src1, DFPRegs:$src2),
+                  "faddd $src1, $src2, $dst",
+                  [(set DFPRegs:$dst, (fadd DFPRegs:$src1, DFPRegs:$src2))]>;
+def FSUBS  : F3_3<2, 0b110100, 0b001000101,
+                  (ops FPRegs:$dst, FPRegs:$src1, FPRegs:$src2),
+                  "fsubs $src1, $src2, $dst",
+                  [(set FPRegs:$dst, (fsub FPRegs:$src1, FPRegs:$src2))]>;
+def FSUBD  : F3_3<2, 0b110100, 0b001000110,
+                  (ops DFPRegs:$dst, DFPRegs:$src1, DFPRegs:$src2),
+                  "fsubd $src1, $src2, $dst",
+                  [(set DFPRegs:$dst, (fsub DFPRegs:$src1, DFPRegs:$src2))]>;
+
+// Floating-point Multiply and Divide Instructions, p. 147
+def FMULS  : F3_3<2, 0b110100, 0b001001001,
+                  (ops FPRegs:$dst, FPRegs:$src1, FPRegs:$src2),
+                  "fmuls $src1, $src2, $dst",
+                  [(set FPRegs:$dst, (fmul FPRegs:$src1, FPRegs:$src2))]>;
+def FMULD  : F3_3<2, 0b110100, 0b001001010,
+                  (ops DFPRegs:$dst, DFPRegs:$src1, DFPRegs:$src2),
+                  "fmuld $src1, $src2, $dst",
+                  [(set DFPRegs:$dst, (fmul DFPRegs:$src1, DFPRegs:$src2))]>;
+def FSMULD : F3_3<2, 0b110100, 0b001101001,
+                  (ops DFPRegs:$dst, FPRegs:$src1, FPRegs:$src2),
+                  "fsmuld $src1, $src2, $dst",
+                  [(set DFPRegs:$dst, (fmul (fextend FPRegs:$src1),
+                                            (fextend FPRegs:$src2)))]>;
+def FDIVS  : F3_3<2, 0b110100, 0b001001101,
+                 (ops FPRegs:$dst, FPRegs:$src1, FPRegs:$src2),
+                 "fdivs $src1, $src2, $dst",
+                 [(set FPRegs:$dst, (fdiv FPRegs:$src1, FPRegs:$src2))]>;
+def FDIVD  : F3_3<2, 0b110100, 0b001001110,
+                 (ops DFPRegs:$dst, DFPRegs:$src1, DFPRegs:$src2),
+                 "fdivd $src1, $src2, $dst",
+                 [(set DFPRegs:$dst, (fdiv DFPRegs:$src1, DFPRegs:$src2))]>;
+
+// Floating-point Compare Instructions, p. 148
+// Note: the second template argument is different for these instructions.
+// Note 2: the result of an FCMP is not available until the second cycle
+// after the instruction retires, but there is no interlock.  This behavior
+// is modeled by forcing a nop after the instruction.
+def FCMPS  : F3_3<2, 0b110101, 0b001010001,
+                  (ops FPRegs:$src1, FPRegs:$src2),
+                  "fcmps $src1, $src2\n\tnop",
+                  [(SPcmpfcc FPRegs:$src1, FPRegs:$src2)]>;
+def FCMPD  : F3_3<2, 0b110101, 0b001010010,
+                  (ops DFPRegs:$src1, DFPRegs:$src2),
+                  "fcmpd $src1, $src2\n\tnop",
+                  [(SPcmpfcc DFPRegs:$src1, DFPRegs:$src2)]>;
+
+
+//===----------------------------------------------------------------------===//
+// V9 Instructions
+//===----------------------------------------------------------------------===//
+
+// V9 Conditional Moves.
+let Predicates = [HasV9], isTwoAddress = 1 in {
+  // Move Integer Register on Condition (MOVcc) p. 194 of the V9 manual.
+  // FIXME: Add instruction encodings for the JIT some day.
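+  // Because isTwoAddress ties $dst to $T, the move writes $F into $dst only
+  // when the condition holds, so the select patterns below list their
+  // operands in (false, true) order.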
+  def MOVICCrr
+    : Pseudo<(ops IntRegs:$dst, IntRegs:$T, IntRegs:$F, CCOp:$cc),
+             "mov$cc %icc, $F, $dst",
+             [(set IntRegs:$dst,
+                         (SPselecticc IntRegs:$F, IntRegs:$T, imm:$cc))]>;
+  def MOVICCri
+    : Pseudo<(ops IntRegs:$dst, IntRegs:$T, i32imm:$F, CCOp:$cc),
+             "mov$cc %icc, $F, $dst",
+             [(set IntRegs:$dst,
+                          (SPselecticc simm11:$F, IntRegs:$T, imm:$cc))]>;
+
+  def MOVFCCrr
+    : Pseudo<(ops IntRegs:$dst, IntRegs:$T, IntRegs:$F, CCOp:$cc),
+             "mov$cc %fcc0, $F, $dst",
+             [(set IntRegs:$dst,
+                         (SPselectfcc IntRegs:$F, IntRegs:$T, imm:$cc))]>;
+  def MOVFCCri
+    : Pseudo<(ops IntRegs:$dst, IntRegs:$T, i32imm:$F, CCOp:$cc),
+             "mov$cc %fcc0, $F, $dst",
+             [(set IntRegs:$dst,
+                          (SPselectfcc simm11:$F, IntRegs:$T, imm:$cc))]>;
+
+  def FMOVS_ICC
+    : Pseudo<(ops FPRegs:$dst, FPRegs:$T, FPRegs:$F, CCOp:$cc),
+             "fmovs$cc %icc, $F, $dst",
+             [(set FPRegs:$dst,
+                         (SPselecticc FPRegs:$F, FPRegs:$T, imm:$cc))]>;
+  def FMOVD_ICC
+    : Pseudo<(ops DFPRegs:$dst, DFPRegs:$T, DFPRegs:$F, CCOp:$cc),
+             "fmovd$cc %icc, $F, $dst",
+             [(set DFPRegs:$dst,
+                         (SPselecticc DFPRegs:$F, DFPRegs:$T, imm:$cc))]>;
+  def FMOVS_FCC
+    : Pseudo<(ops FPRegs:$dst, FPRegs:$T, FPRegs:$F, CCOp:$cc),
+             "fmovs$cc %fcc0, $F, $dst",
+             [(set FPRegs:$dst,
+                         (SPselectfcc FPRegs:$F, FPRegs:$T, imm:$cc))]>;
+  def FMOVD_FCC
+    : Pseudo<(ops DFPRegs:$dst, DFPRegs:$T, DFPRegs:$F, CCOp:$cc),
+             "fmovd$cc %fcc0, $F, $dst",
+             [(set DFPRegs:$dst,
+                         (SPselectfcc DFPRegs:$F, DFPRegs:$T, imm:$cc))]>;
+
+}
+
+// Floating-Point Move Instructions, p. 164 of the V9 manual.
+let Predicates = [HasV9] in {
+  def FMOVD : F3_3<2, 0b110100, 0b000000010,
+                   (ops DFPRegs:$dst, DFPRegs:$src),
+                   "fmovd $src, $dst", []>;
+  def FNEGD : F3_3<2, 0b110100, 0b000000110, 
+                   (ops DFPRegs:$dst, DFPRegs:$src),
+                   "fnegd $src, $dst",
+                   [(set DFPRegs:$dst, (fneg DFPRegs:$src))]>;
+  def FABSD : F3_3<2, 0b110100, 0b000001010, 
+                   (ops DFPRegs:$dst, DFPRegs:$src),
+                   "fabsd $src, $dst",
+                   [(set DFPRegs:$dst, (fabs DFPRegs:$src))]>;
+}
+
+// POPCrr - This does a ctpop of a 64-bit register.  As such, we have to clear
+// the top 32 bits of the input before using it; the clearing is done with an
+// SLLri X, 0.
+def POPCrr : F3_1<2, 0b101110, 
+                  (ops IntRegs:$dst, IntRegs:$src),
+                  "popc $src, $dst", []>, Requires<[HasV9]>;
+def : Pat<(ctpop IntRegs:$src),
+          (POPCrr (SLLri IntRegs:$src, 0))>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Small immediates.
+def : Pat<(i32 simm13:$val),
+          (ORri G0, imm:$val)>;
+// Arbitrary immediates.
+def : Pat<(i32 imm:$val),
+          (ORri (SETHIi (HI22 imm:$val)), (LO10 imm:$val))>;
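+// (HI22 gives the sethi the constant's top 22 bits; LO10 gives the or its
+// low 10 bits)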
+
+// subc
+def : Pat<(subc IntRegs:$b, IntRegs:$c),
+          (SUBCCrr IntRegs:$b, IntRegs:$c)>;
+def : Pat<(subc IntRegs:$b, simm13:$val),
+          (SUBCCri IntRegs:$b, imm:$val)>;
+
+// Global addresses, constant pool entries
+def : Pat<(SPhi tglobaladdr:$in), (SETHIi tglobaladdr:$in)>;
+def : Pat<(SPlo tglobaladdr:$in), (ORri G0, tglobaladdr:$in)>;
+def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>;
+def : Pat<(SPlo tconstpool:$in), (ORri G0, tconstpool:$in)>;
+
+// Add reg, lo.  This is used when taking the addr of a global/constpool entry.
+def : Pat<(add IntRegs:$r, (SPlo tglobaladdr:$in)),
+          (ADDri IntRegs:$r, tglobaladdr:$in)>;
+def : Pat<(add IntRegs:$r, (SPlo tconstpool:$in)),
+          (ADDri IntRegs:$r, tconstpool:$in)>;
+
+// Calls: 
+def : Pat<(call tglobaladdr:$dst),
+          (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst),
+          (CALL texternalsym:$dst)>;
+
+def : Pat<(ret), (RETL)>;
+
+// Map integer extload's to zextloads.
+def : Pat<(i32 (extloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (extloadi8 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi8 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (extloadi16 ADDRrr:$src)), (LDUHrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi16 ADDRri:$src)), (LDUHri ADDRri:$src)>;
+
+// zextload bool -> zextload byte
+def : Pat<(i32 (zextloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (zextloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+
+// truncstore bool -> truncstore byte.
+def : Pat<(truncstorei1 IntRegs:$src, ADDRrr:$addr),
+          (STBrr ADDRrr:$addr, IntRegs:$src)>;
+def : Pat<(truncstorei1 IntRegs:$src, ADDRri:$addr), 
+          (STBri ADDRri:$addr, IntRegs:$src)>;
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
new file mode 100644
index 0000000..1981b4f
--- /dev/null
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -0,0 +1,276 @@
+//===- SparcRegisterInfo.cpp - SPARC Register Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SPARC implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "SparcRegisterInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st,
+                                     const TargetInstrInfo &tii)
+  : SparcGenRegisterInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP),
+    Subtarget(st), TII(tii) {
+}
+
+void SparcRegisterInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, int FI,
+                    const TargetRegisterClass *RC) const {
+  // On the order of operands here: think "[FrameIdx + 0] = SrcReg".
+  if (RC == SP::IntRegsRegisterClass)
+    BuildMI(MBB, I, TII.get(SP::STri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, false, false, true);
+  else if (RC == SP::FPRegsRegisterClass)
+    BuildMI(MBB, I, TII.get(SP::STFri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, false, false, true);
+  else if (RC == SP::DFPRegsRegisterClass)
+    BuildMI(MBB, I, TII.get(SP::STDFri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, false, false, true);
+  else
+    assert(0 && "Can't store this register to stack slot");
+}
+
+void SparcRegisterInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC) const {
+  if (RC == SP::IntRegsRegisterClass)
+    BuildMI(MBB, I, TII.get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0);
+  else if (RC == SP::FPRegsRegisterClass)
+    BuildMI(MBB, I, TII.get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0);
+  else if (RC == SP::DFPRegsRegisterClass)
+    BuildMI(MBB, I, TII.get(SP::LDDFri), DestReg).addFrameIndex(FI).addImm(0);
+  else
+    assert(0 && "Can't load this register from stack slot");
+}
+
+void SparcRegisterInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I,
+                                     unsigned DestReg, unsigned SrcReg,
+                                     const TargetRegisterClass *RC) const {
+  if (RC == SP::IntRegsRegisterClass)
+    BuildMI(MBB, I, TII.get(SP::ORrr), DestReg).addReg(SP::G0).addReg(SrcReg);
+  else if (RC == SP::FPRegsRegisterClass)
+    BuildMI(MBB, I, TII.get(SP::FMOVS), DestReg).addReg(SrcReg);
+  else if (RC == SP::DFPRegsRegisterClass)
+    BuildMI(MBB, I, TII.get(Subtarget.isV9() ? SP::FMOVD : SP::FpMOVD),DestReg)
+      .addReg(SrcReg);
+  else
+    assert (0 && "Can't copy this register");
+}
+
+void SparcRegisterInfo::reMaterialize(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator I,
+                                      unsigned DestReg,
+                                      const MachineInstr *Orig) const {
+  MachineInstr *MI = Orig->clone();
+  MI->getOperand(0).setReg(DestReg);
+  MBB.insert(I, MI);
+}
+
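+/// foldMemoryOperand - If operand OpNum of the copy-like instruction MI has
+/// been assigned to stack slot FI, try to rewrite the copy as a direct load
+/// from or store to that slot so the copy can be deleted.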
+MachineInstr *SparcRegisterInfo::foldMemoryOperand(MachineInstr* MI,
+                                                   unsigned OpNum,
+                                                   int FI) const {
+  bool isFloat = false;
+  MachineInstr *NewMI = NULL;
+  switch (MI->getOpcode()) {
+  case SP::ORrr:
+    if (MI->getOperand(1).isRegister() && MI->getOperand(1).getReg() == SP::G0&&
+        MI->getOperand(0).isRegister() && MI->getOperand(2).isRegister()) {
+      if (OpNum == 0)    // COPY -> STORE
+        NewMI = BuildMI(TII.get(SP::STri)).addFrameIndex(FI).addImm(0)
+                                   .addReg(MI->getOperand(2).getReg());
+      else               // COPY -> LOAD
+        NewMI = BuildMI(TII.get(SP::LDri), MI->getOperand(0).getReg())
+                      .addFrameIndex(FI).addImm(0);
+    }
+    break;
+  case SP::FMOVS:
+    isFloat = true;
+    // FALLTHROUGH
+  case SP::FMOVD:
+    if (OpNum == 0)  // COPY -> STORE
+      NewMI = BuildMI(TII.get(isFloat ? SP::STFri : SP::STDFri))
+               .addFrameIndex(FI).addImm(0).addReg(MI->getOperand(1).getReg());
+    else             // COPY -> LOAD
+      NewMI = BuildMI(TII.get(isFloat ? SP::LDFri : SP::LDDFri),
+                     MI->getOperand(0).getReg()).addFrameIndex(FI).addImm(0);
+    break;
+  }
+
+  if (NewMI)
+    NewMI->copyKillDeadInfo(MI);
+  return NewMI;
+}
+
+const unsigned* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+                                                                         const {
+  static const unsigned CalleeSavedRegs[] = { 0 };
+  return CalleeSavedRegs;
+}
+
+BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
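+  // %g0 reads as zero, %o6/%i6 are the stack and frame pointers, %i7 holds
+  // the return address, and the remaining %g registers are reserved for
+  // application or system use (see the comments in SparcRegisterInfo.td).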
+  Reserved.set(SP::G2);
+  Reserved.set(SP::G3);
+  Reserved.set(SP::G4);
+  Reserved.set(SP::O6);
+  Reserved.set(SP::I6);
+  Reserved.set(SP::I7);
+  Reserved.set(SP::G0);
+  Reserved.set(SP::G5);
+  Reserved.set(SP::G6);
+  Reserved.set(SP::G7);
+  return Reserved;
+}
+
+
+const TargetRegisterClass* const*
+SparcRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  static const TargetRegisterClass * const CalleeSavedRegClasses[] = { 0 };
+  return CalleeSavedRegClasses;
+}
+
+bool SparcRegisterInfo::hasFP(const MachineFunction &MF) const {
+  return false;
+}
+
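+/// eliminateCallFramePseudoInstr - Replace the ADJCALLSTACKDOWN/UP pseudos
+/// with an explicit adjustment of %o6 (the stack pointer), or simply erase
+/// them when the adjustment is zero.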
+void SparcRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  MachineInstr &MI = *I;
+  int Size = MI.getOperand(0).getImmedValue();
+  if (MI.getOpcode() == SP::ADJCALLSTACKDOWN)
+    Size = -Size;
+  if (Size)
+    BuildMI(MBB, I, TII.get(SP::ADDri), SP::O6).addReg(SP::O6).addImm(Size);
+  MBB.erase(I);
+}
+
+void SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                            int SPAdj, RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  while (!MI.getOperand(i).isFrameIndex()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  int FrameIndex = MI.getOperand(i).getFrameIndex();
+
+  // Addressable stack objects are accessed using negative offsets from %fp.
+  MachineFunction &MF = *MI.getParent()->getParent();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+               MI.getOperand(i+1).getImmedValue();
+
+  // Replace frame index with a frame pointer reference.
+  if (Offset >= -4096 && Offset <= 4095) {
+    // If the offset is small enough to fit in the immediate field, directly
+    // encode it.
+    MI.getOperand(i).ChangeToRegister(SP::I6, false);
+    MI.getOperand(i+1).ChangeToImmediate(Offset);
+  } else {
+    // Otherwise, emit a G1 = SETHI %hi(offset).  FIXME: it would be better to 
+    // scavenge a register here instead of reserving G1 all of the time.
+    unsigned OffHi = (unsigned)Offset >> 10U;
+    BuildMI(*MI.getParent(), II, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
+    // Emit G1 = G1 + I6
+    BuildMI(*MI.getParent(), II, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
+      .addReg(SP::I6);
+    // Insert: G1+%lo(offset) into the user.
+    MI.getOperand(i).ChangeToRegister(SP::G1, false);
+    MI.getOperand(i+1).ChangeToImmediate(Offset & ((1 << 10)-1));
+  }
+}
+
+void SparcRegisterInfo::
+processFunctionBeforeFrameFinalized(MachineFunction &MF) const {}
+
+void SparcRegisterInfo::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Get the number of bytes to allocate from the FrameInfo
+  int NumBytes = (int) MFI->getStackSize();
+
+  // Emit the correct save instruction based on the number of bytes in
+  // the frame. Minimum stack frame size according to V8 ABI is:
+  //   16 words for register window spill
+  //    1 word for address of returned aggregate-value
+  // +  6 words for passing parameters on the stack
+  // ----------
+  //   23 words * 4 bytes per word = 92 bytes
+  NumBytes += 92;
+  // Round up to next doubleword boundary -- a double-word boundary
+  // is required by the ABI.
+  NumBytes = (NumBytes + 7) & ~7;
+  NumBytes = -NumBytes;
+  
+  if (NumBytes >= -4096) {
+    BuildMI(MBB, MBB.begin(), TII.get(SP::SAVEri),
+            SP::O6).addImm(NumBytes).addReg(SP::O6);
+  } else {
+    MachineBasicBlock::iterator InsertPt = MBB.begin();
+    // Emit this the hard way.  This clobbers G1 which we always know is 
+    // available here.
+    unsigned OffHi = (unsigned)NumBytes >> 10U;
+    BuildMI(MBB, InsertPt, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
+    // OR in the low 10 bits of the frame size: G1 = G1 | (NumBytes & 0x3ff).
+    BuildMI(MBB, InsertPt, TII.get(SP::ORri), SP::G1)
+      .addReg(SP::G1).addImm(NumBytes & ((1 << 10)-1));
+    BuildMI(MBB, InsertPt, TII.get(SP::SAVErr), SP::O6)
+      .addReg(SP::O6).addReg(SP::G1);
+  }
+}
+
+void SparcRegisterInfo::emitEpilogue(MachineFunction &MF,
+                                     MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  assert(MBBI->getOpcode() == SP::RETL &&
+         "Can only put epilog before 'retl' instruction!");
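+  // "restore %g0, %g0, %g0" just restores the caller's register window.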
+  BuildMI(MBB, MBBI, TII.get(SP::RESTORErr), SP::G0).addReg(SP::G0)
+    .addReg(SP::G0);
+}
+
+unsigned SparcRegisterInfo::getRARegister() const {
+  assert(0 && "What is the return address register");
+  return 0;
+}
+
+unsigned SparcRegisterInfo::getFrameRegister(MachineFunction &MF) const {
+  assert(0 && "What is the frame register");
+  return SP::G1;
+}
+
+unsigned SparcRegisterInfo::getEHExceptionRegister() const {
+  assert(0 && "What is the exception register");
+  return 0;
+}
+
+unsigned SparcRegisterInfo::getEHHandlerRegister() const {
+  assert(0 && "What is the exception handler register");
+  return 0;
+}
+
+#include "SparcGenRegisterInfo.inc"
+
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
new file mode 100644
index 0000000..451964b
--- /dev/null
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -0,0 +1,86 @@
+//===- SparcRegisterInfo.h - Sparc Register Information Impl ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCREGISTERINFO_H
+#define SPARCREGISTERINFO_H
+
+#include "llvm/Target/MRegisterInfo.h"
+#include "SparcGenRegisterInfo.h.inc"
+
+namespace llvm {
+
+class SparcSubtarget;
+class TargetInstrInfo;
+class Type;
+
+struct SparcRegisterInfo : public SparcGenRegisterInfo {
+  SparcSubtarget &Subtarget;
+  const TargetInstrInfo &TII;
+  
+  SparcRegisterInfo(SparcSubtarget &st, const TargetInstrInfo &tii);
+
+  /// Code Generation virtual methods...
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, int FrameIndex,
+                           const TargetRegisterClass *RC) const;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC) const;
+
+  void copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                    unsigned DestReg, unsigned SrcReg,
+                    const TargetRegisterClass *RC) const;
+  
+  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                     unsigned DestReg, const MachineInstr *Orig) const;
+
+  virtual MachineInstr* foldMemoryOperand(MachineInstr* MI,
+                                          unsigned OpNum,
+                                          int FrameIndex) const;
+
+  const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+  const TargetRegisterClass* const* getCalleeSavedRegClasses(
+                                     const MachineFunction *MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, RegScavenger *RS = NULL) const;
+
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(MachineFunction &MF) const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
new file mode 100644
index 0000000..8e2f444
--- /dev/null
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -0,0 +1,158 @@
+//===- SparcRegisterInfo.td - Sparc Register defs ----------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the Sparc register file 
+//===----------------------------------------------------------------------===//
+
+class SparcReg<string n> : Register<n> {
+  field bits<5> Num;
+  let Namespace = "SP";
+}
+
+// Registers are identified with 5-bit ID numbers.
+// Ri - 32-bit integer registers
+class Ri<bits<5> num, string n> : SparcReg<n> {
+  let Num = num;
+}
+// Rf - 32-bit floating-point registers
+class Rf<bits<5> num, string n> : SparcReg<n> {
+  let Num = num;
+}
+// Rd - Slots in the FP register file for 64-bit floating-point values.
+class Rd<bits<5> num, string n, list<Register> subregs> : SparcReg<n> {
+  let Num = num;
+  let SubRegs = subregs;
+}
+
+// Integer registers
+def G0 : Ri< 0, "G0">, DwarfRegNum<0>;
+def G1 : Ri< 1, "G1">, DwarfRegNum<1>;
+def G2 : Ri< 2, "G2">, DwarfRegNum<2>; 
+def G3 : Ri< 3, "G3">, DwarfRegNum<3>;
+def G4 : Ri< 4, "G4">, DwarfRegNum<4>;
+def G5 : Ri< 5, "G5">, DwarfRegNum<5>; 
+def G6 : Ri< 6, "G6">, DwarfRegNum<6>;
+def G7 : Ri< 7, "G7">, DwarfRegNum<7>;
+def O0 : Ri< 8, "O0">, DwarfRegNum<8>;
+def O1 : Ri< 9, "O1">, DwarfRegNum<9>;
+def O2 : Ri<10, "O2">, DwarfRegNum<10>; 
+def O3 : Ri<11, "O3">, DwarfRegNum<11>;
+def O4 : Ri<12, "O4">, DwarfRegNum<12>;
+def O5 : Ri<13, "O5">, DwarfRegNum<13>; 
+def O6 : Ri<14, "O6">, DwarfRegNum<14>;
+def O7 : Ri<15, "O7">, DwarfRegNum<15>;
+def L0 : Ri<16, "L0">, DwarfRegNum<16>;
+def L1 : Ri<17, "L1">, DwarfRegNum<17>;
+def L2 : Ri<18, "L2">, DwarfRegNum<18>; 
+def L3 : Ri<19, "L3">, DwarfRegNum<19>;
+def L4 : Ri<20, "L4">, DwarfRegNum<20>;
+def L5 : Ri<21, "L5">, DwarfRegNum<21>; 
+def L6 : Ri<22, "L6">, DwarfRegNum<22>;
+def L7 : Ri<23, "L7">, DwarfRegNum<23>;
+def I0 : Ri<24, "I0">, DwarfRegNum<24>;
+def I1 : Ri<25, "I1">, DwarfRegNum<25>;
+def I2 : Ri<26, "I2">, DwarfRegNum<26>; 
+def I3 : Ri<27, "I3">, DwarfRegNum<27>;
+def I4 : Ri<28, "I4">, DwarfRegNum<28>;
+def I5 : Ri<29, "I5">, DwarfRegNum<29>; 
+def I6 : Ri<30, "I6">, DwarfRegNum<30>;
+def I7 : Ri<31, "I7">, DwarfRegNum<31>;
+
+// Floating-point registers
+def F0  : Rf< 0,  "F0">, DwarfRegNum<32>;
+def F1  : Rf< 1,  "F1">, DwarfRegNum<33>;
+def F2  : Rf< 2,  "F2">, DwarfRegNum<34>; 
+def F3  : Rf< 3,  "F3">, DwarfRegNum<35>;
+def F4  : Rf< 4,  "F4">, DwarfRegNum<36>;
+def F5  : Rf< 5,  "F5">, DwarfRegNum<37>; 
+def F6  : Rf< 6,  "F6">, DwarfRegNum<38>;
+def F7  : Rf< 7,  "F7">, DwarfRegNum<39>;
+def F8  : Rf< 8,  "F8">, DwarfRegNum<40>; 
+def F9  : Rf< 9,  "F9">, DwarfRegNum<41>;
+def F10 : Rf<10, "F10">, DwarfRegNum<42>;
+def F11 : Rf<11, "F11">, DwarfRegNum<43>; 
+def F12 : Rf<12, "F12">, DwarfRegNum<44>;
+def F13 : Rf<13, "F13">, DwarfRegNum<45>;
+def F14 : Rf<14, "F14">, DwarfRegNum<46>; 
+def F15 : Rf<15, "F15">, DwarfRegNum<47>;
+def F16 : Rf<16, "F16">, DwarfRegNum<48>;
+def F17 : Rf<17, "F17">, DwarfRegNum<49>; 
+def F18 : Rf<18, "F18">, DwarfRegNum<50>;
+def F19 : Rf<19, "F19">, DwarfRegNum<51>;
+def F20 : Rf<20, "F20">, DwarfRegNum<52>; 
+def F21 : Rf<21, "F21">, DwarfRegNum<53>;
+def F22 : Rf<22, "F22">, DwarfRegNum<54>;
+def F23 : Rf<23, "F23">, DwarfRegNum<55>;
+def F24 : Rf<24, "F24">, DwarfRegNum<56>;
+def F25 : Rf<25, "F25">, DwarfRegNum<57>;
+def F26 : Rf<26, "F26">, DwarfRegNum<58>; 
+def F27 : Rf<27, "F27">, DwarfRegNum<59>;
+def F28 : Rf<28, "F28">, DwarfRegNum<60>;
+def F29 : Rf<29, "F29">, DwarfRegNum<61>; 
+def F30 : Rf<30, "F30">, DwarfRegNum<62>;
+def F31 : Rf<31, "F31">, DwarfRegNum<63>;
+
+// Aliases of the F* registers used to hold 64-bit fp values (doubles)
+def D0  : Rd< 0,  "F0", [F0,   F1]>, DwarfRegNum<32>;
+def D1  : Rd< 2,  "F2", [F2,   F3]>, DwarfRegNum<34>; 
+def D2  : Rd< 4,  "F4", [F4,   F5]>, DwarfRegNum<36>;
+def D3  : Rd< 6,  "F6", [F6,   F7]>, DwarfRegNum<38>; 
+def D4  : Rd< 8,  "F8", [F8,   F9]>, DwarfRegNum<40>;
+def D5  : Rd<10, "F10", [F10, F11]>, DwarfRegNum<42>;
+def D6  : Rd<12, "F12", [F12, F13]>, DwarfRegNum<44>;
+def D7  : Rd<14, "F14", [F14, F15]>, DwarfRegNum<46>; 
+def D8  : Rd<16, "F16", [F16, F17]>, DwarfRegNum<48>;
+def D9  : Rd<18, "F18", [F18, F19]>, DwarfRegNum<50>; 
+def D10 : Rd<20, "F20", [F20, F21]>, DwarfRegNum<52>;
+def D11 : Rd<22, "F22", [F22, F23]>, DwarfRegNum<54>;
+def D12 : Rd<24, "F24", [F24, F25]>, DwarfRegNum<56>;
+def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<58>; 
+def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<60>;
+def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<62>;
+
+// Register classes.
+//
+// FIXME: the register order should be defined in terms of the preferred
+// allocation order...
+//
+def IntRegs : RegisterClass<"SP", [i32], 32, [L0, L1, L2, L3, L4, L5, L6, L7,
+                                     I0, I1, I2, I3, I4, I5,
+                                     O0, O1, O2, O3, O4, O5, O7,
+
+   // FIXME: G1 reserved for now for large imm generation by frame code.
+                                     G1,
+                                     // Non-allocatable regs:
+                                     G2, G3, G4, // FIXME: OK for use only in
+                                                 // applications, not libraries.
+                                     O6, // stack ptr
+                                     I6, // frame ptr
+                                     I7, // return address
+                                     G0, // constant zero
+                                     G5, G6, G7 // reserved for kernel
+                                     ]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    IntRegsClass::iterator
+    IntRegsClass::allocation_order_end(const MachineFunction &MF) const {
+      // FIXME: These special regs should be taken out of the regclass!
+      return end()-10  // Don't allocate special registers
+         -1;  // FIXME: G1 reserved for large imm generation by frame code.
+    }
+  }];
+}
+
+def FPRegs : RegisterClass<"SP", [f32], 32, [F0, F1, F2, F3, F4, F5, F6, F7, F8,
+  F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21, F22,
+  F23, F24, F25, F26, F27, F28, F29, F30, F31]>;
+
+def DFPRegs : RegisterClass<"SP", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7,
+  D8, D9, D10, D11, D12, D13, D14, D15]>;
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
new file mode 100644
index 0000000..9940fcf
--- /dev/null
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -0,0 +1,43 @@
+//===- SparcSubtarget.cpp - SPARC Subtarget Information -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SPARC specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcSubtarget.h"
+#include "SparcGenSubtarget.inc"
+using namespace llvm;
+
+// FIXME: temporary.
+#include "llvm/Support/CommandLine.h"
+namespace {
+  cl::opt<bool> EnableV9("enable-sparc-v9-insts", cl::Hidden,
+                          cl::desc("Enable V9 instructions in the V8 target"));
+}
+
+SparcSubtarget::SparcSubtarget(const Module &M, const std::string &FS) {
+  // Set the default features.
+  IsV9 = false;
+  V8DeprecatedInsts = false;
+  IsVIS = false;
+  
+  // Determine default and user specified characteristics
+  std::string CPU = "generic";
+
+  // FIXME: autodetect host here!
+  CPU = "v9";   // What is a good way to detect V9?
+  
+  // Parse features string.
+  ParseSubtargetFeatures(FS, CPU);
+
+  // Unless explicitly enabled, disable the V9 instructions.
+  if (!EnableV9)
+    IsV9 = false;
+}
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
new file mode 100644
index 0000000..59da95a
--- /dev/null
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -0,0 +1,42 @@
+//=====-- SparcSubtarget.h - Define Subtarget for the SPARC ----*- C++ -*-====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SPARC specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARC_SUBTARGET_H
+#define SPARC_SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+#include <string>
+
+namespace llvm {
+  class Module;
+  
+class SparcSubtarget : public TargetSubtarget {
+  bool IsV9;
+  bool V8DeprecatedInsts;
+  bool IsVIS;
+public:
+  SparcSubtarget(const Module &M, const std::string &FS);
+
+  bool isV9() const { return IsV9; }
+  bool isVIS() const { return IsVIS; }
+  bool useDeprecatedV8Instructions() const { return V8DeprecatedInsts; }
+  
+  /// ParseSubtargetFeatures - Parses features string setting specified 
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(const std::string &FS, const std::string &CPU);
+  
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Sparc/SparcTargetAsmInfo.cpp b/lib/Target/Sparc/SparcTargetAsmInfo.cpp
new file mode 100644
index 0000000..01f7f11
--- /dev/null
+++ b/lib/Target/Sparc/SparcTargetAsmInfo.cpp
@@ -0,0 +1,25 @@
+//===-- SparcTargetAsmInfo.cpp - Sparc asm properties -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the SparcTargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetAsmInfo.h"
+
+using namespace llvm;
+
+SparcTargetAsmInfo::SparcTargetAsmInfo(const SparcTargetMachine &TM) {
+  Data16bitsDirective = "\t.half\t";
+  Data32bitsDirective = "\t.word\t";
+  Data64bitsDirective = 0;  // .xword is only supported by V9.
+  ZeroDirective = "\t.skip\t";
+  CommentString = "!";
+  ConstantPoolSection = "\t.section \".rodata\",#alloc\n";
+}
diff --git a/lib/Target/Sparc/SparcTargetAsmInfo.h b/lib/Target/Sparc/SparcTargetAsmInfo.h
new file mode 100644
index 0000000..6b2dc59
--- /dev/null
+++ b/lib/Target/Sparc/SparcTargetAsmInfo.h
@@ -0,0 +1,31 @@
+//=====-- SparcTargetAsmInfo.h - Sparc asm properties ---------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SparcTargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCTARGETASMINFO_H
+#define SPARCTARGETASMINFO_H
+
+#include "llvm/Target/TargetAsmInfo.h"
+
+namespace llvm {
+
+  // Forward declaration.
+  class SparcTargetMachine;
+
+  struct SparcTargetAsmInfo : public TargetAsmInfo {
+    SparcTargetAsmInfo(const SparcTargetMachine &TM);
+  };
+
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
new file mode 100644
index 0000000..e0206d8
--- /dev/null
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -0,0 +1,83 @@
+//===-- SparcTargetMachine.cpp - Define TargetMachine for Sparc -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetAsmInfo.h"
+#include "SparcTargetMachine.h"
+#include "Sparc.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+using namespace llvm;
+
+namespace {
+  // Register the target.
+  RegisterTarget<SparcTargetMachine> X("sparc", "  SPARC");
+}
+
+const TargetAsmInfo *SparcTargetMachine::createTargetAsmInfo() const {
+  return new SparcTargetAsmInfo(*this);
+}
+
+/// SparcTargetMachine ctor - Create an ILP32 architecture model
+///
+SparcTargetMachine::SparcTargetMachine(const Module &M, const std::string &FS)
+  : DataLayout("E-p:32:32"),
+    Subtarget(M, FS), InstrInfo(Subtarget),
+    FrameInfo(TargetFrameInfo::StackGrowsDown, 8, 0) {
+}
+
+unsigned SparcTargetMachine::getModuleMatchQuality(const Module &M) {
+  std::string TT = M.getTargetTriple();
+  if (TT.size() >= 6 && std::string(TT.begin(), TT.begin()+6) == "sparc-")
+    return 20;
+  
+  // If the target triple is something non-sparc, we don't match.
+  if (!TT.empty()) return 0;
+
+  if (M.getEndianness()  == Module::BigEndian &&
+      M.getPointerSize() == Module::Pointer32)
+#ifdef __sparc__
+    return 20;   // BE/32 ==> Prefer sparc on sparc
+#else
+    return 5;    // BE/32 ==> only a weak match; ppc is preferred on non-sparc hosts
+#endif
+  else if (M.getEndianness() != Module::AnyEndianness ||
+           M.getPointerSize() != Module::AnyPointerSize)
+    return 0;                                    // Match for some other target
+
+#if defined(__sparc__)
+  return 10;
+#else
+  return 0;
+#endif
+}
+
+bool SparcTargetMachine::addInstSelector(FunctionPassManager &PM, bool Fast) {
+  PM.add(createSparcISelDag(*this));
+  return false;
+}
+
+/// addPreEmitPass - This pass may be implemented by targets that want to run
+/// passes immediately before machine code is emitted.  This should return
+/// true if -print-machineinstrs should print out the code after the passes.
+bool SparcTargetMachine::addPreEmitPass(FunctionPassManager &PM, bool Fast) {
+  PM.add(createSparcFPMoverPass(*this));
+  PM.add(createSparcDelaySlotFillerPass(*this));
+  return true;
+}
+
+bool SparcTargetMachine::addAssemblyEmitter(FunctionPassManager &PM, bool Fast, 
+                                            std::ostream &Out) {
+  // Output assembly language.
+  PM.add(createSparcCodePrinterPass(Out, *this));
+  return false;
+}
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
new file mode 100644
index 0000000..cec0a44
--- /dev/null
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -0,0 +1,57 @@
+//===-- SparcTargetMachine.h - Define TargetMachine for Sparc ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Sparc specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCTARGETMACHINE_H
+#define SPARCTARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "SparcInstrInfo.h"
+#include "SparcSubtarget.h"
+
+namespace llvm {
+
+class Module;
+
+class SparcTargetMachine : public LLVMTargetMachine {
+  const TargetData DataLayout;       // Calculates type size & alignment
+  SparcSubtarget Subtarget;
+  SparcInstrInfo InstrInfo;
+  TargetFrameInfo FrameInfo;
+  
+protected:
+  virtual const TargetAsmInfo *createTargetAsmInfo() const;
+  
+public:
+  SparcTargetMachine(const Module &M, const std::string &FS);
+
+  virtual const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  virtual const TargetFrameInfo  *getFrameInfo() const { return &FrameInfo; }
+  virtual const TargetSubtarget *getSubtargetImpl() const { return &Subtarget; }
+  virtual const MRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  virtual const TargetData       *getTargetData() const { return &DataLayout; }
+  static unsigned getModuleMatchQuality(const Module &M);
+
+  // Pass Pipeline Configuration
+  virtual bool addInstSelector(FunctionPassManager &PM, bool Fast);
+  virtual bool addPreEmitPass(FunctionPassManager &PM, bool Fast);
+  virtual bool addAssemblyEmitter(FunctionPassManager &PM, bool Fast, 
+                                  std::ostream &Out);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/SubtargetFeature.cpp b/lib/Target/SubtargetFeature.cpp
new file mode 100644
index 0000000..d783f8b
--- /dev/null
+++ b/lib/Target/SubtargetFeature.cpp
@@ -0,0 +1,357 @@
+//===- SubtargetFeature.cpp - CPU characteristics Implementation ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the 
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SubtargetFeature interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/SubtargetFeature.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Streams.h"
+#include <algorithm>
+#include <ostream>
+#include <cassert>
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                          Static Helper Functions
+//===----------------------------------------------------------------------===//
+
+/// hasFlag - Determine if a feature has a flag; '+' or '-'
+///
+static inline bool hasFlag(const std::string &Feature) {
+  assert(!Feature.empty() && "Empty string");
+  // Get first character
+  char Ch = Feature[0];
+  // Check if first character is '+' or '-' flag
+  return Ch == '+' || Ch =='-';
+}
+
+/// StripFlag - Return string stripped of flag.
+///
+static inline std::string StripFlag(const std::string &Feature) {
+  return hasFlag(Feature) ? Feature.substr(1) : Feature;
+}
+
+/// isEnabled - Return true if enable flag; '+'.
+///
+static inline bool isEnabled(const std::string &Feature) {
+  assert(!Feature.empty() && "Empty string");
+  // Get first character
+  char Ch = Feature[0];
+  // Check if first character is '+' for enabled
+  return Ch == '+';
+}
+
+/// PrependFlag - Return a string with a prepended flag; '+' or '-'.
+///
+static inline std::string PrependFlag(const std::string &Feature,
+                                      bool IsEnabled) {
+  assert(!Feature.empty() && "Empty string");
+  if (hasFlag(Feature)) return Feature;
+  return std::string(IsEnabled ? "+" : "-") + Feature;
+}
+
+/// Split - Splits a string of comma-separated items into a vector of strings.
+///
+static void Split(std::vector<std::string> &V, const std::string &S) {
+  // Start at beginning of string.
+  size_t Pos = 0;
+  while (true) {
+    // Find the next comma
+    size_t Comma = S.find(',', Pos);
+    // If no comma is found then the rest of the string is used
+    if (Comma == std::string::npos) {
+      // Add string to vector
+      V.push_back(S.substr(Pos));
+      break;
+    }
+    // Otherwise add substring to vector
+    V.push_back(S.substr(Pos, Comma - Pos));
+    // Advance to next item
+    Pos = Comma + 1;
+  }
+}
+
+/// Join - Join a vector of strings into a single comma-separated string.
+///
+static std::string Join(const std::vector<std::string> &V) {
+  // Start with empty string.
+  std::string Result;
+  // If the vector is not empty 
+  if (!V.empty()) {
+    // Start with the CPU feature
+    Result = V[0];
+    // For each successive feature
+    for (size_t i = 1; i < V.size(); i++) {
+      // Add a comma
+      Result += ",";
+      // Add the feature
+      Result += V[i];
+    }
+  }
+  // Return the features string 
+  return Result;
+}
+
+/// AddFeature - Add a feature to the list, lowercasing it and prepending the
+/// enable/disable flag if it is missing.
+void SubtargetFeatures::AddFeature(const std::string &String,
+                                   bool IsEnabled) {
+  // Don't add empty features
+  if (!String.empty()) {
+    // Convert to lowercase, prepend flag and add to vector
+    Features.push_back(PrependFlag(LowercaseString(String), IsEnabled));
+  }
+}
+
+/// Find KV in array using binary search.
+template<typename T> const T *Find(const std::string &S, const T *A, size_t L) {
+  // Make the lower bound element we're looking for
+  T KV;
+  KV.Key = S.c_str();
+  // Determine the end of the array
+  const T *Hi = A + L;
+  // Binary search the array
+  const T *F = std::lower_bound(A, Hi, KV);
+  // If not found then return NULL
+  if (F == Hi || std::string(F->Key) != S) return NULL;
+  // Return the found array item
+  return F;
+}
+
+/// getLongestEntryLength - Return the length of the longest entry in the table.
+///
+static size_t getLongestEntryLength(const SubtargetFeatureKV *Table,
+                                    size_t Size) {
+  size_t MaxLen = 0;
+  for (size_t i = 0; i < Size; i++)
+    MaxLen = std::max(MaxLen, std::strlen(Table[i].Key));
+  return MaxLen;
+}
+
+/// Display help for feature choices.
+///
+static void Help(const SubtargetFeatureKV *CPUTable, size_t CPUTableSize,
+                 const SubtargetFeatureKV *FeatTable, size_t FeatTableSize) {
+  // Determine the length of the longest CPU and Feature entries.
+  unsigned MaxCPULen  = getLongestEntryLength(CPUTable, CPUTableSize);
+  unsigned MaxFeatLen = getLongestEntryLength(FeatTable, FeatTableSize);
+
+  // Print the CPU table.
+  cerr << "Available CPUs for this target:\n\n";
+  for (size_t i = 0; i != CPUTableSize; i++)
+    cerr << "  " << CPUTable[i].Key
+         << std::string(MaxCPULen - std::strlen(CPUTable[i].Key), ' ')
+         << " - " << CPUTable[i].Desc << ".\n";
+  cerr << "\n";
+  
+  // Print the Feature table.
+  cerr << "Available features for this target:\n\n";
+  for (size_t i = 0; i != FeatTableSize; i++)
+    cerr << "  " << FeatTable[i].Key
+         << std::string(MaxFeatLen - std::strlen(FeatTable[i].Key), ' ')
+         << " - " << FeatTable[i].Desc << ".\n";
+  cerr << "\n";
+  
+  cerr << "Use +feature to enable a feature, or -feature to disable it.\n"
+       << "For example, llc -mcpu=mycpu -mattr=+feature1,-feature2\n";
+  exit(1);
+}
+
+//===----------------------------------------------------------------------===//
+//                    SubtargetFeatures Implementation
+//===----------------------------------------------------------------------===//
+
+SubtargetFeatures::SubtargetFeatures(const std::string &Initial) {
+  // Break up string into separate features
+  Split(Features, Initial);
+}
+
+
+std::string SubtargetFeatures::getString() const {
+  return Join(Features);
+}
+void SubtargetFeatures::setString(const std::string &Initial) {
+  // Throw out old features
+  Features.clear();
+  // Break up string into separate features
+  Split(Features, LowercaseString(Initial));
+}
+
+
+/// setCPU - Set the CPU string.  Replaces previous setting.  Setting to "" 
+/// clears CPU.
+void SubtargetFeatures::setCPU(const std::string &String) {
+  Features[0] = LowercaseString(String);
+}
+
+
+/// setCPUIfNone - Set the CPU string only if no string is currently set.
+///
+void SubtargetFeatures::setCPUIfNone(const std::string &String) {
+  if (Features[0].empty()) setCPU(String);
+}
+
+/// SetImpliedBits - For each feature that is (transitively) implied by this
+/// feature, set it.
+/// 
+static
+void SetImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry,
+                    const SubtargetFeatureKV *FeatureTable,
+                    size_t FeatureTableSize) {
+  for (size_t i = 0; i < FeatureTableSize; ++i) {
+    const SubtargetFeatureKV &FE = FeatureTable[i];
+
+    if (FeatureEntry->Value == FE.Value) continue;
+
+    if (FeatureEntry->Implies & FE.Value) {
+      Bits |= FE.Value;
+      SetImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize);
+    }
+  }
+}
+
+/// ClearImpliedBits - For each feature that (transitively) implies this
+/// feature, clear it.
+/// 
+static
+void ClearImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry,
+                      const SubtargetFeatureKV *FeatureTable,
+                      size_t FeatureTableSize) {
+  for (size_t i = 0; i < FeatureTableSize; ++i) {
+    const SubtargetFeatureKV &FE = FeatureTable[i];
+
+    if (FeatureEntry->Value == FE.Value) continue;
+
+    if (FE.Implies & FeatureEntry->Value) {
+      Bits &= ~FE.Value;
+      ClearImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize);
+    }
+  }
+}
+
+/// getBits - Get feature bits.
+///
+uint32_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable,
+                                          size_t CPUTableSize,
+                                    const SubtargetFeatureKV *FeatureTable,
+                                          size_t FeatureTableSize) {
+  assert(CPUTable && "missing CPU table");
+  assert(FeatureTable && "missing features table");
+#ifndef NDEBUG
+  for (size_t i = 1; i < CPUTableSize; i++) {
+    assert(strcmp(CPUTable[i - 1].Key, CPUTable[i].Key) < 0 &&
+           "CPU table is not sorted");
+  }
+  for (size_t i = 1; i < FeatureTableSize; i++) {
+    assert(strcmp(FeatureTable[i - 1].Key, FeatureTable[i].Key) < 0 &&
+          "CPU features table is not sorted");
+  }
+#endif
+  uint32_t Bits = 0;                    // Resulting bits
+
+  // Check if help is needed
+  if (Features[0] == "help")
+    Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize);
+  
+  // Find CPU entry
+  const SubtargetFeatureKV *CPUEntry =
+                            Find(Features[0], CPUTable, CPUTableSize);
+  // If there is a match
+  if (CPUEntry) {
+    // Set base feature bits
+    Bits = CPUEntry->Value;
+
+    // Set the features implied by this CPU, if any.
+    for (size_t i = 0; i < FeatureTableSize; ++i) {
+      const SubtargetFeatureKV &FE = FeatureTable[i];
+      if (CPUEntry->Value & FE.Value)
+        SetImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize);
+    }
+  } else {
+    cerr << "'" << Features[0]
+         << "' is not a recognized processor for this target"
+         << " (ignoring processor)"
+         << "\n";
+  }
+  // Iterate through each feature
+  for (size_t i = 1; i < Features.size(); i++) {
+    const std::string &Feature = Features[i];
+    
+    // Check for help
+    if (Feature == "+help")
+      Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize);
+    
+    // Find feature in table.
+    const SubtargetFeatureKV *FeatureEntry =
+                       Find(StripFlag(Feature), FeatureTable, FeatureTableSize);
+    // If there is a match
+    if (FeatureEntry) {
+      // Enable/disable feature in bits
+      if (isEnabled(Feature)) {
+        Bits |=  FeatureEntry->Value;
+
+        // For each feature that this implies, set it.
+        SetImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize);
+      } else {
+        Bits &= ~FeatureEntry->Value;
+
+        // For each feature that implies this, clear it.
+        ClearImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize);
+      }
+    } else {
+      cerr << "'" << Feature
+           << "' is not a recognized feature for this target"
+           << " (ignoring feature)"
+           << "\n";
+    }
+  }
+
+  return Bits;
+}
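+
+// Illustrative only (hypothetical tables, not part of this patch): if CPU
+// "mycpu" has base bits FeatureA, and FeatureB implies FeatureA, then the
+// string "mycpu,+featb,-feata" resolves as:
+//   Bits  = FeatureA                 // from the CPU entry
+//   Bits |= FeatureB | FeatureA      // +featb, plus its implied bits
+//   Bits &= ~(FeatureA | FeatureB)   // -feata also clears FeatureB, which
+//                                    // implies it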
+
+/// getInfo - Return the info pointer for the current CPU entry.
+void *SubtargetFeatures::getInfo(const SubtargetInfoKV *Table,
+                                       size_t TableSize) {
+  assert(Table && "missing table");
+#ifndef NDEBUG
+  for (size_t i = 1; i < TableSize; i++) {
+    assert(strcmp(Table[i - 1].Key, Table[i].Key) < 0 && "Table is not sorted");
+  }
+#endif
+
+  // Find entry
+  const SubtargetInfoKV *Entry = Find(Features[0], Table, TableSize);
+  
+  if (Entry) {
+    return Entry->Value;
+  } else {
+    cerr << "'" << Features[0]
+         << "' is not a recognized processor for this target"
+         << " (ignoring processor)"
+         << "\n";
+    return NULL;
+  }
+}
+
+/// print - Print feature string.
+///
+void SubtargetFeatures::print(std::ostream &OS) const {
+  for (size_t i = 0; i < Features.size(); i++) {
+    OS << Features[i] << "  ";
+  }
+  OS << "\n";
+}
+
+/// dump - Dump feature info.
+///
+void SubtargetFeatures::dump() const {
+  print(*cerr.stream());
+}
diff --git a/lib/Target/Target.td b/lib/Target/Target.td
new file mode 100644
index 0000000..48af218
--- /dev/null
+++ b/lib/Target/Target.td
@@ -0,0 +1,413 @@
+//===- Target.td - Target Independent TableGen interface ---*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file defines the target-independent interfaces which should be
+// implemented by each target which is using a TableGen based code generator.
+//
+//===----------------------------------------------------------------------===//
+
+// Include all information about LLVM intrinsics.
+include "llvm/Intrinsics.td"
+
+//===----------------------------------------------------------------------===//
+// Register file description - These classes are used to fill in the target
+// description classes.
+
+class RegisterClass; // Forward def
+
+// Register - You should define one instance of this class for each register
+// in the target machine.  String n will become the "name" of the register.
+class Register<string n> {
+  string Namespace = "";
+  string Name = n;
+
+  // SpillSize - If this value is set to a non-zero value, it is the size in
+  // bits of the spill slot required to hold this register.  If this value is
+  // set to zero, the information is inferred from any register classes the
+  // register belongs to.
+  int SpillSize = 0;
+
+  // SpillAlignment - This value is used to specify the alignment required for
+  // spilling the register.  Like SpillSize, this should only be explicitly
+  // specified if the register is not in a register class.
+  int SpillAlignment = 0;
+
+  // Aliases - A list of registers that this register overlaps with.  A read or
+  // modification of this register can potentially read or modify the aliased
+  // registers.
+  list<Register> Aliases = [];
+  
+  // SubRegs - A list of registers that are parts of this register. Note these
+  // are "immediate" sub-registers and the registers within the list do not
+  // themselves overlap. e.g. For X86, EAX's SubRegs list contains only [AX],
+  // not [AX, AH, AL].
+  list<Register> SubRegs = [];
+
+  // DwarfNumber - Number used internally by gcc/gdb to identify the register.
+  // These values can be determined by locating the <target>.h file in the
+  // directory llvmgcc/gcc/config/<target>/ and looking for REGISTER_NAMES.  The
+  // order of these names corresponds to the enumeration used by gcc.  A value of
+  // -1 indicates that the gcc number is undefined.
+  int DwarfNumber = -1;
+}
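+
+// For illustration only (hypothetical target, not defined in this file), a
+// register is typically declared as:
+//   def R0 : Register<"r0"> { let Namespace = "MyTarget"; }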
+
+// RegisterWithSubRegs - This can be used to define instances of Register which
+// need to specify sub-registers.
+// List "subregs" specifies which registers are sub-registers to this one. This
+// is used to populate the SubRegs and AliasSet fields of TargetRegisterDesc.
+// This allows the code generator to be careful not to put two values with 
+// overlapping live ranges into registers which alias.
+class RegisterWithSubRegs<string n, list<Register> subregs> : Register<n> {
+  let SubRegs = subregs;
+}
+
+// SubRegSet - This can be used to define a specific mapping of registers to
+// indices, for use as named subregs of a particular physical register.  Each
+// register in 'subregs' becomes an addressable subregister at index 'n' of the
+// corresponding register in 'regs'.
+class SubRegSet<int n, list<Register> regs, list<Register> subregs> {
+  int index = n;
+  
+  list<Register> From = regs;
+  list<Register> To = subregs;
+}
+
+// RegisterClass - Now that all of the registers are defined, and aliases
+// between registers are defined, specify which registers belong to which
+// register classes.  This also defines the default allocation order of
+// registers by register allocators.
+//
+class RegisterClass<string namespace, list<ValueType> regTypes, int alignment,
+                    list<Register> regList> {
+  string Namespace = namespace;
+
+  // RegType - Specify the list ValueType of the registers in this register
+  // class.  Note that all registers in a register class must have the same
+  // ValueTypes.  This is a list because some targets permit storing different 
+  // types in the same register, for example vector values with 128-bit total size,
+  // but different count/size of items, like SSE on x86.
+  //
+  list<ValueType> RegTypes = regTypes;
+
+  // Size - Specify the spill size in bits of the registers.  A default value of
+  // zero lets tablegen pick an appropriate size.
+  int Size = 0;
+
+  // Alignment - Specify the alignment required of the registers when they are
+  // stored or loaded to memory.
+  //
+  int Alignment = alignment;
+
+  // MemberList - Specify which registers are in this class.  If the
+  // allocation_order_* methods are not specified, this also defines the order of
+  // allocation used by the register allocator.
+  //
+  list<Register> MemberList = regList;
+  
+  // SubClassList - Specify which register classes correspond to subregisters
+  // of this class. The order should be by subregister set index.
+  list<RegisterClass> SubRegClassList = [];
+
+  // MethodProtos/MethodBodies - These members can be used to insert arbitrary
+  // code into a generated register class.   The normal usage of this is to 
+  // overload virtual methods.
+  code MethodProtos = [{}];
+  code MethodBodies = [{}];
+}
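+
+// For illustration only (hypothetical target): a 32-bit integer register
+// class with 32-bit spill alignment over R0-R3 might be written as:
+//   def GPR : RegisterClass<"MyTarget", [i32], 32, [R0, R1, R2, R3]>;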
+
+
+//===----------------------------------------------------------------------===//
+// DwarfRegNum - This class provides a mapping of the llvm register enumeration
+// to the register numbering used by gcc and gdb.  These values are used by a
+// debug information writer (ex. DwarfWriter) to describe where values may be
+// located during execution.
+class DwarfRegNum<int N> {
+  // DwarfNumber - Number used internally by gcc/gdb to identify the register.
+  // These values can be determined by locating the <target>.h file in the
+  // directory llvmgcc/gcc/config/<target>/ and looking for REGISTER_NAMES.  The
+  // order of these names corresponds to the enumeration used by gcc.  A value of
+  // -1 indicates that the gcc number is undefined.
+  int DwarfNumber = N;
+}
+
+//===----------------------------------------------------------------------===//
+// Pull in the common support for scheduling
+//
+include "TargetSchedule.td"
+
+class Predicate; // Forward def
+
+//===----------------------------------------------------------------------===//
+// Instruction set description - These classes correspond to the C++ classes in
+// the Target/TargetInstrInfo.h file.
+//
+class Instruction {
+  string Name = "";         // The opcode string for this instruction
+  string Namespace = "";
+
+  dag OperandList;          // A dag containing the MI operand list.
+  string AsmString = "";    // The .s format to print the instruction with.
+
+  // Pattern - Set to the DAG pattern for this instruction, if we know of one,
+  // otherwise, uninitialized.
+  list<dag> Pattern;
+
+  // The following state will eventually be inferred automatically from the
+  // instruction pattern.
+
+  list<Register> Uses = []; // Default to using no non-operand registers
+  list<Register> Defs = []; // Default to modifying no non-operand registers
+
+  // Predicates - List of predicates which will be turned into isel matching
+  // code.
+  list<Predicate> Predicates = [];
+
+  // Code size.
+  int CodeSize = 0;
+
+  // Added complexity passed onto matching pattern.
+  int AddedComplexity  = 0;
+
+  // These bits capture information about the high-level semantics of the
+  // instruction.
+  bit isReturn     = 0;     // Is this instruction a return instruction?
+  bit isBranch     = 0;     // Is this instruction a branch instruction?
+  bit isBarrier    = 0;     // Can control flow fall through this instruction?
+  bit isCall       = 0;     // Is this instruction a call instruction?
+  bit isLoad       = 0;     // Is this instruction a load instruction?
+  bit isStore      = 0;     // Is this instruction a store instruction?
+  bit isTwoAddress = 0;     // Is this a two address instruction?
+  bit isConvertibleToThreeAddress = 0;  // Can this 2-addr instruction promote?
+  bit isCommutable = 0;     // Is this 3 operand instruction commutable?
+  bit isTerminator = 0;     // Is this part of the terminator for a basic block?
+  bit isReMaterializable = 0; // Is this instruction re-materializable?
+  bit isPredicable = 0;     // Is this instruction predicable?
+  bit hasDelaySlot = 0;     // Does this instruction have a delay slot?
+  bit usesCustomDAGSchedInserter = 0; // Pseudo instr needing special help.
+  bit hasCtrlDep   = 0;     // Does this instruction r/w ctrl-flow chains?
+  bit noResults    = 0;     // Does this instruction produce no results?
+  bit isNotDuplicable = 0;  // Is it unsafe to duplicate this instruction?
+  
+  InstrItinClass Itinerary = NoItinerary; // Execution steps used for scheduling.
+
+  string Constraints = "";  // OperandConstraint, e.g. $src = $dst.
+  
+  /// DisableEncoding - List of operand names (e.g. "$op1,$op2") that should not
+  /// be encoded into the output machineinstr.
+  string DisableEncoding = "";
+}
+
+/// Imp - Helper class for specifying the implicit uses/defs set for an
+/// instruction.
+class Imp<list<Register> uses, list<Register> defs> {
+  list<Register> Uses = uses;
+  list<Register> Defs = defs;
+}
+
+/// Predicates - These are extra conditionals which are turned into instruction
+/// selector matching code. Currently each predicate is just a string.
+class Predicate<string cond> {
+  string CondString = cond;
+}
+
+/// NoHonorSignDependentRounding - This predicate is true if support for
+/// sign-dependent-rounding is not enabled.
+def NoHonorSignDependentRounding
+ : Predicate<"!HonorSignDependentRoundingFPMath()">;
+
+class Requires<list<Predicate> preds> {
+  list<Predicate> Predicates = preds;
+}
+
+/// ops definition - This is just a simple marker used to identify the operands
+/// list for an instruction.  This should be used like this:
+///     (ops R32:$dst, R32:$src) or something similar.
+def ops;
+
+/// variable_ops definition - Mark this instruction as taking a variable number
+/// of operands.
+def variable_ops;
+
+/// ptr_rc definition - Mark this operand as being a pointer value whose
+/// register class is resolved dynamically via a callback to TargetInstrInfo.
+/// FIXME: We should probably change this to a class which contain a list of
+/// flags. But currently we have but one flag.
+def ptr_rc;
+
+/// Operand Types - These provide the built-in operand types that may be used
+/// by a target.  Targets can optionally provide their own operand types as
+/// needed, though this should not be needed for RISC targets.
+class Operand<ValueType ty> {
+  ValueType Type = ty;
+  string PrintMethod = "printOperand";
+  dag MIOperandInfo = (ops);
+}
+
+def i1imm  : Operand<i1>;
+def i8imm  : Operand<i8>;
+def i16imm : Operand<i16>;
+def i32imm : Operand<i32>;
+def i64imm : Operand<i64>;
+
+/// zero_reg definition - Special node to stand for the zero register.
+///
+def zero_reg;
+
+/// PredicateOperand - This can be used to define a predicate operand for an
+/// instruction.  OpTypes specifies the MIOperandInfo for the operand, and
+/// AlwaysVal specifies the value of this predicate when set to "always
+/// execute".
+class PredicateOperand<ValueType ty, dag OpTypes, dag AlwaysVal>
+  : Operand<ty> {
+  let MIOperandInfo = OpTypes;
+  dag DefaultOps = AlwaysVal;
+}
+
+/// OptionalDefOperand - This is used to define an optional definition operand
+/// for an instruction. DefaultOps is the register the operand represents if none
+/// is supplied, e.g. zero_reg.
+class OptionalDefOperand<ValueType ty, dag OpTypes, dag defaultops>
+  : Operand<ty> {
+  let MIOperandInfo = OpTypes;
+  dag DefaultOps = defaultops;
+}
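+
+// For illustration only (hypothetical operands, loosely modeled on a
+// conditional-execution predicate and an optional status-register def):
+//   def pred   : PredicateOperand<i32, (ops i32imm), (ops (i32 14))>;
+//   def cc_out : OptionalDefOperand<i32, (ops CCR), (ops (i32 zero_reg))>;
+// where CCR is assumed to be a register class defined by the target.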
+
+
+// InstrInfo - This class should only be instantiated once to provide parameters
+// which are global to the target machine.
+//
+class InstrInfo {
+  // If the target wants to associate some target-specific information with each
+  // instruction, it should provide these two lists to indicate how to assemble
+  // the target specific information into the 32 bits available.
+  //
+  list<string> TSFlagsFields = [];
+  list<int>    TSFlagsShifts = [];
+
+  // Targets can specify instructions in either big- or little-endian formats.
+  // For instance, while both Sparc and PowerPC are big-endian platforms, the
+  // Sparc manual specifies its instructions in the format [31..0] (big), while
+  // PowerPC specifies them using the format [0..31] (little).
+  bit isLittleEndianEncoding = 0;
+}
+
+// Standard Instructions.
+def PHI : Instruction {
+  let OperandList = (ops variable_ops);
+  let AsmString = "PHINODE";
+  let Namespace = "TargetInstrInfo";
+}
+def INLINEASM : Instruction {
+  let OperandList = (ops variable_ops);
+  let AsmString = "";
+  let Namespace = "TargetInstrInfo";
+}
+def LABEL : Instruction {
+  let OperandList = (ops i32imm:$id);
+  let AsmString = "";
+  let Namespace = "TargetInstrInfo";
+  let hasCtrlDep = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// AsmWriter - This class can be implemented by targets that need to customize
+// the format of the .s file writer.
+//
+// Subtargets can have multiple different asmwriters (e.g. AT&T vs Intel syntax
+// on X86 for example).
+//
+class AsmWriter {
+  // AsmWriterClassName - This specifies the suffix to use for the asmwriter
+  // class.  Generated AsmWriter classes are always prefixed with the target
+  // name.
+  string AsmWriterClassName  = "AsmPrinter";
+
+  // InstFormatName - AsmWriters can specify the name of the format string to
+  // print instructions with.
+  string InstFormatName = "AsmString";
+
+  // Variant - AsmWriters can be of multiple different variants.  Variants are
+  // used to support targets that need to emit assembly code in ways that are
+  // mostly the same for different targets, but have minor differences in
+  // syntax.  If the asmstring contains {|} characters, this integer
+  // will specify which alternative to use.  For example "{x|y|z}" with Variant
+  // == 1, will expand to "y".
+  int Variant = 0;
+}
+def DefaultAsmWriter : AsmWriter;
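+
+// For illustration only (hypothetical variant): a target with a second
+// assembly syntax could add
+//   def MyIntelStyleWriter : AsmWriter { let Variant = 1; }
+// and list both writers in its Target's AssemblyWriters field.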
+
+
+//===----------------------------------------------------------------------===//
+// Target - This class contains the "global" target information
+//
+class Target {
+  // InstructionSet - Instruction set description for this target.
+  InstrInfo InstructionSet;
+
+  // AssemblyWriters - The AsmWriter instances available for this target.
+  list<AsmWriter> AssemblyWriters = [DefaultAsmWriter];
+}
+
+//===----------------------------------------------------------------------===//
+// SubtargetFeature - A characteristic of the chip set.
+//
+class SubtargetFeature<string n, string a,  string v, string d,
+                       list<SubtargetFeature> i = []> {
+  // Name - Feature name.  Used by command line (-mattr=) to determine the
+  // appropriate target chip.
+  //
+  string Name = n;
+  
+  // Attribute - Attribute to be set by feature.
+  //
+  string Attribute = a;
+  
+  // Value - The value the attribute is set to by this feature.
+  //
+  string Value = v;
+  
+  // Desc - Feature description.  Used by command line (-mattr=) to display help
+  // information.
+  //
+  string Desc = d;
+
+  // Implies - Features that this feature implies are present. If one of those
+  // features isn't set, then this one shouldn't be set either.
+  //
+  list<SubtargetFeature> Implies = i;
+}
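+
+// For illustration only (hypothetical feature, not defined here): a target
+// would typically instantiate this as
+//   def FeatureFP : SubtargetFeature<"fp", "HasFP", "true",
+//                                    "Enable floating point unit">;
+// so that "-mattr=+fp" sets the subtarget's HasFP attribute to true.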
+
+//===----------------------------------------------------------------------===//
+// Processor chip sets - These values represent each of the chip sets supported
+// by the scheduler.  Each Processor definition requires corresponding
+// instruction itineraries.
+//
+class Processor<string n, ProcessorItineraries pi, list<SubtargetFeature> f> {
+  // Name - Chip set name.  Used by command line (-mcpu=) to determine the
+  // appropriate target chip.
+  //
+  string Name = n;
+  
+  // ProcItin - The scheduling information for the target processor.
+  //
+  ProcessorItineraries ProcItin = pi;
+  
+  // Features - List of features this processor supports.
+  list<SubtargetFeature> Features = f;
+}
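+
+// For illustration only (hypothetical chip, reusing the hypothetical
+// FeatureFP above): a processor is usually declared with an anonymous def:
+//   def : Processor<"mychip", MyChipItineraries, [FeatureFP]>;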
+
+//===----------------------------------------------------------------------===//
+// Pull in the common support for calling conventions.
+//
+include "TargetCallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Pull in the common support for DAG isel generation.
+//
+include "TargetSelectionDAG.td"
diff --git a/lib/Target/TargetAsmInfo.cpp b/lib/Target/TargetAsmInfo.cpp
new file mode 100644
index 0000000..df7a2ec
--- /dev/null
+++ b/lib/Target/TargetAsmInfo.cpp
@@ -0,0 +1,131 @@
+//===-- TargetAsmInfo.cpp - Asm Info ---------------------------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines target asm properties related to what form asm statements
+// should take.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetAsmInfo.h"
+#include <cctype>
+#include <cstring>
+
+using namespace llvm;
+
+TargetAsmInfo::TargetAsmInfo() :
+  TextSection("\t.text"),
+  DataSection("\t.data"),
+  BSSSection("\t.bss"),
+  TLSDataSection("\t.section .tdata,\"awT\",@progbits"),
+  TLSBSSSection("\t.section .tbss,\"awT\",@nobits"),
+  ZeroFillDirective(0),
+  AddressSize(4),
+  NeedsSet(false),
+  MaxInstLength(4),
+  PCSymbol("$"),
+  SeparatorChar(';'),
+  CommentString("#"),
+  GlobalPrefix(""),
+  PrivateGlobalPrefix("."),
+  JumpTableSpecialLabelPrefix(0),
+  GlobalVarAddrPrefix(""),
+  GlobalVarAddrSuffix(""),
+  FunctionAddrPrefix(""),
+  FunctionAddrSuffix(""),
+  InlineAsmStart("#APP"),
+  InlineAsmEnd("#NO_APP"),
+  AssemblerDialect(0),
+  ZeroDirective("\t.zero\t"),
+  ZeroDirectiveSuffix(0),
+  AsciiDirective("\t.ascii\t"),
+  AscizDirective("\t.asciz\t"),
+  Data8bitsDirective("\t.byte\t"),
+  Data16bitsDirective("\t.short\t"),
+  Data32bitsDirective("\t.long\t"),
+  Data64bitsDirective("\t.quad\t"),
+  AlignDirective("\t.align\t"),
+  AlignmentIsInBytes(true),
+  SwitchToSectionDirective("\t.section\t"),
+  TextSectionStartSuffix(""),
+  DataSectionStartSuffix(""),
+  SectionEndDirectiveSuffix(0),
+  ConstantPoolSection("\t.section .rodata"),
+  JumpTableDataSection("\t.section .rodata"),
+  JumpTableDirective(0),
+  CStringSection(0),
+  StaticCtorsSection("\t.section .ctors,\"aw\",@progbits"),
+  StaticDtorsSection("\t.section .dtors,\"aw\",@progbits"),
+  FourByteConstantSection(0),
+  EightByteConstantSection(0),
+  SixteenByteConstantSection(0),
+  ReadOnlySection(0),
+  GlobalDirective(0),
+  SetDirective(0),
+  LCOMMDirective(0),
+  COMMDirective("\t.comm\t"),
+  COMMDirectiveTakesAlignment(true),
+  HasDotTypeDotSizeDirective(true),
+  UsedDirective(0),
+  WeakRefDirective(0),
+  HiddenDirective("\t.hidden\t"),
+  ProtectedDirective("\t.protected\t"),
+  AbsoluteDebugSectionOffsets(false),
+  AbsoluteEHSectionOffsets(false),
+  HasLEB128(false),
+  HasDotLoc(false),
+  HasDotFile(false),
+  SupportsDebugInformation(false),
+  SupportsExceptionHandling(false),
+  DwarfRequiresFrameSection(true),
+  DwarfSectionOffsetDirective(0),
+  DwarfAbbrevSection(".debug_abbrev"),
+  DwarfInfoSection(".debug_info"),
+  DwarfLineSection(".debug_line"),
+  DwarfFrameSection(".debug_frame"),
+  DwarfPubNamesSection(".debug_pubnames"),
+  DwarfPubTypesSection(".debug_pubtypes"),
+  DwarfStrSection(".debug_str"),
+  DwarfLocSection(".debug_loc"),
+  DwarfARangesSection(".debug_aranges"),
+  DwarfRangesSection(".debug_ranges"),
+  DwarfMacInfoSection(".debug_macinfo"),
+  DwarfEHFrameSection(".eh_frame"),
+  DwarfExceptionSection(".gcc_except_table"),
+  AsmTransCBE(0) {
+}
+
+TargetAsmInfo::~TargetAsmInfo() {
+}
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorChar or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorChar or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overloaded in the target code to do that.
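+///
+/// Illustrative only: with this class's defaults (MaxInstLength = 4,
+/// CommentString = "#"), the string "mov r0, r1\n# a comment\nadd r0, r0, #1"
+/// counts as two instructions, so the returned length is 8.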
+unsigned TargetAsmInfo::getInlineAsmLength(const char *Str) const {
+  // Count the number of instructions in the asm.
+  bool atInsnStart = true;
+  unsigned Length = 0;
+  for (; *Str; ++Str) {
+    if (*Str == '\n' || *Str == SeparatorChar)
+      atInsnStart = true;
+    // Comments do not count as instructions; check for one before counting,
+    // since a comment may begin where an instruction would otherwise start.
+    if (atInsnStart && strncmp(Str, CommentString, strlen(CommentString)) == 0)
+      atInsnStart = false;
+    if (atInsnStart && !isspace(*Str)) {
+      Length += MaxInstLength;
+      atInsnStart = false;
+    }
+  }
+
+  return Length;
+}
+
diff --git a/lib/Target/TargetCallingConv.td b/lib/Target/TargetCallingConv.td
new file mode 100644
index 0000000..9419320
--- /dev/null
+++ b/lib/Target/TargetCallingConv.td
@@ -0,0 +1,88 @@
+//===- TargetCallingConv.td - Target Calling Conventions ---*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file defines the target-independent interfaces with which targets
+// describe their calling conventions.
+//
+//===----------------------------------------------------------------------===//
+
+class CCAction;
+class CallingConv;
+
+/// CCPredicateAction - Instances of this class check some predicate, then
+/// delegate to another action if the predicate is true.
+class CCPredicateAction<CCAction A> : CCAction {
+  CCAction SubAction = A;
+}
+
+/// CCIfType - If the current argument is one of the specified types, apply
+/// Action A.
+class CCIfType<list<ValueType> vts, CCAction A> : CCPredicateAction<A> {
+  list<ValueType> VTs = vts;
+}
+
+/// CCIf - If the predicate matches, apply A.
+class CCIf<string predicate, CCAction A> : CCPredicateAction<A> {
+  string Predicate = predicate;
+}
+
+/// CCIfStruct - If the current argument is a struct, apply
+/// Action A.
+class CCIfStruct<CCAction A> : CCIf<"ArgFlags & ISD::ParamFlags::ByVal", A> {
+}
+
+/// CCIfCC - Match if the current calling convention is 'CC'.
+class CCIfCC<string CC, CCAction A>
+  : CCIf<!strconcat("State.getCallingConv() == ", CC), A> {}
+
+/// CCIfInReg - If this argument is marked with the 'inreg' attribute, apply
+/// the specified action.
+class CCIfInReg<CCAction A> : CCIf<"ArgFlags & ISD::ParamFlags::InReg", A> {}
+
+/// CCIfNotVarArg - If the current function is not vararg, apply the action.
+class CCIfNotVarArg<CCAction A> : CCIf<"!State.isVarArg()", A> {}
+
+/// CCAssignToReg - This action matches if there is a register in the specified
+/// list that is still available.  If so, it assigns the value to the first
+/// available register and succeeds.
+class CCAssignToReg<list<Register> regList> : CCAction {
+  list<Register> RegList = regList;
+}
+
+/// CCAssignToStack - This action always matches: it assigns the value to a
+/// stack slot of the specified size and alignment on the stack.
+class CCAssignToStack<int size, int align> : CCAction {
+  int Size = size;
+  int Align = align;
+}
+
+/// CCStructAssign - This action always matches: it will use the C ABI and
+/// the register availability to decide whether to assign to a set of
+/// registers or to a stack slot.
+class CCStructAssign<list<Register> regList> : CCAction {
+  list<Register> RegList = regList;
+}
+
+/// CCPromoteToType - If applied, this promotes the specified current value to
+/// the specified type.
+class CCPromoteToType<ValueType destTy> : CCAction {
+  ValueType DestTy = destTy;
+}
+
+/// CCDelegateTo - This action invokes the specified sub-calling-convention.  It
+/// is successful if the specified CC matches.
+class CCDelegateTo<CallingConv cc> : CCAction {
+  CallingConv CC = cc;
+}
+
+/// CallingConv - An instance of this is used to define each calling convention
+/// that the target supports.
+class CallingConv<list<CCAction> actions> {
+  list<CCAction> Actions = actions;
+}
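+
+// For illustration only (hypothetical convention): a target's return-value
+// convention could be written as
+//   def RetCC_MyTarget : CallingConv<[
+//     CCIfType<[i32], CCAssignToReg<[R0, R1]>>,
+//     CCIfType<[i64], CCAssignToStack<8, 8>>
+//   ]>;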
diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp
new file mode 100644
index 0000000..301e8c1
--- /dev/null
+++ b/lib/Target/TargetData.cpp
@@ -0,0 +1,595 @@
+//===-- TargetData.cpp - Data size & alignment routines --------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines target properties related to datatype size/offset/alignment
+// information.
+//
+// This structure should be created once, filled in if the defaults are not
+// correct and then passed around by const&.  None of the member functions
+// require modification to the object.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetData.h"
+#include "llvm/Module.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Constants.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringExtras.h"
+#include <algorithm>
+#include <cstdlib>
+#include <sstream>
+using namespace llvm;
+
+// Handle the Pass registration stuff necessary to use TargetData's.
+namespace {
+  // Register TargetData as an immutable pass with the pass registry.
+  RegisterPass<TargetData> X("targetdata", "Target Data Layout");
+}
+char TargetData::ID = 0;
+
+//===----------------------------------------------------------------------===//
+// Support for StructLayout
+//===----------------------------------------------------------------------===//
+
+StructLayout::StructLayout(const StructType *ST, const TargetData &TD) {
+  StructAlignment = 0;
+  StructSize = 0;
+  NumElements = ST->getNumElements();
+
+  // Loop over each of the elements, placing them in memory...
+  for (unsigned i = 0, e = NumElements; i != e; ++i) {
+    const Type *Ty = ST->getElementType(i);
+    unsigned TyAlign;
+    uint64_t TySize;
+    TyAlign = (ST->isPacked() ? 1 : TD.getABITypeAlignment(Ty));
+    TySize = TD.getTypeSize(Ty);
+
+    // Add padding if necessary to make the data element aligned properly...
+    if (StructSize % TyAlign != 0)
+      StructSize = (StructSize/TyAlign + 1) * TyAlign;   // Add padding...
+
+    // Keep track of maximum alignment constraint
+    StructAlignment = std::max(TyAlign, StructAlignment);
+
+    MemberOffsets[i] = StructSize;
+    StructSize += TySize;                 // Consume space for this data item
+  }
+
+  // Empty structures have alignment of 1 byte.
+  if (StructAlignment == 0) StructAlignment = 1;
+
+  // Add padding to the end of the struct so that it could be put in an array
+  // and all array elements would be aligned correctly.
+  if (StructSize % StructAlignment != 0)
+    StructSize = (StructSize/StructAlignment + 1) * StructAlignment;
+}
+
+
+/// getElementContainingOffset - Given a valid offset into the structure,
+/// return the structure index that contains it.
+unsigned StructLayout::getElementContainingOffset(uint64_t Offset) const {
+  const uint64_t *SI =
+    std::upper_bound(&MemberOffsets[0], &MemberOffsets[NumElements], Offset);
+  assert(SI != &MemberOffsets[0] && "Offset not in structure type!");
+  --SI;
+  assert(*SI <= Offset && "upper_bound didn't work");
+  assert((SI == &MemberOffsets[0] || *(SI-1) < Offset) &&
+         (SI+1 == &MemberOffsets[NumElements] || *(SI+1) > Offset) &&
+         "Upper bound didn't work!");
+  return SI-&MemberOffsets[0];
+}
+
+//===----------------------------------------------------------------------===//
+// TargetAlignElem, TargetAlign support
+//===----------------------------------------------------------------------===//
+
+TargetAlignElem
+TargetAlignElem::get(AlignTypeEnum align_type, unsigned char abi_align,
+                     unsigned char pref_align, uint32_t bit_width) {
+  TargetAlignElem retval;
+  retval.AlignType = align_type;
+  retval.ABIAlign = abi_align;
+  retval.PrefAlign = pref_align;
+  retval.TypeBitWidth = bit_width;
+  return retval;
+}
+
+bool
+TargetAlignElem::operator==(const TargetAlignElem &rhs) const {
+  return (AlignType == rhs.AlignType
+          && ABIAlign == rhs.ABIAlign
+          && PrefAlign == rhs.PrefAlign
+          && TypeBitWidth == rhs.TypeBitWidth);
+}
+
+std::ostream &
+TargetAlignElem::dump(std::ostream &os) const {
+  return os << AlignType
+            << TypeBitWidth
+            << ":" << (int) (ABIAlign * 8)
+            << ":" << (int) (PrefAlign * 8);
+}
+
+const TargetAlignElem TargetData::InvalidAlignmentElem =
+                TargetAlignElem::get((AlignTypeEnum) -1, 0, 0, 0);
+
+//===----------------------------------------------------------------------===//
+//                       TargetData Class Implementation
+//===----------------------------------------------------------------------===//
+
+/*!
+ A TargetDescription string consists of a sequence of hyphen-delimited
+ specifiers for target endianness, pointer size and alignments, and various
+ primitive type sizes and alignments. A typical string looks something like:
+ <br><br>
+ "E-p:32:32:32-i1:8:8-i8:8:8-i32:32:32-i64:32:64-f32:32:32-f64:32:64"
+ <br><br>
+ (note: this string is not fully specified and is only an example.)
+ \p
+ Alignments come in two flavors: ABI and preferred. ABI alignment (abi_align,
+ below) dictates how a type will be aligned within an aggregate and when used
+ as an argument.  Preferred alignment (pref_align, below) determines a type's
+ alignment when emitted as a global.
+ \p
+ Specifier string details:
+ <br><br>
+ <i>[E|e]</i>: Endianness. "E" specifies a big-endian target data model, "e"
+ specifies a little-endian target data model.
+ <br><br>
+ <i>p:<size>:<abi_align>:<pref_align></i>: Pointer size, ABI and preferred
+ alignment.
+ <br><br>
+ <i><type><size>:<abi_align>:<pref_align></i>: Numeric type alignment. Type is
+ one of <i>i|f|v|a</i>, corresponding to integer, floating point, vector (aka
+ packed) or aggregate.  Size indicates the size, e.g., 32 or 64 bits.
+ \p
+ The default string, fully specified, is:
+ <br><br>
+ "E-p:64:64:64-a0:0:0-f32:32:32-f64:0:64"
+ "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:0:64"
+ "-v64:64:64-v128:128:128"
+ <br><br>
+ Note that in the case of aggregates, 0 is the default ABI and preferred
+ alignment. This is a special case, where the aggregate's computed worst-case
+ alignment will be used.
+ */ 
+void TargetData::init(const std::string &TargetDescription) {
+  std::string temp = TargetDescription;
+  
+  LittleEndian = false;
+  PointerMemSize = 8;
+  PointerABIAlign   = 8;
+  PointerPrefAlign = PointerABIAlign;
+
+  // Default alignments
+  setAlignment(INTEGER_ALIGN,   1,  1, 1);   // Bool
+  setAlignment(INTEGER_ALIGN,   1,  1, 8);   // Byte
+  setAlignment(INTEGER_ALIGN,   2,  2, 16);  // short
+  setAlignment(INTEGER_ALIGN,   4,  4, 32);  // int
+  setAlignment(INTEGER_ALIGN,   4,  8, 64);  // long
+  setAlignment(FLOAT_ALIGN,     4,  4, 32);  // float
+  setAlignment(FLOAT_ALIGN,     8,  8, 64);  // double
+  setAlignment(VECTOR_ALIGN,    8,  8, 64);  // v2i32
+  setAlignment(VECTOR_ALIGN,   16, 16, 128); // v16i8, v8i16, v4i32, ...
+  setAlignment(AGGREGATE_ALIGN, 0,  8,  0);  // struct, union, class, ...
+  
+  while (!temp.empty()) {
+    std::string token = getToken(temp, "-");
+    std::string arg0 = getToken(token, ":");
+    const char *p = arg0.c_str();
+    switch(*p) {
+    case 'E':
+      LittleEndian = false;
+      break;
+    case 'e':
+      LittleEndian = true;
+      break;
+    case 'p':
+      PointerMemSize = atoi(getToken(token,":").c_str()) / 8;
+      PointerABIAlign = atoi(getToken(token,":").c_str()) / 8;
+      PointerPrefAlign = atoi(getToken(token,":").c_str()) / 8;
+      if (PointerPrefAlign == 0)
+        PointerPrefAlign = PointerABIAlign;
+      break;
+    case 'i':
+    case 'v':
+    case 'f':
+    case 'a': {
+      AlignTypeEnum align_type = 
+        (*p == 'i' ? INTEGER_ALIGN : (*p == 'f' ? FLOAT_ALIGN :
+           (*p == 'v' ? VECTOR_ALIGN : AGGREGATE_ALIGN)));
+      uint32_t size = (uint32_t) atoi(++p);
+      unsigned char abi_align = atoi(getToken(token, ":").c_str()) / 8;
+      unsigned char pref_align = atoi(getToken(token, ":").c_str()) / 8;
+      if (pref_align == 0)
+        pref_align = abi_align;
+      setAlignment(align_type, abi_align, pref_align, size);
+      break;
+    }
+    default:
+      break;
+    }
+  }
+}
+
+TargetData::TargetData(const Module *M) 
+  : ImmutablePass((intptr_t)&ID) {
+  init(M->getDataLayout());
+}
+
+void
+TargetData::setAlignment(AlignTypeEnum align_type, unsigned char abi_align,
+                         unsigned char pref_align, uint32_t bit_width) {
+  for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
+    if (Alignments[i].AlignType == align_type &&
+        Alignments[i].TypeBitWidth == bit_width) {
+      // Update the abi, preferred alignments.
+      Alignments[i].ABIAlign = abi_align;
+      Alignments[i].PrefAlign = pref_align;
+      return;
+    }
+  }
+  
+  Alignments.push_back(TargetAlignElem::get(align_type, abi_align,
+                                            pref_align, bit_width));
+}
+
+/// getAlignmentInfo - Return the alignment (either ABI if ABIInfo = true or 
+/// preferred if ABIInfo = false) the target wants for the specified datatype.
+unsigned TargetData::getAlignmentInfo(AlignTypeEnum AlignType, 
+                                      uint32_t BitWidth, bool ABIInfo) const {
+  // Check to see if we have an exact match and remember the best match we see.
+  int BestMatchIdx = -1;
+  int LargestInt = -1;
+  for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
+    if (Alignments[i].AlignType == AlignType &&
+        Alignments[i].TypeBitWidth == BitWidth)
+      return ABIInfo ? Alignments[i].ABIAlign : Alignments[i].PrefAlign;
+    
+    // The best match so far depends on what we're looking for.
+    if (AlignType == VECTOR_ALIGN) {
+      // If this is a specification for a smaller vector type, we will fall back
+      // to it.  This happens because <128 x double> can be implemented in terms
+      // of 64 <2 x double>.
+      if (Alignments[i].AlignType == VECTOR_ALIGN && 
+          Alignments[i].TypeBitWidth < BitWidth) {
+        // Verify that we pick the biggest of the fallbacks.
+        if (BestMatchIdx == -1 ||
+            Alignments[BestMatchIdx].TypeBitWidth < Alignments[i].TypeBitWidth)
+          BestMatchIdx = i;
+      }
+    } else if (AlignType == INTEGER_ALIGN && 
+               Alignments[i].AlignType == INTEGER_ALIGN) {
+      // The "best match" for integers is the smallest size that is larger than
+      // the BitWidth requested.
+      if (Alignments[i].TypeBitWidth > BitWidth && (BestMatchIdx == -1 || 
+           Alignments[i].TypeBitWidth < Alignments[BestMatchIdx].TypeBitWidth))
+        BestMatchIdx = i;
+      // However, if there isn't one that's larger, then we must use the
+      // largest one we have (see below)
+      if (LargestInt == -1 || 
+          Alignments[i].TypeBitWidth > Alignments[LargestInt].TypeBitWidth)
+        LargestInt = i;
+    }
+  }
+
+  // For integers, if we didn't find a best match, use the largest one found.
+  if (BestMatchIdx == -1)
+    BestMatchIdx = LargestInt;
+
+  // Okay, we didn't find an exact solution.  Fall back here depending on what
+  // is being looked for.
+  assert(BestMatchIdx != -1 && "Didn't find alignment info for this datatype!");
+
+  // Since we got a "best match" index, just return it.
+  return ABIInfo ? Alignments[BestMatchIdx].ABIAlign
+                 : Alignments[BestMatchIdx].PrefAlign;
+}
+
+/// LayoutInfo - The lazy cache of structure layout information maintained by
+/// TargetData.  Note that the struct types must have been freed before
+/// llvm_shutdown is called (and thus this is deallocated) because all the
+/// targets with cached elements should have been destroyed.
+///
+typedef std::pair<const TargetData*,const StructType*> LayoutKey;
+
+struct DenseMapLayoutKeyInfo {
+  static inline LayoutKey getEmptyKey() { return LayoutKey(0, 0); }
+  static inline LayoutKey getTombstoneKey() {
+    return LayoutKey((TargetData*)(intptr_t)-1, 0);
+  }
+  static unsigned getHashValue(const LayoutKey &Val) {
+    return DenseMapKeyInfo<void*>::getHashValue(Val.first) ^
+           DenseMapKeyInfo<void*>::getHashValue(Val.second);
+  }
+  static bool isPod() { return true; }
+};
+
+typedef DenseMap<LayoutKey, StructLayout*, DenseMapLayoutKeyInfo> LayoutInfoTy;
+static ManagedStatic<LayoutInfoTy> LayoutInfo;
+
+
+TargetData::~TargetData() {
+  if (LayoutInfo.isConstructed()) {
+    // Remove any layouts for this TD.
+    LayoutInfoTy &TheMap = *LayoutInfo;
+    for (LayoutInfoTy::iterator I = TheMap.begin(), E = TheMap.end();
+         I != E; ) {
+      if (I->first.first == this) {
+        I->second->~StructLayout();
+        free(I->second);
+        TheMap.erase(I++);
+      } else {
+        ++I;
+      }
+    }
+  }
+}
+
+const StructLayout *TargetData::getStructLayout(const StructType *Ty) const {
+  LayoutInfoTy &TheMap = *LayoutInfo;
+  
+  StructLayout *&SL = TheMap[LayoutKey(this, Ty)];
+  if (SL) return SL;
+
+  // Otherwise, create the struct layout.  Because it is variable length, we 
+  // malloc it, then use placement new.
+  int NumElts = Ty->getNumElements();
+  StructLayout *L =
+    (StructLayout *)malloc(sizeof(StructLayout)+(NumElts-1)*sizeof(uint64_t));
+  
+  // Set SL before calling StructLayout's ctor.  The ctor could cause other
+  // entries to be added to TheMap, invalidating our reference.
+  SL = L;
+  
+  new (L) StructLayout(Ty, *this);
+  return L;
+}
+
+/// InvalidateStructLayoutInfo - TargetData speculatively caches StructLayout
+/// objects.  If a TargetData object is alive when types are being refined and
+/// removed, this method must be called whenever a StructType is removed to
+/// avoid a dangling pointer in this cache.
+void TargetData::InvalidateStructLayoutInfo(const StructType *Ty) const {
+  if (!LayoutInfo.isConstructed()) return;  // No cache.
+  
+  LayoutInfoTy::iterator I = LayoutInfo->find(LayoutKey(this, Ty));
+  if (I != LayoutInfo->end()) {
+    I->second->~StructLayout();
+    free(I->second);
+    LayoutInfo->erase(I);
+  }
+}
+
+
+std::string TargetData::getStringRepresentation() const {
+  std::string repr;
+  repr.append(LittleEndian ? "e" : "E");
+  repr.append("-p:").append(itostr((int64_t) (PointerMemSize * 8))).
+      append(":").append(itostr((int64_t) (PointerABIAlign * 8))).
+      append(":").append(itostr((int64_t) (PointerPrefAlign * 8)));
+  for (align_const_iterator I = Alignments.begin();
+       I != Alignments.end();
+       ++I) {
+    repr.append("-").append(1, (char) I->AlignType).
+      append(utostr((int64_t) I->TypeBitWidth)).
+      append(":").append(utostr((uint64_t) (I->ABIAlign * 8))).
+      append(":").append(utostr((uint64_t) (I->PrefAlign * 8)));
+  }
+  return repr;
+}
+
+
+uint64_t TargetData::getTypeSize(const Type *Ty) const {
+  assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!");
+  switch (Ty->getTypeID()) {
+  case Type::LabelTyID:
+  case Type::PointerTyID:
+    return getPointerSize();
+  case Type::ArrayTyID: {
+    const ArrayType *ATy = cast<ArrayType>(Ty);
+    uint64_t Size;
+    unsigned char Alignment;
+    Size = getTypeSize(ATy->getElementType());
+    Alignment = getABITypeAlignment(ATy->getElementType());
+    uint64_t AlignedSize = (Size + Alignment - 1)/Alignment*Alignment;
+    return AlignedSize*ATy->getNumElements();
+  }
+  case Type::StructTyID: {
+    // Get the layout annotation... which is lazily created on demand.
+    const StructLayout *Layout = getStructLayout(cast<StructType>(Ty));
+    return Layout->getSizeInBytes();
+  }
+  case Type::IntegerTyID: {
+    unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
+    if (BitWidth <= 8) {
+      return 1;
+    } else if (BitWidth <= 16) {
+      return 2;
+    } else if (BitWidth <= 32) {
+      return 4;
+    } else if (BitWidth <= 64) {
+      return 8;
+    } else {
+      // The size of this > 64 bit type is chosen as a multiple of the
+      // preferred alignment of the largest "native" size the target supports. 
+      // We first obtain the alignment info for this type and then compute
+      // the next largest multiple of that size.
+      uint64_t size = getAlignmentInfo(INTEGER_ALIGN, BitWidth, false) * 8;
+      return (((BitWidth / (size)) + (BitWidth % size != 0)) * size) / 8;
+    }
+    break;
+  }
+  case Type::VoidTyID:
+    return 1;
+  case Type::FloatTyID:
+    return 4;
+  case Type::DoubleTyID:
+    return 8;
+  case Type::VectorTyID: {
+    const VectorType *PTy = cast<VectorType>(Ty);
+    return PTy->getBitWidth() / 8;
+  }
+  default:
+    assert(0 && "TargetData::getTypeSize(): Unsupported type");
+    break;
+  }
+  return 0;
+}
+
+uint64_t TargetData::getTypeSizeInBits(const Type *Ty) const {
+  if (Ty->isInteger())
+    return cast<IntegerType>(Ty)->getBitWidth();
+  else
+    return getTypeSize(Ty) * 8;
+}
+
+
+/*!
+  \param abi_or_pref Flag that determines which alignment is returned. true
+  returns the ABI alignment, false returns the preferred alignment.
+  \param Ty The underlying type for which alignment is determined.
+
+  Get the ABI (\a abi_or_pref == true) or preferred alignment (\a abi_or_pref
+  == false) for the requested type \a Ty.
+ */
+unsigned char TargetData::getAlignment(const Type *Ty, bool abi_or_pref) const {
+  int AlignType = -1;
+
+  assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!");
+  switch (Ty->getTypeID()) {
+  /* Early escape for the non-numeric types */
+  case Type::LabelTyID:
+  case Type::PointerTyID:
+    return (abi_or_pref
+            ? getPointerABIAlignment()
+            : getPointerPrefAlignment());
+  case Type::ArrayTyID:
+    return getAlignment(cast<ArrayType>(Ty)->getElementType(), abi_or_pref);
+    
+  case Type::StructTyID: {
+    // Packed structure types always have an ABI alignment of one.
+    if (cast<StructType>(Ty)->isPacked() && abi_or_pref)
+      return 1;
+    
+    // Get the layout annotation... which is lazily created on demand.
+    const StructLayout *Layout = getStructLayout(cast<StructType>(Ty));
+    unsigned Align = getAlignmentInfo(AGGREGATE_ALIGN, 0, abi_or_pref);
+    return std::max(Align, (unsigned)Layout->getAlignment());
+  }
+  case Type::IntegerTyID:
+  case Type::VoidTyID:
+    AlignType = INTEGER_ALIGN;
+    break;
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+    AlignType = FLOAT_ALIGN;
+    break;
+  case Type::VectorTyID: {
+    const VectorType *VTy = cast<VectorType>(Ty);
+    // Degenerate vectors are assumed to be scalarized
+    if (VTy->getNumElements() == 1)
+      return getAlignment(VTy->getElementType(), abi_or_pref);
+    else
+      AlignType = VECTOR_ALIGN;
+    break;
+  }
+  default:
+    assert(0 && "Bad type for getAlignment!!!");
+    break;
+  }
+
+  return getAlignmentInfo((AlignTypeEnum)AlignType, getTypeSize(Ty) * 8,
+                          abi_or_pref);
+}
+
+unsigned char TargetData::getABITypeAlignment(const Type *Ty) const {
+  return getAlignment(Ty, true);
+}
+
+unsigned char TargetData::getPrefTypeAlignment(const Type *Ty) const {
+  return getAlignment(Ty, false);
+}
+
+unsigned char TargetData::getPreferredTypeAlignmentShift(const Type *Ty) const {
+  unsigned Align = (unsigned) getPrefTypeAlignment(Ty);
+  assert(!(Align & (Align-1)) && "Alignment is not a power of two!");
+  return Log2_32(Align);
+}
+
+/// getIntPtrType - Return an unsigned integer type that is the same size as,
+/// or larger than, the target's pointer size.
+const Type *TargetData::getIntPtrType() const {
+  switch (getPointerSize()) {
+  default: assert(0 && "Unknown pointer size!");
+  case 2: return Type::Int16Ty;
+  case 4: return Type::Int32Ty;
+  case 8: return Type::Int64Ty;
+  }
+}
+
+
+uint64_t TargetData::getIndexedOffset(const Type *ptrTy, Value* const* Indices,
+                                      unsigned NumIndices) const {
+  const Type *Ty = ptrTy;
+  assert(isa<PointerType>(Ty) && "Illegal argument for getIndexedOffset()");
+  uint64_t Result = 0;
+
+  generic_gep_type_iterator<Value* const*>
+    TI = gep_type_begin(ptrTy, Indices, Indices+NumIndices);
+  for (unsigned CurIDX = 0; CurIDX != NumIndices; ++CurIDX, ++TI) {
+    if (const StructType *STy = dyn_cast<StructType>(*TI)) {
+      assert(Indices[CurIDX]->getType() == Type::Int32Ty &&
+             "Illegal struct idx");
+      unsigned FieldNo = cast<ConstantInt>(Indices[CurIDX])->getZExtValue();
+
+      // Get structure layout information...
+      const StructLayout *Layout = getStructLayout(STy);
+
+      // Add in the offset, as calculated by the structure layout info...
+      Result += Layout->getElementOffset(FieldNo);
+
+      // Update Ty to refer to current element
+      Ty = STy->getElementType(FieldNo);
+    } else {
+      // Update Ty to refer to current element
+      Ty = cast<SequentialType>(Ty)->getElementType();
+
+      // Get the array index and the size of each array element.
+      int64_t arrayIdx = cast<ConstantInt>(Indices[CurIDX])->getSExtValue();
+      Result += arrayIdx * (int64_t)getTypeSize(Ty);
+    }
+  }
+
+  return Result;
+}
+
+/// getPreferredAlignmentLog - Return the preferred alignment of the
+/// specified global, returned in log form.  This includes an explicitly
+/// requested alignment (if the global has one).
+unsigned TargetData::getPreferredAlignmentLog(const GlobalVariable *GV) const {
+  const Type *ElemType = GV->getType()->getElementType();
+  unsigned Alignment = getPreferredTypeAlignmentShift(ElemType);
+  if (GV->getAlignment() > (1U << Alignment))
+    Alignment = Log2_32(GV->getAlignment());
+  
+  if (GV->hasInitializer()) {
+    if (Alignment < 4) {
+      // If the global is not external, see if it is large.  If so, give it a
+      // larger alignment.
+      if (getTypeSize(ElemType) > 128)
+        Alignment = 4;    // 16-byte alignment.
+    }
+  }
+  return Alignment;
+}
diff --git a/lib/Target/TargetFrameInfo.cpp b/lib/Target/TargetFrameInfo.cpp
new file mode 100644
index 0000000..e6f15b1
--- /dev/null
+++ b/lib/Target/TargetFrameInfo.cpp
@@ -0,0 +1,19 @@
+//===-- TargetFrameInfo.cpp - Implement machine frame interface -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the layout of a stack frame on the target machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetFrameInfo.h"
+#include <cstdlib>
+using namespace llvm;
+
+TargetFrameInfo::~TargetFrameInfo() {
+}
diff --git a/lib/Target/TargetInstrInfo.cpp b/lib/Target/TargetInstrInfo.cpp
new file mode 100644
index 0000000..11b5b63
--- /dev/null
+++ b/lib/Target/TargetInstrInfo.cpp
@@ -0,0 +1,99 @@
+//===-- TargetInstrInfo.cpp - Target Instruction Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Constant.h"
+#include "llvm/DerivedTypes.h"
+using namespace llvm;
+
+/// findTiedToSrcOperand - Returns the operand that is tied to the specified
+/// dest operand. Returns -1 if there isn't one.
+int TargetInstrDescriptor::findTiedToSrcOperand(unsigned OpNum) const {
+  for (unsigned i = 0, e = numOperands; i != e; ++i) {
+    if (i == OpNum)
+      continue;
+    if (getOperandConstraint(i, TOI::TIED_TO) == (int)OpNum)
+      return i;
+  }
+  return -1;
+}
+
+
+TargetInstrInfo::TargetInstrInfo(const TargetInstrDescriptor* Desc,
+                                 unsigned numOpcodes)
+  : desc(Desc), NumOpcodes(numOpcodes) {
+}
+
+TargetInstrInfo::~TargetInstrInfo() {
+}
+
+// commuteInstruction - The default implementation of this method just exchanges
+// operands 1 and 2.
+MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI) const {
+  assert(MI->getOperand(1).isRegister() && MI->getOperand(2).isRegister() &&
+         "This only knows how to commute register operands so far");
+  unsigned Reg1 = MI->getOperand(1).getReg();
+  unsigned Reg2 = MI->getOperand(2).getReg();
+  bool Reg1IsKill = MI->getOperand(1).isKill();
+  bool Reg2IsKill = MI->getOperand(2).isKill();
+  MI->getOperand(2).setReg(Reg1);
+  MI->getOperand(1).setReg(Reg2);
+  if (Reg1IsKill)
+    MI->getOperand(2).setIsKill();
+  else
+    MI->getOperand(2).unsetIsKill();
+  if (Reg2IsKill)
+    MI->getOperand(1).setIsKill();
+  else
+    MI->getOperand(1).unsetIsKill();
+  return MI;
+}
+
+bool TargetInstrInfo::PredicateInstruction(MachineInstr *MI,
+                                const std::vector<MachineOperand> &Pred) const {
+  bool MadeChange = false;
+  const TargetInstrDescriptor *TID = MI->getInstrDescriptor();
+  if (TID->Flags & M_PREDICABLE) {
+    for (unsigned j = 0, i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      if ((TID->OpInfo[i].Flags & M_PREDICATE_OPERAND)) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (MO.isReg()) {
+          MO.setReg(Pred[j].getReg());
+          MadeChange = true;
+        } else if (MO.isImm()) {
+          MO.setImm(Pred[j].getImmedValue());
+          MadeChange = true;
+        } else if (MO.isMBB()) {
+          MO.setMachineBasicBlock(Pred[j].getMachineBasicBlock());
+          MadeChange = true;
+        }
+        ++j;
+      }
+    }
+  }
+  return MadeChange;
+}
+
+bool TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  const TargetInstrDescriptor *TID = MI->getInstrDescriptor();
+  if (TID->Flags & M_TERMINATOR_FLAG) {
+    // Conditional branch is a special case.
+    if ((TID->Flags & M_BRANCH_FLAG) != 0 && (TID->Flags & M_BARRIER_FLAG) == 0)
+      return true;
+    if ((TID->Flags & M_PREDICABLE) == 0)
+      return true;
+    return !isPredicated(MI);
+  }
+  return false;
+}
diff --git a/lib/Target/TargetMachOWriterInfo.cpp b/lib/Target/TargetMachOWriterInfo.cpp
new file mode 100644
index 0000000..9a9d0d3
--- /dev/null
+++ b/lib/Target/TargetMachOWriterInfo.cpp
@@ -0,0 +1,25 @@
+//===-- TargetMachOWriterInfo.cpp - Mach-O Writer Information ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bill Wendling and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetMachOWriterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetMachOWriterInfo.h"
+#include "llvm/CodeGen/MachineRelocation.h"
+using namespace llvm;
+
+TargetMachOWriterInfo::~TargetMachOWriterInfo() {}
+
+MachineRelocation
+TargetMachOWriterInfo::GetJTRelocation(unsigned Offset,
+                                       MachineBasicBlock *MBB) const {
+  // FIXME: do something about PIC
+  return MachineRelocation::getBB(Offset, MachineRelocation::VANILLA, MBB);
+}
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
new file mode 100644
index 0000000..6c00a3f
--- /dev/null
+++ b/lib/Target/TargetMachine.cpp
@@ -0,0 +1,164 @@
+//===-- TargetMachine.cpp - General Target Information ---------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the general parts of a Target machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+//---------------------------------------------------------------------------
+// Command-line options that tend to be useful on more than one back-end.
+//
+
+namespace llvm {
+  bool PrintMachineCode;
+  bool NoFramePointerElim;
+  bool NoExcessFPPrecision;
+  bool UnsafeFPMath;
+  bool FiniteOnlyFPMathOption;
+  bool HonorSignDependentRoundingFPMathOption;
+  bool UseSoftFloat;
+  bool NoZerosInBSS;
+  bool ExceptionHandling;
+  Reloc::Model RelocationModel;
+  CodeModel::Model CMModel;
+}
+namespace {
+  cl::opt<bool, true> PrintCode("print-machineinstrs",
+    cl::desc("Print generated machine code"),
+    cl::location(PrintMachineCode), cl::init(false));
+
+  cl::opt<bool, true>
+    DisableFPElim("disable-fp-elim",
+                  cl::desc("Disable frame pointer elimination optimization"),
+                  cl::location(NoFramePointerElim),
+                  cl::init(false));
+  cl::opt<bool, true>
+  DisableExcessPrecision("disable-excess-fp-precision",
+               cl::desc("Disable optimizations that may increase FP precision"),
+               cl::location(NoExcessFPPrecision),
+               cl::init(false));
+  cl::opt<bool, true>
+  EnableUnsafeFPMath("enable-unsafe-fp-math",
+               cl::desc("Enable optimizations that may decrease FP precision"),
+               cl::location(UnsafeFPMath),
+               cl::init(false));
+  cl::opt<bool, true>
+  EnableFiniteOnlyFPMath("enable-finite-only-fp-math",
+               cl::desc("Enable optimizations that assumes non- NaNs / +-Infs"),
+               cl::location(FiniteOnlyFPMathOption),
+               cl::init(false));
+  cl::opt<bool, true>
+  EnableHonorSignDependentRoundingFPMath(cl::Hidden,
+               "enable-sign-dependent-rounding-fp-math",
+       cl::desc("Force codegen to assume rounding mode can change dynamically"),
+               cl::location(HonorSignDependentRoundingFPMathOption),
+               cl::init(false));
+
+  cl::opt<bool, true>
+  GenerateSoftFloatCalls("soft-float",
+               cl::desc("Generate software floating point library calls"),
+               cl::location(UseSoftFloat),
+               cl::init(false));
+  cl::opt<bool, true>
+  DontPlaceZerosInBSS("nozero-initialized-in-bss",
+              cl::desc("Don't place zero-initialized symbols into bss section"),
+              cl::location(NoZerosInBSS),
+              cl::init(false));
+  cl::opt<bool, true>
+  EnableExceptionHandling("enable-eh",
+               cl::desc("Exception handling should be emitted."),
+               cl::location(ExceptionHandling),
+               cl::init(false));
+
+  cl::opt<llvm::Reloc::Model, true>
+  DefRelocationModel(
+    "relocation-model",
+    cl::desc("Choose relocation model"),
+    cl::location(RelocationModel),
+    cl::init(Reloc::Default),
+    cl::values(
+      clEnumValN(Reloc::Default, "default",
+                 "  Target default relocation model"),
+      clEnumValN(Reloc::Static, "static",
+                 "  Non-relocatable code"),
+      clEnumValN(Reloc::PIC_, "pic",
+                 "  Fully relocatable, position independent code"),
+      clEnumValN(Reloc::DynamicNoPIC, "dynamic-no-pic",
+                 "  Relocatable external references, non-relocatable code"),
+      clEnumValEnd));
+  cl::opt<llvm::CodeModel::Model, true>
+  DefCodeModel(
+    "code-model",
+    cl::desc("Choose code model"),
+    cl::location(CMModel),
+    cl::init(CodeModel::Default),
+    cl::values(
+      clEnumValN(CodeModel::Default, "default",
+                 "  Target default code model"),
+      clEnumValN(CodeModel::Small, "small",
+                 "  Small code model"),
+      clEnumValN(CodeModel::Kernel, "kernel",
+                 "  Kernel code model"),
+      clEnumValN(CodeModel::Medium, "medium",
+                 "  Medium code model"),
+      clEnumValN(CodeModel::Large, "large",
+                 "  Large code model"),
+      clEnumValEnd));
+}
+
+//---------------------------------------------------------------------------
+// TargetMachine Class
+//
+
+TargetMachine::~TargetMachine() {
+  delete AsmInfo;
+}
+
+/// getRelocationModel - Returns the code generation relocation model. The
+/// choices are static, PIC, dynamic-no-pic, and target default.
+Reloc::Model TargetMachine::getRelocationModel() {
+  return RelocationModel;
+}
+
+/// setRelocationModel - Sets the code generation relocation model.
+void TargetMachine::setRelocationModel(Reloc::Model Model) {
+  RelocationModel = Model;
+}
+
+/// getCodeModel - Returns the code model. The choices are small, kernel,
+/// medium, large, and target default.
+CodeModel::Model TargetMachine::getCodeModel() {
+  return CMModel;
+}
+
+/// setCodeModel - Sets the code model.
+void TargetMachine::setCodeModel(CodeModel::Model Model) {
+  CMModel = Model;
+}
+
+namespace llvm {
+  /// FiniteOnlyFPMath - This returns true when the -enable-finite-only-fp-math
+  /// option is specified on the command line. If this returns false (default),
+  /// the code generator is not allowed to assume that FP arithmetic arguments
+  /// and results are never NaNs or +-Infs.
+  bool FiniteOnlyFPMath() { return UnsafeFPMath || FiniteOnlyFPMathOption; }
+  
+  /// HonorSignDependentRoundingFPMath - Return true if the codegen must assume
+  /// that the rounding mode of the FPU can change from its default.
+  bool HonorSignDependentRoundingFPMath() {
+    return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption;
+  }
+}
+
diff --git a/lib/Target/TargetMachineRegistry.cpp b/lib/Target/TargetMachineRegistry.cpp
new file mode 100644
index 0000000..2ab8f51
--- /dev/null
+++ b/lib/Target/TargetMachineRegistry.cpp
@@ -0,0 +1,107 @@
+//===-- TargetMachineRegistry.cpp - Target Auto Registration Impl ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes the RegisterTarget class, which TargetMachine
+// implementations should use to register themselves with the system.  This file
+// also exposes the TargetMachineRegistry class, which allows tools to inspect
+// all of the registered targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetMachineRegistry.h"
+#include <algorithm>
+using namespace llvm;
+
+/// List - This is the main list of all of the registered target machines.
+const TargetMachineRegistry::Entry *TargetMachineRegistry::List = 0;
+
+/// Listeners - All of the listeners registered to get notified when new targets
+/// are loaded.
+static TargetRegistrationListener *Listeners = 0;
+
+TargetMachineRegistry::Entry::Entry(const char *N, const char *SD,
+                       TargetMachine *(*CF)(const Module &,const std::string &),
+                           unsigned (*MMF)(const Module &M), unsigned (*JMF)())
+  : Name(N), ShortDesc(SD), CtorFn(CF), ModuleMatchQualityFn(MMF),
+    JITMatchQualityFn(JMF), Next(List) {
+  List = this;
+  for (TargetRegistrationListener *L = Listeners; L; L = L->getNext())
+    L->targetRegistered(this);
+}
+
+TargetRegistrationListener::TargetRegistrationListener() {
+  Next = Listeners;
+  if (Next) Next->Prev = &Next;
+  Prev = &Listeners;
+  Listeners = this;
+}
+
+TargetRegistrationListener::~TargetRegistrationListener() {
+  *Prev = Next;
+}
+
+/// getClosestStaticTargetForModule - Given an LLVM module, pick the best target
+/// that is compatible with the module.  If no close target can be found, this
+/// returns null and sets the Error string to a reason.
+const TargetMachineRegistry::Entry *
+TargetMachineRegistry::getClosestStaticTargetForModule(const Module &M,
+                                                       std::string &Error) {
+  std::vector<std::pair<unsigned, const Entry *> > UsableTargets;
+  for (const Entry *E = getList(); E; E = E->getNext())
+    if (unsigned Qual = E->ModuleMatchQualityFn(M))
+      UsableTargets.push_back(std::make_pair(Qual, E));
+
+  if (UsableTargets.empty()) {
+    Error = "No available targets are compatible with this module";
+    return 0;
+  } else if (UsableTargets.size() == 1)
+    return UsableTargets.back().second;
+
+  // Otherwise, take the best target, but make sure we don't have two equally
+  // good best targets.
+  std::sort(UsableTargets.begin(), UsableTargets.end());
+  if (UsableTargets.back().first ==
+      UsableTargets[UsableTargets.size()-2].first) {
+    Error = "Cannot choose between targets \"" +
+      std::string(UsableTargets.back().second->Name) + "\" and \"" +
+      std::string(UsableTargets[UsableTargets.size()-2].second->Name) + "\"";
+    return 0;
+  }
+  return UsableTargets.back().second;
+}
+
+/// getClosestTargetForJIT - Given an LLVM module, pick the best target that
+/// is compatible with the current host and the specified module.  If no
+/// close target can be found, this returns null and sets the Error string
+/// to a reason.
+const TargetMachineRegistry::Entry *
+TargetMachineRegistry::getClosestTargetForJIT(std::string &Error) {
+  std::vector<std::pair<unsigned, const Entry *> > UsableTargets;
+  for (const Entry *E = getList(); E; E = E->getNext())
+    if (unsigned Qual = E->JITMatchQualityFn())
+      UsableTargets.push_back(std::make_pair(Qual, E));
+
+  if (UsableTargets.empty()) {
+    Error = "No JIT is available for this host";
+    return 0;
+  } else if (UsableTargets.size() == 1)
+    return UsableTargets.back().second;
+
+  // Otherwise, take the best target.  If there is a tie, just pick one.
+  unsigned MaxQual = UsableTargets.front().first;
+  const Entry *MaxQualTarget = UsableTargets.front().second;
+
+  for (unsigned i = 1, e = UsableTargets.size(); i != e; ++i)
+    if (UsableTargets[i].first > MaxQual) {
+      MaxQual = UsableTargets[i].first;
+      MaxQualTarget = UsableTargets[i].second;
+    }
+
+  return MaxQualTarget;
+}
+
diff --git a/lib/Target/TargetSchedule.td b/lib/Target/TargetSchedule.td
new file mode 100644
index 0000000..8ac537f
--- /dev/null
+++ b/lib/Target/TargetSchedule.td
@@ -0,0 +1,72 @@
+//===- TargetSchedule.td - Target Independent Scheduling ---*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file defines the target-independent scheduling interfaces which should
+// be implemented by each target which is using TableGen based scheduling.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Processor functional unit - These values represent the functional units
+// available across all chip sets for the target.  E.g., IntUnit, FPUnit, ...
+// These may be independent values for each chip set or may be shared across
+// all chip sets of the target.  Each functional unit is treated as a resource
+// during scheduling and affects instruction order based on availability
+// during a time interval.
+//  
+class FuncUnit;
+
+//===----------------------------------------------------------------------===//
+// Instruction stage - These values represent a step in the execution of an
+// instruction.  The latency represents the number of discrete time slots
+// needed to complete the stage.  Units represent the choice of functional units
+// that can be used to complete the stage.  Eg. IntUnit1, IntUnit2.
+//
+class InstrStage<int cycles, list<FuncUnit> units> {
+  int Cycles          = cycles;       // length of stage in machine cycles
+  list<FuncUnit> Units = units;       // choice of functional units
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction itinerary - An itinerary represents a sequential series of steps
+// required to complete an instruction.  Itineraries are represented as lists of
+// instruction stages.
+//
+
+//===----------------------------------------------------------------------===//
+// Instruction itinerary classes - These values represent 'named' instruction
+// itineraries.  Using named itineraries simplifies managing groups of
+// instructions across chip sets.  An instruction uses the same itinerary class
+// across all chip sets.  Thus a new chip set can be added without modifying
+// instruction information.
+//
+class InstrItinClass;
+def NoItinerary : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Instruction itinerary data - These values provide a runtime map of an 
+// instruction itinerary class (name) to its itinerary data.
+//
+class InstrItinData<InstrItinClass Class, list<InstrStage> stages> {
+  InstrItinClass TheClass = Class;
+  list<InstrStage> Stages = stages;
+}
+
+//===----------------------------------------------------------------------===//
+// Processor itineraries - These values represent the set of all itinerary
+// classes for a given chip set.
+//
+class ProcessorItineraries<list<InstrItinData> iid> {
+  list<InstrItinData> IID = iid;
+}
+
+// NoItineraries - A marker that can be used by processors without schedule
+// info.
+def NoItineraries : ProcessorItineraries<[]>;
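+
+// As an illustrative sketch only (the functional unit, itinerary class, and
+// itinerary names below are hypothetical and not defined in this file), a
+// target might describe a single-cycle ALU pipeline as:
+//
+//   def ALU   : FuncUnit;
+//   def IIAlu : InstrItinClass;
+//   def GenericItineraries : ProcessorItineraries<[
+//     InstrItinData<IIAlu, [InstrStage<1, [ALU]>]>
+//   ]>;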
+
diff --git a/lib/Target/TargetSelectionDAG.td b/lib/Target/TargetSelectionDAG.td
new file mode 100644
index 0000000..491bb02
--- /dev/null
+++ b/lib/Target/TargetSelectionDAG.td
@@ -0,0 +1,763 @@
+//===- TargetSelectionDAG.td - Common code for DAG isels ---*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file defines the target-independent interfaces used by SelectionDAG
+// instruction selection generators.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Selection DAG Type Constraint definitions.
+//
+// Note that the semantics of these constraints are hard coded into tblgen.  To
+// modify or add constraints, you have to hack tblgen.
+//
+
+class SDTypeConstraint<int opnum> {
+  int OperandNum = opnum;
+}
+
+// SDTCisVT - The specified operand has exactly this VT.
+class SDTCisVT<int OpNum, ValueType vt> : SDTypeConstraint<OpNum> {
+  ValueType VT = vt;
+}
+
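+// SDTCisPtrTy - The specified operand has pointer type.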
+class SDTCisPtrTy<int OpNum> : SDTypeConstraint<OpNum>;
+
+// SDTCisInt - The specified operand has integer type.
+class SDTCisInt<int OpNum> : SDTypeConstraint<OpNum>;
+
+// SDTCisFP - The specified operand has floating point type.
+class SDTCisFP<int OpNum> : SDTypeConstraint<OpNum>;
+
+// SDTCisSameAs - The two specified operands have identical types.
+class SDTCisSameAs<int OpNum, int OtherOp> : SDTypeConstraint<OpNum> {
+  int OtherOperandNum = OtherOp;
+}
+
+// SDTCisVTSmallerThanOp - The specified operand is a VT SDNode, and its type is
+// smaller than the 'Other' operand.
+class SDTCisVTSmallerThanOp<int OpNum, int OtherOp> : SDTypeConstraint<OpNum> {
+  int OtherOperandNum = OtherOp;
+}
+
+class SDTCisOpSmallerThanOp<int SmallOp, int BigOp> : SDTypeConstraint<SmallOp>{
+  int BigOperandNum = BigOp;
+}
+
+/// SDTCisIntVectorOfSameSize - This indicates that ThisOp and OtherOp are
+/// vector types, and that ThisOp is the result of 
+/// MVT::getIntVectorWithNumElements with the number of elements that OtherOp
+/// has.
+class SDTCisIntVectorOfSameSize<int ThisOp, int OtherOp>
+  : SDTypeConstraint<ThisOp> {
+  int OtherOpNum = OtherOp;
+}
+
+//===----------------------------------------------------------------------===//
+// Selection DAG Type Profile definitions.
+//
+// These use the constraints defined above to describe the type requirements of
+// the various nodes.  These are not hard coded into tblgen, allowing targets to
+// add their own if needed.
+//
+
+// SDTypeProfile - This profile describes the type requirements of a Selection
+// DAG node.
+class SDTypeProfile<int numresults, int numoperands,
+                    list<SDTypeConstraint> constraints> {
+  int NumResults = numresults;
+  int NumOperands = numoperands;
+  list<SDTypeConstraint> Constraints = constraints;
+}
+
+// Builtin profiles.
+def SDTIntLeaf: SDTypeProfile<1, 0, [SDTCisInt<0>]>;      // for 'imm'.
+def SDTFPLeaf : SDTypeProfile<1, 0, [SDTCisFP<0>]>;       // for 'fpimm'.
+def SDTPtrLeaf: SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;      // for '&g'.
+def SDTOther  : SDTypeProfile<1, 0, [SDTCisVT<0, OtherVT>]>; // for 'vt'.
+def SDTUNDEF  : SDTypeProfile<1, 0, []>; // for 'undef'.
+def SDTUnaryOp  : SDTypeProfile<1, 1, []>; // bitconvert
+
+def SDTIntBinOp : SDTypeProfile<1, 2, [   // add, and, or, xor, udiv, etc.
+  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>
+]>;
+def SDTIntShiftOp : SDTypeProfile<1, 2, [   // shl, sra, srl
+  SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>
+]>;
+def SDTFPBinOp : SDTypeProfile<1, 2, [      // fadd, fmul, etc.
+  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>
+]>;
+def SDTFPSignOp : SDTypeProfile<1, 2, [      // fcopysign.
+  SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisFP<2>
+]>;
+def SDTFPTernaryOp : SDTypeProfile<1, 3, [      // fmadd, fnmsub, etc.
+  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisFP<0>
+]>;
+def SDTIntUnaryOp : SDTypeProfile<1, 1, [   // ctlz
+  SDTCisSameAs<0, 1>, SDTCisInt<0>
+]>;
+def SDTIntExtendOp : SDTypeProfile<1, 1, [  // sext, zext, anyext
+  SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>
+]>;
+def SDTIntTruncOp  : SDTypeProfile<1, 1, [  // trunc
+  SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<0, 1>
+]>;
+def SDTFPUnaryOp  : SDTypeProfile<1, 1, [   // fneg, fsqrt, etc
+  SDTCisSameAs<0, 1>, SDTCisFP<0>
+]>;
+def SDTFPRoundOp  : SDTypeProfile<1, 1, [   // fround
+  SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>
+]>;
+def SDTFPExtendOp  : SDTypeProfile<1, 1, [   // fextend
+  SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<1, 0>
+]>;
+def SDTIntToFPOp : SDTypeProfile<1, 1, [   // [su]int_to_fp 
+  SDTCisFP<0>, SDTCisInt<1>
+]>;
+def SDTFPToIntOp : SDTypeProfile<1, 1, [   // fp_to_[su]int 
+  SDTCisInt<0>, SDTCisFP<1>
+]>;
+def SDTExtInreg : SDTypeProfile<1, 2, [   // sext_inreg
+  SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisVT<2, OtherVT>,
+  SDTCisVTSmallerThanOp<2, 1>
+]>;
+
+def SDTSetCC : SDTypeProfile<1, 3, [ // setcc
+  SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
+]>;
+
+def SDTSelect : SDTypeProfile<1, 3, [ // select 
+  SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>
+]>;
+
+def SDTSelectCC : SDTypeProfile<1, 5, [ // select_cc
+  SDTCisSameAs<1, 2>, SDTCisSameAs<3, 4>, SDTCisSameAs<0, 3>,
+  SDTCisVT<5, OtherVT>
+]>;
+
+def SDTBr : SDTypeProfile<0, 1, [ // br
+  SDTCisVT<0, OtherVT>
+]>;
+
+def SDTBrcond : SDTypeProfile<0, 2, [ // brcond
+  SDTCisInt<0>, SDTCisVT<1, OtherVT>
+]>;
+
+def SDTBrind : SDTypeProfile<0, 1, [ // brind
+  SDTCisPtrTy<0>
+]>;
+
+def SDTRet : SDTypeProfile<0, 0, []>; // ret
+
+def SDTLoad : SDTypeProfile<1, 1, [ // load
+  SDTCisPtrTy<1>  
+]>;
+
+def SDTStore : SDTypeProfile<0, 2, [ // store
+  SDTCisPtrTy<1>  
+]>;
+
+def SDTIStore : SDTypeProfile<1, 3, [ // indexed store
+  SDTCisSameAs<0, 2>, SDTCisPtrTy<0>, SDTCisPtrTy<3>
+]>;
+
+def SDTVecShuffle : SDTypeProfile<1, 3, [
+  SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisIntVectorOfSameSize<3, 0>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Selection DAG Node Properties.
+//
+// Note: These are hard coded into tblgen.
+//
+class SDNodeProperty;
+def SDNPCommutative : SDNodeProperty;   // X op Y == Y op X
+def SDNPAssociative : SDNodeProperty;   // (X op Y) op Z == X op (Y op Z)
+def SDNPHasChain    : SDNodeProperty;   // R/W chain operand and result
+def SDNPOutFlag     : SDNodeProperty;   // Write a flag result
+def SDNPInFlag      : SDNodeProperty;   // Read a flag operand
+def SDNPOptInFlag   : SDNodeProperty;   // Optionally read a flag operand
+
+//===----------------------------------------------------------------------===//
+// Selection DAG Node definitions.
+//
+class SDNode<string opcode, SDTypeProfile typeprof,
+             list<SDNodeProperty> props = [], string sdclass = "SDNode"> {
+  string Opcode  = opcode;
+  string SDClass = sdclass;
+  list<SDNodeProperty> Properties = props;
+  SDTypeProfile TypeProfile = typeprof;
+}
+
+def set;
+def node;
+def srcvalue;
+
+def imm        : SDNode<"ISD::Constant"  , SDTIntLeaf , [], "ConstantSDNode">;
+def fpimm      : SDNode<"ISD::TargetConstantFP",
+                         SDTFPLeaf, [], "ConstantFPSDNode">;
+def vt         : SDNode<"ISD::VALUETYPE" , SDTOther   , [], "VTSDNode">;
+def bb         : SDNode<"ISD::BasicBlock", SDTOther   , [], "BasicBlockSDNode">;
+def cond       : SDNode<"ISD::CONDCODE"  , SDTOther   , [], "CondCodeSDNode">;
+def undef      : SDNode<"ISD::UNDEF"     , SDTUNDEF   , []>;
+def globaladdr : SDNode<"ISD::GlobalAddress",         SDTPtrLeaf, [],
+                        "GlobalAddressSDNode">;
+def tglobaladdr : SDNode<"ISD::TargetGlobalAddress",  SDTPtrLeaf, [],
+                         "GlobalAddressSDNode">;
+def globaltlsaddr : SDNode<"ISD::GlobalTLSAddress",         SDTPtrLeaf, [],
+                          "GlobalAddressSDNode">;
+def tglobaltlsaddr : SDNode<"ISD::TargetGlobalTLSAddress",  SDTPtrLeaf, [],
+                           "GlobalAddressSDNode">;
+def constpool   : SDNode<"ISD::ConstantPool",         SDTPtrLeaf, [],
+                         "ConstantPoolSDNode">;
+def tconstpool  : SDNode<"ISD::TargetConstantPool",   SDTPtrLeaf, [],
+                         "ConstantPoolSDNode">;
+def jumptable   : SDNode<"ISD::JumpTable",            SDTPtrLeaf, [],
+                         "JumpTableSDNode">;
+def tjumptable  : SDNode<"ISD::TargetJumpTable",      SDTPtrLeaf, [],
+                         "JumpTableSDNode">;
+def frameindex  : SDNode<"ISD::FrameIndex",           SDTPtrLeaf, [],
+                         "FrameIndexSDNode">;
+def tframeindex : SDNode<"ISD::TargetFrameIndex",     SDTPtrLeaf, [],
+                         "FrameIndexSDNode">;
+def externalsym : SDNode<"ISD::ExternalSymbol",       SDTPtrLeaf, [],
+                         "ExternalSymbolSDNode">;
+def texternalsym: SDNode<"ISD::TargetExternalSymbol", SDTPtrLeaf, [],
+                         "ExternalSymbolSDNode">;
+
+def add        : SDNode<"ISD::ADD"       , SDTIntBinOp   ,
+                        [SDNPCommutative, SDNPAssociative]>;
+def sub        : SDNode<"ISD::SUB"       , SDTIntBinOp>;
+def mul        : SDNode<"ISD::MUL"       , SDTIntBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def mulhs      : SDNode<"ISD::MULHS"     , SDTIntBinOp, [SDNPCommutative]>;
+def mulhu      : SDNode<"ISD::MULHU"     , SDTIntBinOp, [SDNPCommutative]>;
+def sdiv       : SDNode<"ISD::SDIV"      , SDTIntBinOp>;
+def udiv       : SDNode<"ISD::UDIV"      , SDTIntBinOp>;
+def srem       : SDNode<"ISD::SREM"      , SDTIntBinOp>;
+def urem       : SDNode<"ISD::UREM"      , SDTIntBinOp>;
+def srl        : SDNode<"ISD::SRL"       , SDTIntShiftOp>;
+def sra        : SDNode<"ISD::SRA"       , SDTIntShiftOp>;
+def shl        : SDNode<"ISD::SHL"       , SDTIntShiftOp>;
+def rotl       : SDNode<"ISD::ROTL"      , SDTIntShiftOp>;
+def rotr       : SDNode<"ISD::ROTR"      , SDTIntShiftOp>;
+def and        : SDNode<"ISD::AND"       , SDTIntBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def or         : SDNode<"ISD::OR"        , SDTIntBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def xor        : SDNode<"ISD::XOR"       , SDTIntBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def addc       : SDNode<"ISD::ADDC"      , SDTIntBinOp,
+                        [SDNPCommutative, SDNPOutFlag]>;
+def adde       : SDNode<"ISD::ADDE"      , SDTIntBinOp,
+                        [SDNPCommutative, SDNPOutFlag, SDNPInFlag]>;
+def subc       : SDNode<"ISD::SUBC"      , SDTIntBinOp,
+                        [SDNPOutFlag]>;
+def sube       : SDNode<"ISD::SUBE"      , SDTIntBinOp,
+                        [SDNPOutFlag, SDNPInFlag]>;
+                        
+def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
+def bswap      : SDNode<"ISD::BSWAP"      , SDTIntUnaryOp>;
+def ctlz       : SDNode<"ISD::CTLZ"       , SDTIntUnaryOp>;
+def cttz       : SDNode<"ISD::CTTZ"       , SDTIntUnaryOp>;
+def ctpop      : SDNode<"ISD::CTPOP"      , SDTIntUnaryOp>;
+def sext       : SDNode<"ISD::SIGN_EXTEND", SDTIntExtendOp>;
+def zext       : SDNode<"ISD::ZERO_EXTEND", SDTIntExtendOp>;
+def anyext     : SDNode<"ISD::ANY_EXTEND" , SDTIntExtendOp>;
+def trunc      : SDNode<"ISD::TRUNCATE"   , SDTIntTruncOp>;
+def bitconvert : SDNode<"ISD::BIT_CONVERT", SDTUnaryOp>;
+                        
+def fadd       : SDNode<"ISD::FADD"       , SDTFPBinOp, [SDNPCommutative]>;
+def fsub       : SDNode<"ISD::FSUB"       , SDTFPBinOp>;
+def fmul       : SDNode<"ISD::FMUL"       , SDTFPBinOp, [SDNPCommutative]>;
+def fdiv       : SDNode<"ISD::FDIV"       , SDTFPBinOp>;
+def frem       : SDNode<"ISD::FREM"       , SDTFPBinOp>;
+def fabs       : SDNode<"ISD::FABS"       , SDTFPUnaryOp>;
+def fneg       : SDNode<"ISD::FNEG"       , SDTFPUnaryOp>;
+def fsqrt      : SDNode<"ISD::FSQRT"      , SDTFPUnaryOp>;
+def fsin       : SDNode<"ISD::FSIN"       , SDTFPUnaryOp>;
+def fcos       : SDNode<"ISD::FCOS"       , SDTFPUnaryOp>;
+
+def fround     : SDNode<"ISD::FP_ROUND"   , SDTFPRoundOp>;
+def fextend    : SDNode<"ISD::FP_EXTEND"  , SDTFPExtendOp>;
+def fcopysign  : SDNode<"ISD::FCOPYSIGN"  , SDTFPSignOp>;
+
+def sint_to_fp : SDNode<"ISD::SINT_TO_FP" , SDTIntToFPOp>;
+def uint_to_fp : SDNode<"ISD::UINT_TO_FP" , SDTIntToFPOp>;
+def fp_to_sint : SDNode<"ISD::FP_TO_SINT" , SDTFPToIntOp>;
+def fp_to_uint : SDNode<"ISD::FP_TO_UINT" , SDTFPToIntOp>;
+
+def setcc      : SDNode<"ISD::SETCC"      , SDTSetCC>;
+def select     : SDNode<"ISD::SELECT"     , SDTSelect>;
+def selectcc   : SDNode<"ISD::SELECT_CC"  , SDTSelectCC>;
+
+def brcond     : SDNode<"ISD::BRCOND"     , SDTBrcond, [SDNPHasChain]>;
+def brind      : SDNode<"ISD::BRIND"      , SDTBrind,  [SDNPHasChain]>;
+def br         : SDNode<"ISD::BR"         , SDTBr,     [SDNPHasChain]>;
+def ret        : SDNode<"ISD::RET"        , SDTRet,    [SDNPHasChain]>;
+
+// Do not use ld, st directly. Use load, extload, sextload, zextload, store,
+// and truncstore (see below).
+def ld         : SDNode<"ISD::LOAD"       , SDTLoad,  [SDNPHasChain]>;
+def st         : SDNode<"ISD::STORE"      , SDTStore, [SDNPHasChain]>;
+def ist        : SDNode<"ISD::STORE"      , SDTIStore, [SDNPHasChain]>;
+
+def vector_shuffle : SDNode<"ISD::VECTOR_SHUFFLE", SDTVecShuffle, []>;
+def build_vector : SDNode<"ISD::BUILD_VECTOR", SDTypeProfile<1, 0, []>, []>;
+def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>,
+                              []>;
+def vector_extract : SDNode<"ISD::EXTRACT_VECTOR_ELT",
+    SDTypeProfile<1, 2, [SDTCisPtrTy<2>]>, []>;
+def vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT",
+    SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>;
+
+// Nodes for intrinsics.  Use the intrinsic itself and let tblgen use these
+// internally; do not reference these directly.
+def intrinsic_void : SDNode<"ISD::INTRINSIC_VOID", 
+                            SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+                            [SDNPHasChain]>;
+def intrinsic_w_chain : SDNode<"ISD::INTRINSIC_W_CHAIN", 
+                               SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>,
+                               [SDNPHasChain]>;
+def intrinsic_wo_chain : SDNode<"ISD::INTRINSIC_WO_CHAIN", 
+                                SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>, []>;
+
+
+//===----------------------------------------------------------------------===//
+// Selection DAG Condition Codes
+
+class CondCode; // ISD::CondCode enums
+def SETOEQ : CondCode; def SETOGT : CondCode;
+def SETOGE : CondCode; def SETOLT : CondCode; def SETOLE : CondCode;
+def SETONE : CondCode; def SETO   : CondCode; def SETUO  : CondCode;
+def SETUEQ : CondCode; def SETUGT : CondCode; def SETUGE : CondCode;
+def SETULT : CondCode; def SETULE : CondCode; def SETUNE : CondCode;
+
+def SETEQ : CondCode; def SETGT : CondCode; def SETGE : CondCode;
+def SETLT : CondCode; def SETLE : CondCode; def SETNE : CondCode;
+
+
+//===----------------------------------------------------------------------===//
+// Selection DAG Node Transformation Functions.
+//
+// This mechanism allows targets to manipulate nodes in the output DAG once a
+// match has been formed.  This is typically used to manipulate immediate
+// values.
+//
+class SDNodeXForm<SDNode opc, code xformFunction> {
+  SDNode Opcode = opc;
+  code XFormFunction = xformFunction;
+}
+
+def NOOP_SDNodeXForm : SDNodeXForm<imm, [{}]>;
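+
+// As an illustration only (this transform is not referenced elsewhere in this
+// file, and getI32Imm is a helper that targets conventionally define in their
+// instruction selector), a target could extract the low 16 bits of an
+// immediate with:
+//
+//   def LO16 : SDNodeXForm<imm, [{
+//     return getI32Imm((unsigned short)N->getValue());
+//   }]>;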
+
+
+//===----------------------------------------------------------------------===//
+// Selection DAG Pattern Fragments.
+//
+// Pattern fragments are reusable chunks of dags that match specific things.
+// They can take arguments and have C++ predicates that control whether they
+// match.  They are intended to make the patterns for common instructions more
+// compact and readable.
+//
+
+/// PatFrag - Represents a pattern fragment.  This can match something on the
+/// DAG, from a single node to multiple nested other fragments.
+///
+class PatFrag<dag ops, dag frag, code pred = [{}],
+              SDNodeXForm xform = NOOP_SDNodeXForm> {
+  dag Operands = ops;
+  dag Fragment = frag;
+  code Predicate = pred;
+  SDNodeXForm OperandTransform = xform;
+}
+
+// PatLeaf's are pattern fragments that have no operands.  This is just a helper
+// to define immediates and other common things concisely.
+class PatLeaf<dag frag, code pred = [{}], SDNodeXForm xform = NOOP_SDNodeXForm>
+ : PatFrag<(ops), frag, pred, xform>;
+
+// Leaf fragments.
+
+def vtInt      : PatLeaf<(vt),  [{ return MVT::isInteger(N->getVT()); }]>;
+def vtFP       : PatLeaf<(vt),  [{ return MVT::isFloatingPoint(N->getVT()); }]>;
+
+def immAllOnes : PatLeaf<(imm), [{ return N->isAllOnesValue(); }]>;
+def immAllOnesV: PatLeaf<(build_vector), [{
+  return ISD::isBuildVectorAllOnes(N);
+}]>;
+def immAllZerosV: PatLeaf<(build_vector), [{
+  return ISD::isBuildVectorAllZeros(N);
+}]>;
+
+def immAllOnesV_bc: PatLeaf<(bitconvert), [{
+  return ISD::isBuildVectorAllOnes(N);
+}]>;
+
+
+// Other helper fragments.
+def not  : PatFrag<(ops node:$in), (xor node:$in, immAllOnes)>;
+def vnot : PatFrag<(ops node:$in), (xor node:$in, immAllOnesV)>;
+def vnot_conv : PatFrag<(ops node:$in), (xor node:$in, immAllOnesV_bc)>;
+def ineg : PatFrag<(ops node:$in), (sub 0, node:$in)>;
+
+// load fragments.
+def load : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return LD->getExtensionType() == ISD::NON_EXTLOAD &&
+           LD->getAddressingMode() == ISD::UNINDEXED;
+  return false;
+}]>;
+
+// extending load fragments.
+def extloadi1  : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return LD->getExtensionType() == ISD::EXTLOAD &&
+           LD->getAddressingMode() == ISD::UNINDEXED &&
+           LD->getLoadedVT() == MVT::i1;
+  return false;
+}]>;
+def extloadi8  : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return LD->getExtensionType() == ISD::EXTLOAD &&
+           LD->getAddressingMode() == ISD::UNINDEXED &&
+           LD->getLoadedVT() == MVT::i8;
+  return false;
+}]>;
+def extloadi16 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return LD->getExtensionType() == ISD::EXTLOAD &&
+           LD->getAddressingMode() == ISD::UNINDEXED &&
+           LD->getLoadedVT() == MVT::i16;
+  return false;
+}]>;
+def extloadi32 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return LD->getExtensionType() == ISD::EXTLOAD &&
+           LD->getAddressingMode() == ISD::UNINDEXED &&
+           LD->getLoadedVT() == MVT::i32;
+  return false;
+}]>;
+def extloadf32 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return LD->getExtensionType() == ISD::EXTLOAD &&
+           LD->getAddressingMode() == ISD::UNINDEXED &&
+           LD->getLoadedVT() == MVT::f32;
+  return false;
+}]>;
+
+def sextloadi1  : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return LD->getExtensionType() == ISD::SEXTLOAD &&
+           LD->getAddressingMode() == ISD::UNINDEXED &&
+           LD->getLoadedVT() == MVT::i1;
+  return false;
+}]>;
+def sextloadi8  : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return LD->getExtensionType() == ISD::SEXTLOAD &&
+           LD->getAddressingMode() == ISD::UNINDEXED &&
+           LD->getLoadedVT() == MVT::i8;
+  return false;
+}]>;
+def sextloadi16 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return LD->getExtensionType() == ISD::SEXTLOAD &&
+           LD->getAddressingMode() == ISD::UNINDEXED &&
+           LD->getLoadedVT() == MVT::i16;
+  return false;
+}]>;
+def sextloadi32 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return LD->getExtensionType() == ISD::SEXTLOAD &&
+           LD->getAddressingMode() == ISD::UNINDEXED &&
+           LD->getLoadedVT() == MVT::i32;
+  return false;
+}]>;
+
+def zextloadi1  : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return LD->getExtensionType() == ISD::ZEXTLOAD &&
+           LD->getAddressingMode() == ISD::UNINDEXED &&
+           LD->getLoadedVT() == MVT::i1;
+  return false;
+}]>;
+def zextloadi8  : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return LD->getExtensionType() == ISD::ZEXTLOAD &&
+           LD->getAddressingMode() == ISD::UNINDEXED &&
+           LD->getLoadedVT() == MVT::i8;
+  return false;
+}]>;
+def zextloadi16 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return LD->getExtensionType() == ISD::ZEXTLOAD &&
+           LD->getAddressingMode() == ISD::UNINDEXED &&
+           LD->getLoadedVT() == MVT::i16;
+  return false;
+}]>;
+def zextloadi32 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return LD->getExtensionType() == ISD::ZEXTLOAD &&
+           LD->getAddressingMode() == ISD::UNINDEXED &&
+           LD->getLoadedVT() == MVT::i32;
+  return false;
+}]>;
+
+// store fragments.
+def store : PatFrag<(ops node:$val, node:$ptr),
+                    (st node:$val, node:$ptr), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+    return !ST->isTruncatingStore() &&
+           ST->getAddressingMode() == ISD::UNINDEXED;
+  return false;
+}]>;
+
+// truncstore fragments.
+def truncstorei1 : PatFrag<(ops node:$val, node:$ptr),
+                           (st node:$val, node:$ptr), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+    return ST->isTruncatingStore() && ST->getStoredVT() == MVT::i1 &&
+           ST->getAddressingMode() == ISD::UNINDEXED;
+  return false;
+}]>;
+def truncstorei8 : PatFrag<(ops node:$val, node:$ptr),
+                           (st node:$val, node:$ptr), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+    return ST->isTruncatingStore() && ST->getStoredVT() == MVT::i8 &&
+           ST->getAddressingMode() == ISD::UNINDEXED;
+  return false;
+}]>;
+def truncstorei16 : PatFrag<(ops node:$val, node:$ptr),
+                            (st node:$val, node:$ptr), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+    return ST->isTruncatingStore() && ST->getStoredVT() == MVT::i16 &&
+           ST->getAddressingMode() == ISD::UNINDEXED;
+  return false;
+}]>;
+def truncstorei32 : PatFrag<(ops node:$val, node:$ptr),
+                            (st node:$val, node:$ptr), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+    return ST->isTruncatingStore() && ST->getStoredVT() == MVT::i32 &&
+           ST->getAddressingMode() == ISD::UNINDEXED;
+  return false;
+}]>;
+def truncstoref32 : PatFrag<(ops node:$val, node:$ptr),
+                            (st node:$val, node:$ptr), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+    return ST->isTruncatingStore() && ST->getStoredVT() == MVT::f32 &&
+           ST->getAddressingMode() == ISD::UNINDEXED;
+  return false;
+}]>;
+
+// indexed store fragments.
+def pre_store : PatFrag<(ops node:$val, node:$base, node:$offset),
+                        (ist node:$val, node:$base, node:$offset), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    ISD::MemIndexedMode AM = ST->getAddressingMode();
+    return (AM == ISD::PRE_INC || AM == ISD::PRE_DEC) &&
+           !ST->isTruncatingStore();
+  }
+  return false;
+}]>;
+
+def pre_truncsti1 : PatFrag<(ops node:$val, node:$base, node:$offset),
+                            (ist node:$val, node:$base, node:$offset), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    ISD::MemIndexedMode AM = ST->getAddressingMode();
+    return (AM == ISD::PRE_INC || AM == ISD::PRE_DEC) &&
+           ST->isTruncatingStore() && ST->getStoredVT() == MVT::i1;
+  }
+  return false;
+}]>;
+def pre_truncsti8 : PatFrag<(ops node:$val, node:$base, node:$offset),
+                            (ist node:$val, node:$base, node:$offset), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    ISD::MemIndexedMode AM = ST->getAddressingMode();
+    return (AM == ISD::PRE_INC || AM == ISD::PRE_DEC) &&
+           ST->isTruncatingStore() && ST->getStoredVT() == MVT::i8;
+  }
+  return false;
+}]>;
+def pre_truncsti16 : PatFrag<(ops node:$val, node:$base, node:$offset),
+                             (ist node:$val, node:$base, node:$offset), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    ISD::MemIndexedMode AM = ST->getAddressingMode();
+    return (AM == ISD::PRE_INC || AM == ISD::PRE_DEC) &&
+           ST->isTruncatingStore() && ST->getStoredVT() == MVT::i16;
+  }
+  return false;
+}]>;
+def pre_truncsti32 : PatFrag<(ops node:$val, node:$base, node:$offset),
+                             (ist node:$val, node:$base, node:$offset), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    ISD::MemIndexedMode AM = ST->getAddressingMode();
+    return (AM == ISD::PRE_INC || AM == ISD::PRE_DEC) &&
+           ST->isTruncatingStore() && ST->getStoredVT() == MVT::i32;
+  }
+  return false;
+}]>;
+def pre_truncstf32 : PatFrag<(ops node:$val, node:$base, node:$offset),
+                             (ist node:$val, node:$base, node:$offset), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    ISD::MemIndexedMode AM = ST->getAddressingMode();
+    return (AM == ISD::PRE_INC || AM == ISD::PRE_DEC) &&
+           ST->isTruncatingStore() && ST->getStoredVT() == MVT::f32;
+  }
+  return false;
+}]>;
+
+def post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
+                         (ist node:$val, node:$ptr, node:$offset), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    ISD::MemIndexedMode AM = ST->getAddressingMode();
+    return !ST->isTruncatingStore() &&
+            (AM == ISD::POST_INC || AM == ISD::POST_DEC);
+  }
+  return false;
+}]>;
+
+def post_truncsti1 : PatFrag<(ops node:$val, node:$base, node:$offset),
+                             (ist node:$val, node:$base, node:$offset), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    ISD::MemIndexedMode AM = ST->getAddressingMode();
+    return (AM == ISD::POST_INC || AM == ISD::POST_DEC) &&
+           ST->isTruncatingStore() && ST->getStoredVT() == MVT::i1;
+  }
+  return false;
+}]>;
+def post_truncsti8 : PatFrag<(ops node:$val, node:$base, node:$offset),
+                             (ist node:$val, node:$base, node:$offset), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    ISD::MemIndexedMode AM = ST->getAddressingMode();
+    return (AM == ISD::POST_INC || AM == ISD::POST_DEC) &&
+           ST->isTruncatingStore() && ST->getStoredVT() == MVT::i8;
+  }
+  return false;
+}]>;
+def post_truncsti16 : PatFrag<(ops node:$val, node:$base, node:$offset),
+                              (ist node:$val, node:$base, node:$offset), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    ISD::MemIndexedMode AM = ST->getAddressingMode();
+    return (AM == ISD::POST_INC || AM == ISD::POST_DEC) &&
+           ST->isTruncatingStore() && ST->getStoredVT() == MVT::i16;
+  }
+  return false;
+}]>;
+def post_truncsti32 : PatFrag<(ops node:$val, node:$base, node:$offset),
+                              (ist node:$val, node:$base, node:$offset), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    ISD::MemIndexedMode AM = ST->getAddressingMode();
+    return (AM == ISD::POST_INC || AM == ISD::POST_DEC) &&
+           ST->isTruncatingStore() && ST->getStoredVT() == MVT::i32;
+  }
+  return false;
+}]>;
+def post_truncstf32 : PatFrag<(ops node:$val, node:$base, node:$offset),
+                              (ist node:$val, node:$base, node:$offset), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    ISD::MemIndexedMode AM = ST->getAddressingMode();
+    return (AM == ISD::POST_INC || AM == ISD::POST_DEC) &&
+           ST->isTruncatingStore() && ST->getStoredVT() == MVT::f32;
+  }
+  return false;
+}]>;
+
+// setcc convenience fragments.
+def setoeq : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETOEQ)>;
+def setogt : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETOGT)>;
+def setoge : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETOGE)>;
+def setolt : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETOLT)>;
+def setole : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETOLE)>;
+def setone : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETONE)>;
+def seto   : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETO)>;
+def setuo  : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETUO)>;
+def setueq : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETUEQ)>;
+def setugt : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETUGT)>;
+def setuge : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETUGE)>;
+def setult : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETULT)>;
+def setule : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETULE)>;
+def setune : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETUNE)>;
+def seteq  : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETEQ)>;
+def setgt  : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETGT)>;
+def setge  : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETGE)>;
+def setlt  : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETLT)>;
+def setle  : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETLE)>;
+def setne  : PatFrag<(ops node:$lhs, node:$rhs),
+                     (setcc node:$lhs, node:$rhs, SETNE)>;
+
+//===----------------------------------------------------------------------===//
+// Selection DAG Pattern Support.
+//
+// Patterns are what are actually matched against the target-flavored
+// instruction selection DAG.  Instructions defined by the target implicitly
+// define patterns in most cases, but patterns can also be explicitly added when
+// an operation is defined by a sequence of instructions (e.g. loading a large
+// immediate value on RISC targets that do not support immediates as large as
+// their GPRs).
+//
+
+class Pattern<dag patternToMatch, list<dag> resultInstrs> {
+  dag             PatternToMatch  = patternToMatch;
+  list<dag>       ResultInstrs    = resultInstrs;
+  list<Predicate> Predicates      = [];  // See class Instruction in Target.td.
+  int             AddedComplexity = 0;  // See class Instruction in Target.td.
+}
+
+// Pat - A simple (but common) form of a pattern, which produces a simple result
+// not needing a full list.
+class Pat<dag pattern, dag result> : Pattern<pattern, [result]>;
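+
+// Purely as an illustration (GPR and RSBri below are hypothetical names, not
+// defined in this file), a target with a reverse-subtract-immediate
+// instruction could select integer negation with an anonymous pattern:
+//
+//   def : Pat<(ineg GPR:$src),
+//             (RSBri GPR:$src, 0)>;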
+
+//===----------------------------------------------------------------------===//
+// Complex pattern definitions.
+//
+// Complex patterns, e.g. the X86 addressing mode, require pattern matching
+// code in C++.  NumOperands is the number of operands returned by the select
+// function; SelectFunc is the name of the function used to match the maximal
+// pattern; RootNodes are the list of possible root nodes of the sub-dags to
+// match.
+// e.g. X86 addressing mode - def addr : ComplexPattern<4, "SelectAddr", [add]>;
+//
+class ComplexPattern<ValueType ty, int numops, string fn,
+                     list<SDNode> roots = [], list<SDNodeProperty> props = []> {
+  ValueType Ty = ty;
+  int NumOperands = numops;
+  string SelectFunc = fn;
+  list<SDNode> RootNodes = roots;
+  list<SDNodeProperty> Properties = props;
+}
+
+//===----------------------------------------------------------------------===//
+// Dwarf support.
+//
+def SDT_dwarf_loc : SDTypeProfile<0, 3,
+                      [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
+def dwarf_loc : SDNode<"ISD::DEBUG_LOC", SDT_dwarf_loc,[SDNPHasChain]>;
+
+
+
diff --git a/lib/Target/TargetSubtarget.cpp b/lib/Target/TargetSubtarget.cpp
new file mode 100644
index 0000000..a8bb9b9
--- /dev/null
+++ b/lib/Target/TargetSubtarget.cpp
@@ -0,0 +1,22 @@
+//===-- TargetSubtarget.cpp - General Subtarget Information ----------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Nate Begeman and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the general parts of a Subtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetSubtarget.h"
+using namespace llvm;
+
+//---------------------------------------------------------------------------
+// TargetSubtarget Class
+//
+TargetSubtarget::TargetSubtarget() {}
+
+TargetSubtarget::~TargetSubtarget() {}
diff --git a/lib/Target/X86/Makefile b/lib/Target/X86/Makefile
new file mode 100644
index 0000000..5416cdb
--- /dev/null
+++ b/lib/Target/X86/Makefile
@@ -0,0 +1,20 @@
+##===- lib/Target/X86/Makefile -----------------------------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file was developed by the LLVM research group and is distributed under
+# the University of Illinois Open Source License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMX86
+TARGET = X86
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = X86GenRegisterInfo.h.inc X86GenRegisterNames.inc \
+                X86GenRegisterInfo.inc X86GenInstrNames.inc \
+                X86GenInstrInfo.inc X86GenAsmWriter.inc \
+                X86GenAsmWriter1.inc X86GenDAGISel.inc  \
+                X86GenCallingConv.inc X86GenSubtarget.inc
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/README-FPStack.txt b/lib/Target/X86/README-FPStack.txt
new file mode 100644
index 0000000..d94fa02
--- /dev/null
+++ b/lib/Target/X86/README-FPStack.txt
@@ -0,0 +1,99 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: FP stack related stuff
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+Some targets (e.g. Athlons) prefer ffreep to fstp ST(0):
+http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
+
+//===---------------------------------------------------------------------===//
+
+On darwin/x86, we should codegen:
+
+        ret double 0.000000e+00
+
+as fld0/ret, not as:
+
+        movl $0, 4(%esp)
+        movl $0, (%esp)
+        fldl (%esp)
+	...
+        ret
+
+//===---------------------------------------------------------------------===//
+
+This should use fiadd on chips where it is profitable:
+double foo(double P, int *I) { return P+*I; }
+
+We have fiadd patterns now, but the following two patterns have the same cost
+and complexity.  We need a way to specify that the latter is more profitable.
+
+def FpADD32m  : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
+                    [(set RFP:$dst, (fadd RFP:$src1,
+                                     (extloadf64f32 addr:$src2)))]>;
+                // ST(0) = ST(0) + [mem32]
+
+def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
+                    [(set RFP:$dst, (fadd RFP:$src1,
+                                     (X86fild addr:$src2, i32)))]>;
+                // ST(0) = ST(0) + [mem32int]
+
+//===---------------------------------------------------------------------===//
+
+The FP stackifier needs to be global.  Also, it should handle simple
+permutations to reduce the number of shuffle instructions, e.g. turning:
+
+fld P	->		fld Q
+fld Q			fld P
+fxch
+
+or:
+
+fxch	->		fucomi
+fucomi			jl X
+jg X
+
+Ideas:
+http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
+
+
+//===---------------------------------------------------------------------===//
+
+Add a target specific hook to DAG combiner to handle SINT_TO_FP and
+FP_TO_SINT when the source operand is already in memory.
+
+//===---------------------------------------------------------------------===//
+
+Open code rint,floor,ceil,trunc:
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
+
+Open code the sincos[f] libcall.
+
+//===---------------------------------------------------------------------===//
+
+None of the FPStack instructions are handled in
+X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
+folding spill code into the instructions.
+
+//===---------------------------------------------------------------------===//
+
+Currently the x86 codegen isn't very good at mixing SSE and FPStack
+code:
+
+unsigned int foo(double x) { return x; }
+
+foo:
+	subl $20, %esp
+	movsd 24(%esp), %xmm0
+	movsd %xmm0, 8(%esp)
+	fldl 8(%esp)
+	fisttpll (%esp)
+	movl (%esp), %eax
+	addl $20, %esp
+	ret
+
+This will be solved when we go to a dynamic programming based isel.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README-MMX.txt b/lib/Target/X86/README-MMX.txt
new file mode 100644
index 0000000..57c7c3f
--- /dev/null
+++ b/lib/Target/X86/README-MMX.txt
@@ -0,0 +1,69 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: MMX-specific stuff.
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+This:
+
+#include <mmintrin.h>
+
+__v2si qux(int A) {
+  return (__v2si){ 0, A };
+}
+
+is compiled into:
+
+_qux:
+        subl $28, %esp
+        movl 32(%esp), %eax
+        movd %eax, %mm0
+        movq %mm0, (%esp)
+        movl (%esp), %eax
+        movl %eax, 20(%esp)
+        movq %mm0, 8(%esp)
+        movl 12(%esp), %eax
+        movl %eax, 16(%esp)
+        movq 16(%esp), %mm0
+        addl $28, %esp
+        ret
+
+Yuck!
+
+GCC gives us:
+
+_qux:
+        subl    $12, %esp
+        movl    16(%esp), %eax
+        movl    20(%esp), %edx
+        movl    $0, (%eax)
+        movl    %edx, 4(%eax)
+        addl    $12, %esp
+        ret     $4
+
+//===---------------------------------------------------------------------===//
+
+int main() {
+  __m64 A[1] = { _mm_cvtsi32_si64(1)  };
+  __m64 B[1] = { _mm_cvtsi32_si64(10) };
+  __m64 sum = _mm_cvtsi32_si64(0);
+
+  sum = __builtin_ia32_paddq(__builtin_ia32_paddq(A[0], B[0]), sum);
+
+  printf("Sum = %d\n", _mm_cvtsi64_si32(sum));
+  return 0;
+}
+
+Generates:
+
+        movl $11, %eax
+###     movd %eax, %mm0
+###     movq %mm0, 8(%esp)
+###     movl 8(%esp), %eax
+        movl %eax, 4(%esp)
+        movl $_str, (%esp)
+        call L_printf$stub
+        xorl %eax, %eax
+        addl $28, %esp
+
+These instructions are unnecessary.
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
new file mode 100644
index 0000000..f4b54c4
--- /dev/null
+++ b/lib/Target/X86/README-SSE.txt
@@ -0,0 +1,629 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: SSE-specific stuff.
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+Expand libm rounding functions inline:  Significant speedups possible.
+http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
+
+//===---------------------------------------------------------------------===//
+
+When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
+other fast SSE modes.
+
+//===---------------------------------------------------------------------===//
+
+Think about doing i64 math in SSE regs.
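+
+A hypothetical illustration: with SSE2,
+
+long long add64(long long a, long long b) { return a + b; }
+
+could be a single paddq instead of the usual addl/adcl pair, provided the
+operands already live in (or are cheap to move into) XMM registers.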
+
+//===---------------------------------------------------------------------===//
+
+This testcase should have no SSE instructions in it, and only one load from
+a constant pool:
+
+double %test3(bool %B) {
+        %C = select bool %B, double 123.412, double 523.01123123
+        ret double %C
+}
+
+Currently, the select is being lowered, which prevents the dag combiner from
+turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
+
+The pattern isel got this one right.
+
+//===---------------------------------------------------------------------===//
+
+SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
+like this:
+
+  X += y
+
+and the register allocator decides to spill X, it is cheaper to emit this as:
+
+Y += [xslot]
+store Y -> [xslot]
+
+than as:
+
+tmp = [xslot]
+tmp += y
+store tmp -> [xslot]
+
+..and this uses one fewer register (so this should be done at load folding
+time, not at spiller time).  *Note* however that this can only be done
+if Y is dead.  Here's a testcase:
+
+%.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]
+implementation   ; Functions:
+declare void %printf(int, ...)
+void %main() {
+build_tree.exit:
+        br label %no_exit.i7
+no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
+        %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]      ; <double> [#uses=1]
+        %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]     ; <double> [#uses=1]
+        %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
+        %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
+        br bool false, label %Compute_Tree.exit23, label %no_exit.i7
+Compute_Tree.exit23:            ; preds = %no_exit.i7
+        tail call void (int, ...)* %printf( int 0 )
+        store double %tmp.34.i18, double* null
+        ret void
+}
+
+We currently emit:
+
+.BBmain_1:
+        xorpd %XMM1, %XMM1
+        addsd %XMM0, %XMM1
+***     movsd %XMM2, QWORD PTR [%ESP + 8]
+***     addsd %XMM2, %XMM1
+***     movsd QWORD PTR [%ESP + 8], %XMM2
+        jmp .BBmain_1   # no_exit.i7
+
+This is a bugpoint-reduced testcase, which is why the testcase doesn't make
+much sense (e.g. it's an infinite loop). :)
+
+//===---------------------------------------------------------------------===//
+
+SSE should implement 'select_cc' using 'emulated conditional moves' that use
+pcmp/pand/pandn/por to do a selection instead of a conditional branch:
+
+double %X(double %Y, double %Z, double %A, double %B) {
+        %C = setlt double %A, %B
+        %z = add double %Z, 0.0    ;; select operand is not a load
+        %D = select bool %C, double %Y, double %z
+        ret double %D
+}
+
+We currently emit:
+
+_X:
+        subl $12, %esp
+        xorpd %xmm0, %xmm0
+        addsd 24(%esp), %xmm0
+        movsd 32(%esp), %xmm1
+        movsd 16(%esp), %xmm2
+        ucomisd 40(%esp), %xmm1
+        jb LBB_X_2
+LBB_X_1:
+        movsd %xmm0, %xmm2
+LBB_X_2:
+        movsd %xmm2, (%esp)
+        fldl (%esp)
+        addl $12, %esp
+        ret
+
+//===---------------------------------------------------------------------===//
+
+It's not clear whether we should use pxor or xorps / xorpd to clear XMM
+registers. The choice may depend on subtarget information. We should do some
+more experiments on different x86 machines.
+
+//===---------------------------------------------------------------------===//
+
+Currently the x86 codegen isn't very good at mixing SSE and FPStack
+code:
+
+unsigned int foo(double x) { return x; }
+
+foo:
+	subl $20, %esp
+	movsd 24(%esp), %xmm0
+	movsd %xmm0, 8(%esp)
+	fldl 8(%esp)
+	fisttpll (%esp)
+	movl (%esp), %eax
+	addl $20, %esp
+	ret
+
+This will be solved when we go to a dynamic programming based isel.
+
+//===---------------------------------------------------------------------===//
+
+Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
+feasible.
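+
+A hypothetical example of the kind of case this would catch (the struct and
+function below are illustrative, not from the tree):
+
+#include <string.h>
+
+struct vec8 { float v[8]; } __attribute__((aligned(16)));   /* 32 bytes */
+
+void copy32(struct vec8 *d, const struct vec8 *s) {
+  memcpy(d, s, sizeof *s);     /* could be two movaps load/store pairs */
+}
+
+For a small, known-size, suitably aligned copy like this, a couple of 128-bit
+moves should beat both a memcpy libcall and a rep/movsl sequence.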
+
+//===---------------------------------------------------------------------===//
+
+Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
+the reg-reg copy in this example:
+
+float foo(int *x, float *y, unsigned c) {
+  float res = 0.0;
+  unsigned i;
+  for (i = 0; i < c; i++) {
+    float xx = (float)x[i];
+    xx = xx * y[i];
+    xx += res;
+    res = xx;
+  }
+  return res;
+}
+
+LBB_foo_3:      # no_exit
+        cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
+        mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
+        addss %XMM0, %XMM1
+        inc %ESI
+        cmp %ESI, %ECX
+****    movaps %XMM1, %XMM0
+        jb LBB_foo_3    # no_exit
+
+//===---------------------------------------------------------------------===//
+
+Codegen:
+  if (copysign(1.0, x) == copysign(1.0, y))
+into:
+  if (x^y & mask)
+when using SSE.
+
+//===---------------------------------------------------------------------===//
+
+Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
+of a v4sf value.
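+
+A hypothetical intrinsics-level example of the upper-half case (the lower-half
+case is the analogous _mm_loadl_pi):
+
+#include <xmmintrin.h>
+
+/* Replace only elements 2 and 3 of a v4sf from memory; ideally one movhps. */
+__m128 set_high_half(__m128 v, const float *p) {
+  return _mm_loadh_pi(v, (const __m64 *)p);
+}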
+
+//===---------------------------------------------------------------------===//
+
+Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
+Perhaps use pxor / xorp* to clear an XMM register first?
+
+//===---------------------------------------------------------------------===//
+
+How to decide when to use the "floating point version" of logical ops? Here are
+some code fragments:
+
+	movaps LCPI5_5, %xmm2
+	divps %xmm1, %xmm2
+	mulps %xmm2, %xmm3
+	mulps 8656(%ecx), %xmm3
+	addps 8672(%ecx), %xmm3
+	andps LCPI5_6, %xmm2
+	andps LCPI5_1, %xmm3
+	por %xmm2, %xmm3
+	movdqa %xmm3, (%edi)
+
+	movaps LCPI5_5, %xmm1
+	divps %xmm0, %xmm1
+	mulps %xmm1, %xmm3
+	mulps 8656(%ecx), %xmm3
+	addps 8672(%ecx), %xmm3
+	andps LCPI5_6, %xmm1
+	andps LCPI5_1, %xmm3
+	orps %xmm1, %xmm3
+	movaps %xmm3, 112(%esp)
+	movaps %xmm3, (%ebx)
+
+Due to some minor source change, the latter case ended up using orps and movaps
+instead of por and movdqa. Does it matter?
+
+//===---------------------------------------------------------------------===//
+
+X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
+to choose between movaps, movapd, and movdqa based on types of source and
+destination?
+
+How about andps, andpd, and pand? Do we really care about the type of the packed
+elements? If not, why not always use the "ps" variants which are likely to be
+shorter.
+
+//===---------------------------------------------------------------------===//
+
+External test Nurbs exposed some problems. Look for
+__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
+emits:
+
+        movaps    (%edx), %xmm2                                 #59.21
+        movaps    (%edx), %xmm5                                 #60.21
+        movaps    (%edx), %xmm4                                 #61.21
+        movaps    (%edx), %xmm3                                 #62.21
+        movl      40(%ecx), %ebp                                #69.49
+        shufps    $0, %xmm2, %xmm5                              #60.21
+        movl      100(%esp), %ebx                               #69.20
+        movl      (%ebx), %edi                                  #69.20
+        imull     %ebp, %edi                                    #69.49
+        addl      (%eax), %edi                                  #70.33
+        shufps    $85, %xmm2, %xmm4                             #61.21
+        shufps    $170, %xmm2, %xmm3                            #62.21
+        shufps    $255, %xmm2, %xmm2                            #63.21
+        lea       (%ebp,%ebp,2), %ebx                           #69.49
+        negl      %ebx                                          #69.49
+        lea       -3(%edi,%ebx), %ebx                           #70.33
+        shll      $4, %ebx                                      #68.37
+        addl      32(%ecx), %ebx                                #68.37
+        testb     $15, %bl                                      #91.13
+        jne       L_B1.24       # Prob 5%                       #91.13
+
+This is the llvm code after instruction scheduling:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+	%reg1078 = MOV32ri -3
+	%reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
+	%reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
+	%reg1080 = IMUL32rr %reg1079, %reg1037
+	%reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
+	%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
+	%reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
+	%reg1082 = SHL32ri %reg1038, 4
+	%reg1039 = ADD32rr %reg1036, %reg1082
+	%reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
+	%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
+	%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
+	%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
+	%reg1033 = SHUFPSrr %reg1083, %reg1083, 85
+	%reg1040 = MOV32rr %reg1039
+	%reg1084 = AND32ri8 %reg1039, 15
+	CMP32ri8 %reg1084, 0
+	JE mbb<cond_next204,0xa914d30>
+
+Still ok. After register allocation:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+	%EAX = MOV32ri -3
+	%EDX = MOV32rm <fi#3>, 1, %NOREG, 0
+	ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
+	%EDX = MOV32rm <fi#7>, 1, %NOREG, 0
+	%EDX = MOV32rm %EDX, 1, %NOREG, 40
+	IMUL32rr %EAX<def&use>, %EDX
+	%ESI = MOV32rm <fi#5>, 1, %NOREG, 0
+	%ESI = MOV32rm %ESI, 1, %NOREG, 0
+	MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
+	%EAX = LEA32r %ESI, 1, %EAX, -3
+	%ESI = MOV32rm <fi#7>, 1, %NOREG, 0
+	%ESI = MOV32rm %ESI, 1, %NOREG, 32
+	%EDI = MOV32rr %EAX
+	SHL32ri %EDI<def&use>, 4
+	ADD32rr %EDI<def&use>, %ESI
+	%XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
+	%XMM1 = MOVAPSrr %XMM0
+	SHUFPSrr %XMM1<def&use>, %XMM1, 170
+	%XMM2 = MOVAPSrr %XMM0
+	SHUFPSrr %XMM2<def&use>, %XMM2, 0
+	%XMM3 = MOVAPSrr %XMM0
+	SHUFPSrr %XMM3<def&use>, %XMM3, 255
+	SHUFPSrr %XMM0<def&use>, %XMM0, 85
+	%EBX = MOV32rr %EDI
+	AND32ri8 %EBX<def&use>, 15
+	CMP32ri8 %EBX, 0
+	JE mbb<cond_next204,0xa914d30>
+
+This looks really bad. The problem is that shufps is a destructive opcode: since
+the same value appears as operand two in more than one shufps, a number of
+copies result. Note that icc suffers from the same problem. Either the
+instruction selector should select pshufd, or the register allocator should
+perform the two-address to three-address transformation.
+
+It also exposes some other problems. See MOV32ri -3 and the spills.
+
+//===---------------------------------------------------------------------===//
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
+
+LLVM is producing bad code.
+
+LBB_main_4:	# cond_true44
+	addps %xmm1, %xmm2
+	subps %xmm3, %xmm2
+	movaps (%ecx), %xmm4
+	movaps %xmm2, %xmm1
+	addps %xmm4, %xmm1
+	addl $16, %ecx
+	incl %edx
+	cmpl $262144, %edx
+	movaps %xmm3, %xmm2
+	movaps %xmm4, %xmm3
+	jne LBB_main_4	# cond_true44
+
+There are two problems: 1) There is no need for two loop induction variables; we
+can compare against 262144 * 16. 2) A known register coalescer issue; we should
+be able to eliminate one of the movaps:
+
+	addps %xmm2, %xmm1    <=== Commute!
+	subps %xmm3, %xmm1
+	movaps (%ecx), %xmm4
+	movaps %xmm1, %xmm1   <=== Eliminate!
+	addps %xmm4, %xmm1
+	addl $16, %ecx
+	incl %edx
+	cmpl $262144, %edx
+	movaps %xmm3, %xmm2
+	movaps %xmm4, %xmm3
+	jne LBB_main_4	# cond_true44
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+
+__m128 test(float a) {
+  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
+}
+
+This compiles into:
+
+movss 4(%esp), %xmm1
+mulss %xmm1, %xmm1
+xorps %xmm0, %xmm0
+movss %xmm1, %xmm0
+ret
+
+Because mulss doesn't modify the top 3 elements, the top elements of 
+xmm1 are already zero'd.  We could compile this to:
+
+movss 4(%esp), %xmm0
+mulss %xmm0, %xmm0
+ret
+
+//===---------------------------------------------------------------------===//
+
+Here's a sick and twisted idea.  Consider code like this:
+
+__m128 test(__m128 a) {
+  float b = *(float*)&a;
+  ...
+  return _mm_set_ps(0.0, 0.0, 0.0, b);
+}
+
+This might compile to this code:
+
+movaps c(%esp), %xmm1
+xorps %xmm0, %xmm0
+movss %xmm1, %xmm0
+ret
+
+Now consider if the ... code caused xmm1 to get spilled.  This might produce
+this code:
+
+movaps c(%esp), %xmm1
+movaps %xmm1, c2(%esp)
+...
+
+xorps %xmm0, %xmm0
+movaps c2(%esp), %xmm1
+movss %xmm1, %xmm0
+ret
+
+However, since the reload is only used by these instructions, we could 
+"fold" it into the uses, producing something like this:
+
+movaps c(%esp), %xmm1
+movaps %xmm1, c2(%esp)
+...
+
+movss c2(%esp), %xmm0
+ret
+
+... saving two instructions.
+
+The basic idea is that a reload from a spill slot can, if only one 4-byte
+chunk is used, bring in 3 zeros and the one element instead of 4 elements.
+This can be used to simplify a variety of shuffle operations, where the
+elements are fixed zeros.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+#include <emmintrin.h>
+void test(__m128d *r, __m128d *A, double B) {
+  *r = _mm_loadl_pd(*A, &B);
+}
+
+We generate:
+
+	subl $12, %esp
+	movsd 24(%esp), %xmm0
+	movsd %xmm0, (%esp)
+	movl 20(%esp), %eax
+	movapd (%eax), %xmm0
+	movlpd (%esp), %xmm0
+	movl 16(%esp), %eax
+	movapd %xmm0, (%eax)
+	addl $12, %esp
+	ret
+
+icc generates:
+
+        movl      4(%esp), %edx                                 #3.6
+        movl      8(%esp), %eax                                 #3.6
+        movapd    (%eax), %xmm0                                 #4.22
+        movlpd    12(%esp), %xmm0                               #4.8
+        movapd    %xmm0, (%edx)                                 #4.3
+        ret                                                     #5.1
+
+So icc is smart enough to know that B is in memory so it doesn't load it and
+store it back to stack.
+
+//===---------------------------------------------------------------------===//
+
+__m128d test1( __m128d A, __m128d B) {
+  return _mm_shuffle_pd(A, B, 0x3);
+}
+
+compiles to
+
+shufpd $3, %xmm1, %xmm0
+
+Perhaps it's better to use unpckhpd instead?
+
+unpckhpd %xmm1, %xmm0
+
+Don't know if unpckhpd is faster. But it is shorter.
+
+//===---------------------------------------------------------------------===//
+
+This code generates ugly code, probably due to costs being off or something:
+
+void %test(float* %P, <4 x float>* %P2 ) {
+        %xFloat0.688 = load float* %P
+        %loadVector37.712 = load <4 x float>* %P2
+        %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
+        store <4 x float> %inFloat3.713, <4 x float>* %P2
+        ret void
+}
+
+Generates:
+
+_test:
+        pxor %xmm0, %xmm0
+        movd %xmm0, %eax        ;; EAX = 0!
+        movl 8(%esp), %ecx
+        movaps (%ecx), %xmm0
+        pinsrw $6, %eax, %xmm0
+        shrl $16, %eax          ;; EAX = 0 again!
+        pinsrw $7, %eax, %xmm0
+        movaps %xmm0, (%ecx)
+        ret
+
+It would be better to generate:
+
+_test:
+        movl 8(%esp), %ecx
+        movaps (%ecx), %xmm0
+	xor %eax, %eax
+        pinsrw $6, %eax, %xmm0
+        pinsrw $7, %eax, %xmm0
+        movaps %xmm0, (%ecx)
+        ret
+
+or use pxor (to make a zero vector) and shuffle (to insert it).
+
+//===---------------------------------------------------------------------===//
+
+Some useful information in the Apple Altivec / SSE Migration Guide:
+
+http://developer.apple.com/documentation/Performance/Conceptual/
+Accelerate_sse_migration/index.html
+
+e.g. SSE select using and, andnot, or. Various SSE compare translations.
+
+//===---------------------------------------------------------------------===//
+
+Add hooks to commute some CMPP operations.
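+
+A hypothetical example of why this matters: cmpps has no "greater than"
+predicate, so something like
+
+#include <xmmintrin.h>
+__m128 gt(__m128 a, __m128 b) { return _mm_cmpgt_ps(a, b); }
+
+presumably has to be emitted as cmpltps with the operands commuted.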
+
+//===---------------------------------------------------------------------===//
+
+Apply the same transformation that merged four floats into a single 128-bit
+load to loads from the constant pool.
+
+//===---------------------------------------------------------------------===//
+
+Floating point max / min are commutable when -enable-unsafe-fp-path is
+specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
+nodes which are selected to max / min instructions that are marked commutable.
+
+//===---------------------------------------------------------------------===//
+
+We should compile this:
+#include <xmmintrin.h>
+typedef union {
+  int i[4];
+  float f[4];
+  __m128 v;
+} vector4_t;
+void swizzle (const void *a, vector4_t * b, vector4_t * c) {
+  b->v = _mm_loadl_pi (b->v, (__m64 *) a);
+  c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1);
+}
+
+to:
+
+_swizzle:
+        movl    4(%esp), %eax
+        movl    8(%esp), %edx
+        movl    12(%esp), %ecx
+        movlps  (%eax), %xmm0
+        movlps  %xmm0, (%edx)
+        movlps  8(%eax), %xmm0
+        movlps  %xmm0, (%ecx)
+        ret
+
+not:
+
+swizzle:
+        movl 8(%esp), %eax
+        movaps (%eax), %xmm0
+        movl 4(%esp), %ecx
+        movlps (%ecx), %xmm0
+        movaps %xmm0, (%eax)
+        movl 12(%esp), %eax
+        movaps (%eax), %xmm0
+        movlps 8(%ecx), %xmm0
+        movaps %xmm0, (%eax)
+        ret
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+#include <emmintrin.h>
+__m128i test(long long i) { return _mm_cvtsi64x_si128(i); }
+
+Should turn into a single 'movq %rdi, %xmm0' instruction.  Instead, we 
+get this (on x86-64):
+
+_test:
+	movd %rdi, %xmm1
+	xorps %xmm0, %xmm0
+	movsd %xmm1, %xmm0
+	ret
+
+The LLVM IR is:
+
+target triple = "x86_64-apple-darwin8"
+define <2 x i64> @test(i64 %i) {
+entry:
+	%tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0	
+	%tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
+	ret <2 x i64> %tmp11
+}
+
+//===---------------------------------------------------------------------===//
+
+These functions should produce the same code:
+
+#include <emmintrin.h>
+
+typedef long long __m128i __attribute__ ((__vector_size__ (16)));
+
+int foo(__m128i* val) {
+  return __builtin_ia32_vec_ext_v4si(*val, 1);
+}
+int bar(__m128i* val) {
+  union vs {
+    __m128i *_v;
+    int* _s;
+  } v = {val};
+  return v._s[1];
+}
+
+We currently produce (with -m64):
+
+_foo:
+        pshufd $1, (%rdi), %xmm0
+        movd %xmm0, %eax
+        ret
+_bar:
+        movl 4(%rdi), %eax
+        ret
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt
new file mode 100644
index 0000000..191904a
--- /dev/null
+++ b/lib/Target/X86/README-X86-64.txt
@@ -0,0 +1,223 @@
+//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//
+
+Implement different PIC models? Right now we only support Mac OS X with small
+PIC code model.
+
+//===---------------------------------------------------------------------===//
+
+Make use of "Red Zone".
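+
+For reference, the x86-64 SysV ABI guarantees a 128-byte "red zone" below %rsp
+that leaf functions may use without adjusting the stack pointer. A hypothetical
+leaf function that could benefit:
+
+int leaf(int x) {
+  int tmp[4] = { x, x + 1, x + 2, x + 3 };   /* 16 bytes of locals */
+  return tmp[0] * tmp[3];
+}
+
+Its locals fit in the red zone, so the prologue/epilogue adjustment of %rsp
+could be dropped entirely.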
+
+//===---------------------------------------------------------------------===//
+
+Implement __int128 and long double support.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+extern void xx(void);
+void bar(void) {
+  xx();
+}
+
+gcc compiles to:
+
+.globl _bar
+_bar:
+	jmp	_xx
+
+We need to do the tailcall optimization as well.
+
+//===---------------------------------------------------------------------===//
+
+AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
+multiplication by a constant. How much of it applies to Intel's X86-64
+implementation? There are definite trade-offs to consider: latency vs. register
+pressure vs. code size.
+
+//===---------------------------------------------------------------------===//
+
+Are we better off using branches instead of cmove to implement FP to
+unsigned i64?
+
+_conv:
+	ucomiss	LC0(%rip), %xmm0
+	cvttss2siq	%xmm0, %rdx
+	jb	L3
+	subss	LC0(%rip), %xmm0
+	movabsq	$-9223372036854775808, %rax
+	cvttss2siq	%xmm0, %rdx
+	xorq	%rax, %rdx
+L3:
+	movq	%rdx, %rax
+	ret
+
+instead of
+
+_conv:
+	movss LCPI1_0(%rip), %xmm1
+	cvttss2siq %xmm0, %rcx
+	movaps %xmm0, %xmm2
+	subss %xmm1, %xmm2
+	cvttss2siq %xmm2, %rax
+	movabsq $-9223372036854775808, %rdx
+	xorq %rdx, %rax
+	ucomiss %xmm1, %xmm0
+	cmovb %rcx, %rax
+	ret
+
+It seems like the jb branch has a high likelihood of being taken. It would have
+saved a few instructions.
+
+//===---------------------------------------------------------------------===//
+
+Poor codegen:
+
+int X[2];
+int b;
+void test(void) {
+  memset(X, b, 2*sizeof(X[0]));
+}
+
+llc:
+	movq _b@GOTPCREL(%rip), %rax
+	movzbq (%rax), %rax
+	movq %rax, %rcx
+	shlq $8, %rcx
+	orq %rax, %rcx
+	movq %rcx, %rax
+	shlq $16, %rax
+	orq %rcx, %rax
+	movq %rax, %rcx
+	shlq $32, %rcx
+	movq _X@GOTPCREL(%rip), %rdx
+	orq %rax, %rcx
+	movq %rcx, (%rdx)
+	ret
+
+gcc:
+	movq	_b@GOTPCREL(%rip), %rax
+	movabsq	$72340172838076673, %rdx
+	movzbq	(%rax), %rax
+	imulq	%rdx, %rax
+	movq	_X@GOTPCREL(%rip), %rdx
+	movq	%rax, (%rdx)
+	ret
+
+//===---------------------------------------------------------------------===//
+
+The vararg function prologue can be further optimized. Currently all XMM
+registers are stored into the register save area. Most of these stores can be
+eliminated since the upper bound of the number of XMM registers used is passed
+in %al. gcc produces something like the following:
+
+	movzbl	%al, %edx
+	leaq	0(,%rdx,4), %rax
+	leaq	4+L2(%rip), %rdx
+	leaq	239(%rsp), %rax
+       	jmp	*%rdx
+	movaps	%xmm7, -15(%rax)
+	movaps	%xmm6, -31(%rax)
+	movaps	%xmm5, -47(%rax)
+	movaps	%xmm4, -63(%rax)
+	movaps	%xmm3, -79(%rax)
+	movaps	%xmm2, -95(%rax)
+	movaps	%xmm1, -111(%rax)
+	movaps	%xmm0, -127(%rax)
+L2:
+
+It jumps over the movaps that do not need to be stored. It is hard to see this
+being significant, as it adds 5 instructions (including an indirect branch) to
+avoid executing 0 to 8 stores in the function prologue.
+
+Perhaps we can optimize for the common case where no XMM registers are used for
+parameter passing, i.e. if %al == 0, jump over all stores. Or, in the case of a
+leaf function where we can determine that no XMM input parameter is needed,
+avoid emitting the stores at all.
+
+//===---------------------------------------------------------------------===//
+
+AMD64 has a complex calling convention for aggregate passing by value:
+
+1. If the size of an object is larger than two eightbytes, or in C++, is a non- 
+   POD structure or union type, or contains unaligned fields, it has class 
+   MEMORY.
+2. Both eightbytes get initialized to class NO_CLASS. 
+3. Each field of an object is classified recursively so that always two fields
+   are considered. The resulting class is calculated according to the classes
+   of the fields in the eightbyte: 
+   (a) If both classes are equal, this is the resulting class. 
+   (b) If one of the classes is NO_CLASS, the resulting class is the other 
+       class. 
+   (c) If one of the classes is MEMORY, the result is the MEMORY class. 
+   (d) If one of the classes is INTEGER, the result is the INTEGER. 
+   (e) If one of the classes is X87, X87UP, COMPLEX_X87 class, MEMORY is used as
+      class. 
+   (f) Otherwise class SSE is used. 
+4. Then a post merger cleanup is done: 
+   (a) If one of the classes is MEMORY, the whole argument is passed in memory. 
+   (b) If SSEUP is not preceded by SSE, it is converted to SSE.
+
+Currently the llvm frontend does not handle this correctly.
+
+Problem 1:
+    typedef struct { int i; double d; } QuadWordS;
+It is currently passed in two i64 integer registers. However, a gcc-compiled
+callee expects the second element 'd' to be passed in XMM0.
+
+Problem 2:
+    typedef struct { int32_t i; float j; double d; } QuadWordS;
+The first two fields together are the size of one i64, so they will be combined
+and passed in an integer register (RDI). The third field is still passed in XMM0.
+
+Problem 3:
+    typedef struct { int64_t i; int8_t j; int64_t d; } S;
+    void test(S s)
+The size of this aggregate is greater than two i64s, so it should be passed in
+memory. Currently llvm breaks this down and passes it in three integer
+registers.
+
+Problem 4:
+Taking problem 3 one step further, a function expects an aggregate value
+in memory followed by more parameter(s) passed in register(s).
+    void test(S s, int b)
+
+LLVM IR does not allow passing parameters as aggregates, so the frontend must
+break the aggregate value (in problems 3 and 4) into a number of scalar values:
+    void %test(long %s.i, byte %s.j, long %s.d);
+
+However, if the backend were to lower this code literally, it would pass the 3
+values in integer registers. To force them to be passed in memory, the frontend
+should change the function signature to:
+    void %test(long %undef1, long %undef2, long %undef3, long %undef4, 
+               long %undef5, long %undef6,
+               long %s.i, byte %s.j, long %s.d);
+And the call site would look something like this:
+    call void %test( undef, undef, undef, undef, undef, undef,
+                     %tmp.s.i, %tmp.s.j, %tmp.s.d );
+The first 6 undef parameters would exhaust the 6 integer registers used for
+parameter passing. The following three integer values would then be forced into
+memory.
+
+For problem 4, the scalar parameter 'b' would be moved to the front of the
+parameter list so it will be passed in a register:
+    void %test(int %b,
+               long %undef1, long %undef2, long %undef3, long %undef4, 
+               long %undef5, long %undef6,
+               long %s.i, byte %s.j, long %s.d);
+
+//===---------------------------------------------------------------------===//
+
+Right now the asm printer assumes GlobalAddress are accessed via RIP relative
+addressing. Therefore, it is not possible to generate this:
+        movabsq $__ZTV10polynomialIdE+16, %rax
+
+That is ok for now since we currently only support the small code model, so the
+above is selected as:
+        leaq __ZTV10polynomialIdE+16(%rip), %rax
+
+This is probably slightly slower but is much shorter than movabsq. However, if
+we were to support medium or larger code models, we would need to use the movabs
+instruction. We should probably introduce something like AbsoluteAddress to
+distinguish it from GlobalAddress so the asm printer and JIT code emitter can
+do the right thing.
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
new file mode 100644
index 0000000..f15090a
--- /dev/null
+++ b/lib/Target/X86/README.txt
@@ -0,0 +1,1150 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend.
+//===---------------------------------------------------------------------===//
+
+Missing features:
+  - Support for SSE4: http://www.intel.com/software/penryn
+http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
+  - support for 3DNow!
+  - weird abis?
+
+//===---------------------------------------------------------------------===//
+
+Add MUL2U and MUL2S nodes to represent a multiply that returns both the
+Hi and Lo parts (a combination of MUL and MULH[SU] into one node).  Add this to
+X86, and make the dag combiner produce it when needed.  This will eliminate one
+imul from the code generated for:
+
+long long test(long long X, long long Y) { return X*Y; }
+
+by using the EAX result from the mul.  We should add a similar node for
+DIVREM.
+
+another case is:
+
+long long test(int X, int Y) { return (long long)X*Y; }
+
+... which should only be one imul instruction.
+
+or:
+
+unsigned long long int t2(unsigned int a, unsigned int b) {
+       return (unsigned long long)a * b;
+}
+
+... which should be one mul instruction.
+
+
+This can be done with a custom expander, but it would be nice to move this to
+generic code.
+
+//===---------------------------------------------------------------------===//
+
+CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
+backend knows how to three-addressify this shift, but it appears the register
+allocator isn't even asking it to do so in this case.  We should investigate
+why this isn't happening, it could have significant impact on other important
+cases for X86 as well.
+
+//===---------------------------------------------------------------------===//
+
+This should be one DIV/IDIV instruction, not a libcall:
+
+unsigned test(unsigned long long X, unsigned Y) {
+        return X/Y;
+}
+
+This can be done trivially with a custom legalizer.  What about overflow 
+though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
+
+//===---------------------------------------------------------------------===//
+
+Improvements to the multiply -> shift/add algorithm:
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
+
+//===---------------------------------------------------------------------===//
+
+Improve code like this (occurs fairly frequently, e.g. in LLVM):
+long long foo(int x) { return 1LL << x; }
+
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
+
+Another useful one would be  ~0ULL >> X and ~0ULL << X.
+
+One better solution for 1LL << x is:
+        xorl    %eax, %eax
+        xorl    %edx, %edx
+        testb   $32, %cl
+        sete    %al
+        setne   %dl
+        sall    %cl, %eax
+        sall    %cl, %edx
+
+But that requires good 8-bit subreg support.
+
+64-bit shifts (in general) expand to really bad code.  Instead of using
+cmovs, we should expand to a conditional branch like GCC produces.
+
+//===---------------------------------------------------------------------===//
+
+Compile this:
+_Bool f(_Bool a) { return a!=1; }
+
+into:
+        movzbl  %dil, %eax
+        xorl    $1, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+Some isel ideas:
+
+1. A dynamic programming based approach when compile time is not an
+   issue.
+2. Code duplication (addressing mode) during isel.
+3. Other ideas from "Register-Sensitive Selection, Duplication, and
+   Sequencing of Instructions".
+4. Scheduling for reduced register pressure.  E.g. "Minimum Register 
+   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs" 
+   and other related papers.
+   http://citeseer.ist.psu.edu/govindarajan01minimum.html
+
+//===---------------------------------------------------------------------===//
+
+Should we promote i16 to i32 to avoid partial register update stalls?
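+
+A hypothetical example of the concern:
+
+short add16(short a, short b) { return a + b; }
+
+16-bit arithmetic like this writes only the low half of a register (e.g. %ax),
+and the resulting partial register update can stall on some processors; doing
+the math in 32 bits and truncating the result would avoid that.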
+
+//===---------------------------------------------------------------------===//
+
+Leave any_extend as pseudo instruction and hint to register
+allocator. Delay codegen until post register allocation.
+
+//===---------------------------------------------------------------------===//
+
+Count leading zeros and count trailing zeros:
+
+int clz(int X) { return __builtin_clz(X); }
+int ctz(int X) { return __builtin_ctz(X); }
+
+$ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
+clz:
+        bsr     %eax, DWORD PTR [%esp+4]
+        xor     %eax, 31
+        ret
+ctz:
+        bsf     %eax, DWORD PTR [%esp+4]
+        ret
+
+however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
+aren't.
+
+Another example (use predsimplify to eliminate a select):
+
+int foo (unsigned long j) {
+  if (j)
+    return __builtin_ffs (j) - 1;
+  else
+    return 0;
+}
+
+//===---------------------------------------------------------------------===//
+
+It appears icc uses push for parameter passing. Need to investigate.
+
+//===---------------------------------------------------------------------===//
+
+Only use inc/neg/not instructions on processors where they are faster than
+add/sub/xor.  They are slower on the P4 due to only updating some processor
+flags.
+
+//===---------------------------------------------------------------------===//
+
+The instruction selector sometimes misses folding a load into a compare.  The
+pattern is written as (cmp reg, (load p)).  Because the compare isn't 
+commutative, it is not matched with the load on both sides.  The dag combiner
+should be made smart enough to canonicalize the load into the RHS of a compare
+when it can invert the result of the compare for free.
+
+//===---------------------------------------------------------------------===//
+
+How about intrinsics? An example is:
+  *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
+
+compiles to
+	pmuludq (%eax), %xmm0
+	movl 8(%esp), %eax
+	movdqa (%eax), %xmm1
+	pmulhuw %xmm0, %xmm1
+
+The transformation probably requires a X86 specific pass or a DAG combiner
+target specific hook.
+
+//===---------------------------------------------------------------------===//
+
+In many cases, LLVM generates code like this:
+
+_test:
+        movl 8(%esp), %eax
+        cmpl %eax, 4(%esp)
+        setl %al
+        movzbl %al, %eax
+        ret
+
+on some processors (which ones?), it is more efficient to do this:
+
+_test:
+        movl 8(%esp), %ebx
+        xor  %eax, %eax
+        cmpl %ebx, 4(%esp)
+        setl %al
+        ret
+
+Doing this correctly is tricky though, as the xor clobbers the flags.
+
+//===---------------------------------------------------------------------===//
+
+We should generate bts/btr/etc instructions on targets where they are cheap or
+when codesize is important.  e.g., for:
+
+void setbit(int *target, int bit) {
+    *target |= (1 << bit);
+}
+void clearbit(int *target, int bit) {
+    *target &= ~(1 << bit);
+}
+
+//===---------------------------------------------------------------------===//
+
+Instead of the following for memset char*, 1, 10:
+
+	movl $16843009, 4(%edx)
+	movl $16843009, (%edx)
+	movw $257, 8(%edx)
+
+It might be better to generate
+
+	movl $16843009, %eax
+	movl %eax, 4(%edx)
+	movl %eax, (%edx)
+	movw %ax, 8(%edx)
+	
+when we can spare a register. It reduces code size.
+
+//===---------------------------------------------------------------------===//
+
+Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
+get this:
+
+int %test1(int %X) {
+        %Y = div int %X, 8
+        ret int %Y
+}
+
+_test1:
+        movl 4(%esp), %eax
+        movl %eax, %ecx
+        sarl $31, %ecx
+        shrl $29, %ecx
+        addl %ecx, %eax
+        sarl $3, %eax
+        ret
+
+GCC knows several different ways to codegen it, one of which is this:
+
+_test1:
+        movl    4(%esp), %eax
+        cmpl    $-1, %eax
+        leal    7(%eax), %ecx
+        cmovle  %ecx, %eax
+        sarl    $3, %eax
+        ret
+
+which is probably slower, but it's interesting at least :)
+
+//===---------------------------------------------------------------------===//
+
+The first BB of this code:
+
+declare bool %foo()
+int %bar() {
+        %V = call bool %foo()
+        br bool %V, label %T, label %F
+T:
+        ret int 1
+F:
+        call bool %foo()
+        ret int 12
+}
+
+compiles to:
+
+_bar:
+        subl $12, %esp
+        call L_foo$stub
+        xorb $1, %al
+        testb %al, %al
+        jne LBB_bar_2   # F
+
+It would be better to emit "cmp %al, 1" than a xor and test.
+
+//===---------------------------------------------------------------------===//
+
+Enable X86InstrInfo::convertToThreeAddress().
+
+//===---------------------------------------------------------------------===//
+
+We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
+We should leave these as libcalls for everything over a much lower threshold,
+since libc is hand tuned for medium and large mem ops (avoiding RFO for large
+stores, TLB preheating, etc.).
+
+//===---------------------------------------------------------------------===//
+
+Optimize this into something reasonable:
+ x * copysign(1.0, y) * copysign(1.0, z)
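+
+One possible "reasonable" form, sketched in C (hypothetical, and ignoring NaN
+corner cases): multiplying by copysign(1.0, y) only flips the sign of x, so the
+whole expression is x with its sign bit xor'd with signbit(y) ^ signbit(z).
+
+#include <string.h>
+double f(double x, double y, double z) {
+  unsigned long long xb, yb, zb;
+  memcpy(&xb, &x, 8); memcpy(&yb, &y, 8); memcpy(&zb, &z, 8);
+  xb ^= (yb ^ zb) & (1ULL << 63);      /* xor the two sign bits into x */
+  memcpy(&x, &xb, 8);
+  return x;
+}
+
+With SSE this is a few andpd/xorpd-style logical ops instead of two multiplies.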
+
+//===---------------------------------------------------------------------===//
+
+Optimize copysign(x, *y) to use an integer load from y.
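+
+A hypothetical sketch of the idea: all that is needed from *y is its sign bit,
+so an integer load of y's bits avoids bringing the whole double into an FP/SSE
+register just to mask it there.
+
+#include <string.h>
+double copysign_mem(double x, const double *y) {
+  unsigned long long xb, yb;
+  memcpy(&xb, &x, 8);
+  memcpy(&yb, y, 8);                            /* integer load of y */
+  xb = (xb & ~(1ULL << 63)) | (yb & (1ULL << 63));
+  memcpy(&x, &xb, 8);
+  return x;
+}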
+
+//===---------------------------------------------------------------------===//
+
+%X = weak global int 0
+
+void %foo(int %N) {
+	%N = cast int %N to uint
+	%tmp.24 = setgt int %N, 0
+	br bool %tmp.24, label %no_exit, label %return
+
+no_exit:
+	%indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
+	%i.0.0 = cast uint %indvar to int
+	volatile store int %i.0.0, int* %X
+	%indvar.next = add uint %indvar, 1
+	%exitcond = seteq uint %indvar.next, %N
+	br bool %exitcond, label %return, label %no_exit
+
+return:
+	ret void
+}
+
+compiles into:
+
+	.text
+	.align	4
+	.globl	_foo
+_foo:
+	movl 4(%esp), %eax
+	cmpl $1, %eax
+	jl LBB_foo_4	# return
+LBB_foo_1:	# no_exit.preheader
+	xorl %ecx, %ecx
+LBB_foo_2:	# no_exit
+	movl L_X$non_lazy_ptr, %edx
+	movl %ecx, (%edx)
+	incl %ecx
+	cmpl %eax, %ecx
+	jne LBB_foo_2	# no_exit
+LBB_foo_3:	# return.loopexit
+LBB_foo_4:	# return
+	ret
+
+We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
+rematerialization is implemented. This can be accomplished with 1) a target
+dependent LICM pass or 2) making the SelectionDAG represent the whole function.
+
+//===---------------------------------------------------------------------===//
+
+The following tests perform worse with LSR:
+
+lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
+
+//===---------------------------------------------------------------------===//
+
+We are generating far worse code than gcc:
+
+volatile short X, Y;
+
+void foo(int N) {
+  int i;
+  for (i = 0; i < N; i++) { X = i; Y = i*4; }
+}
+
+LBB1_1:	#bb.preheader
+	xorl %ecx, %ecx
+	xorw %dx, %dx
+LBB1_2:	#bb
+	movl L_X$non_lazy_ptr, %esi
+	movw %dx, (%esi)
+	movw %dx, %si
+	shlw $2, %si
+	movl L_Y$non_lazy_ptr, %edi
+	movw %si, (%edi)
+	incl %ecx
+	incw %dx
+	cmpl %eax, %ecx
+	jne LBB1_2	#bb
+
+vs.
+
+	xorl	%edx, %edx
+	movl	L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
+	movl	L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
+L4:
+	movw	%dx, (%esi)
+	leal	0(,%edx,4), %eax
+	movw	%ax, (%ecx)
+	addl	$1, %edx
+	cmpl	%edx, %edi
+	jne	L4
+
+There are 3 issues:
+
+1. Lack of post regalloc LICM.
+2. Poor sub-regclass support. That leads to an inability to promote the 16-bit
+   arithmetic op to 32-bit and make use of leal.
+3. LSR is unable to reuse an IV for a different type (i16 vs. i32) even though
+   the cast would be free.
+
+//===---------------------------------------------------------------------===//
+
+Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
+FR64 to VR128.
+
+//===---------------------------------------------------------------------===//
+
+mov $reg, 48(%esp)
+...
+leal 48(%esp), %eax
+mov %eax, (%esp)
+call _foo
+
+Obviously it would have been better for the first mov (or any op) to store
+directly %esp[0] if there are no other uses.
+
+//===---------------------------------------------------------------------===//
+
+Adding to the list of cmp / test poor codegen issues:
+
+int test(__m128 *A, __m128 *B) {
+  if (_mm_comige_ss(*A, *B))
+    return 3;
+  else
+    return 4;
+}
+
+_test:
+	movl 8(%esp), %eax
+	movaps (%eax), %xmm0
+	movl 4(%esp), %eax
+	movaps (%eax), %xmm1
+	comiss %xmm0, %xmm1
+	setae %al
+	movzbl %al, %ecx
+	movl $3, %eax
+	movl $4, %edx
+	cmpl $0, %ecx
+	cmove %edx, %eax
+	ret
+
+Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
+are a number of issues. 1) We are introducing a setcc between the result of the
+intrinsic call and the select. 2) The intrinsic is expected to produce an i32
+value, so an any extend (which becomes a zero extend) is added.
+
+We probably need some kind of target DAG combine hook to fix this.
+
+//===---------------------------------------------------------------------===//
+
+We generate significantly worse code for this than GCC:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
+http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
+
+There is also one case we do worse on PPC.
+
+//===---------------------------------------------------------------------===//
+
+If shorter, we should use things like:
+movzwl %ax, %eax
+instead of:
+andl $65535, %EAX
+
+The former can also be used when the two-addressy nature of the 'and' would
+require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
+
+//===---------------------------------------------------------------------===//
+
+Bad codegen:
+
+char foo(int x) { return x; }
+
+_foo:
+	movl 4(%esp), %eax
+	shll $24, %eax
+	sarl $24, %eax
+	ret
+
+SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of 
+sub-registers.
+
+//===---------------------------------------------------------------------===//
+
+Consider this:
+
+typedef struct pair { float A, B; } pair;
+void pairtest(pair P, float *FP) {
+        *FP = P.A+P.B;
+}
+
+We currently generate this code with llvmgcc4:
+
+_pairtest:
+        movl 8(%esp), %eax
+        movl 4(%esp), %ecx
+        movd %eax, %xmm0
+        movd %ecx, %xmm1
+        addss %xmm0, %xmm1
+        movl 12(%esp), %eax
+        movss %xmm1, (%eax)
+        ret
+
+we should be able to generate:
+_pairtest:
+        movss 4(%esp), %xmm0
+        movl 12(%esp), %eax
+        addss 8(%esp), %xmm0
+        movss %xmm0, (%eax)
+        ret
+
+The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
+integer chunks.  It does this so that structs like {short,short} are passed in
+a single 32-bit integer stack slot.  We should handle the safe cases above much
+more nicely, while still handling the hard cases.
+
+While true in general, in this specific case we could do better by promoting
+load int + bitcast to float -> load float.  This basically needs alignment info;
+the code is already implemented (but disabled) in the dag combiner.
+
+//===---------------------------------------------------------------------===//
+
+Another instruction selector deficiency:
+
+void %bar() {
+	%tmp = load int (int)** %foo
+	%tmp = tail call int %tmp( int 3 )
+	ret void
+}
+
+_bar:
+	subl $12, %esp
+	movl L_foo$non_lazy_ptr, %eax
+	movl (%eax), %eax
+	call *%eax
+	addl $12, %esp
+	ret
+
+The current isel scheme will not allow the load to be folded in the call since
+the load's chain result is read by the callseq_start.
+
+//===---------------------------------------------------------------------===//
+
+Don't forget to find a way to squash noop truncates in the JIT environment.
+
+//===---------------------------------------------------------------------===//
+
+Implement anyext in the same manner as truncate that would allow them to be
+eliminated.
+
+//===---------------------------------------------------------------------===//
+
+How about implementing truncate / anyext as a property of machine instruction
+operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
+Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
+For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+int test(int a)
+{
+  return a * 3;
+}
+
+We currently emit:
+	imull $3, 4(%esp), %eax
+
+Perhaps this is what we really should generate? Is imull three or four
+cycles? Note: ICC generates this:
+	movl	4(%esp), %eax
+	leal	(%eax,%eax,2), %eax
+
+The current instruction priority is based on pattern complexity. The former is
+more "complex" because it folds a load so the latter will not be emitted.
+
+Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
+should always try to match LEA first since the LEA matching code does some
+estimate to determine whether the match is profitable.
+
+However, if we care more about code size, then imull is better. It's two bytes
+shorter than movl + leal.
+
+//===---------------------------------------------------------------------===//
+
+Implement CTTZ, CTLZ with bsf and bsr.
+
+//===---------------------------------------------------------------------===//
+
+It appears gcc places string data with linkonce linkage in
+.section __TEXT,__const_coal,coalesced instead of
+.section __DATA,__const_coal,coalesced.
+Take a look at darwin.h; there are other Darwin assembler directives that we
+do not make use of.
+
+//===---------------------------------------------------------------------===//
+
+int %foo(int* %a, int %t) {
+entry:
+        br label %cond_true
+
+cond_true:              ; preds = %cond_true, %entry
+        %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]  
+        %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
+        %tmp2 = getelementptr int* %a, int %x.0.0              
+        %tmp3 = load int* %tmp2         ; <int> [#uses=1]
+        %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
+        %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
+        %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
+        %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
+        br bool %tmp, label %bb12, label %cond_true
+
+bb12:           ; preds = %cond_true
+        ret int %tmp7
+}
+
+is pessimized by -loop-reduce and -indvars
+
+//===---------------------------------------------------------------------===//
+
+u32 to float conversion improvement:
+
+float uint32_2_float( unsigned u ) {
+  float fl = (int) (u & 0xffff);
+  float fh = (int) (u >> 16);
+  fh *= 0x1.0p16f;
+  return fh + fl;
+}
+
+00000000        subl    $0x04,%esp
+00000003        movl    0x08(%esp,1),%eax
+00000007        movl    %eax,%ecx
+00000009        shrl    $0x10,%ecx
+0000000c        cvtsi2ss        %ecx,%xmm0
+00000010        andl    $0x0000ffff,%eax
+00000015        cvtsi2ss        %eax,%xmm1
+00000019        mulss   0x00000078,%xmm0
+00000021        addss   %xmm1,%xmm0
+00000025        movss   %xmm0,(%esp,1)
+0000002a        flds    (%esp,1)
+0000002d        addl    $0x04,%esp
+00000030        ret
+
+//===---------------------------------------------------------------------===//
+
+When using fastcc abi, align stack slot of argument of type double on 8 byte
+boundary to improve performance.
+
+//===---------------------------------------------------------------------===//
+
+Codegen:
+
+int f(int a, int b) {
+  if (a == 4 || a == 6)
+    b++;
+  return b;
+}
+
+
+as:
+
+or eax, 2
+cmp eax, 6
+jz label
+
+//===---------------------------------------------------------------------===//
+
+GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
+simplifications for integer "x cmp y ? a : b".  For example, instead of:
+
+int G;
+void f(int X, int Y) {
+  G = X < 0 ? 14 : 13;
+}
+
+compiling to:
+
+_f:
+        movl $14, %eax
+        movl $13, %ecx
+        movl 4(%esp), %edx
+        testl %edx, %edx
+        cmovl %eax, %ecx
+        movl %ecx, _G
+        ret
+
+it could be:
+_f:
+        movl    4(%esp), %eax
+        sarl    $31, %eax
+        notl    %eax
+        addl    $14, %eax
+        movl    %eax, _G
+        ret
+
+etc.
+
+//===---------------------------------------------------------------------===//
+
+Currently we don't have elimination of redundant stack manipulations. Consider
+the code:
+
+int %main() {
+entry:
+	call fastcc void %test1( )
+	call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
+	ret int 0
+}
+
+declare fastcc void %test1()
+
+declare fastcc void %test2(sbyte*)
+
+
+This currently compiles to:
+
+	subl $16, %esp
+	call _test5
+	addl $12, %esp
+	subl $16, %esp
+	movl $_test5, (%esp)
+	call _test6
+	addl $12, %esp
+
+The add/sub pair is really unneeded here.
+
+//===---------------------------------------------------------------------===//
+
+We currently compile sign_extend_inreg into two shifts:
+
+long foo(long X) {
+  return (long)(signed char)X;
+}
+
+becomes:
+
+_foo:
+        movl 4(%esp), %eax
+        shll $24, %eax
+        sarl $24, %eax
+        ret
+
+This could be:
+
+_foo:
+        movsbl  4(%esp),%eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+Consider the expansion of:
+
+uint %test3(uint %X) {
+        %tmp1 = rem uint %X, 255
+        ret uint %tmp1
+}
+
+Currently it compiles to:
+
+...
+        movl $2155905153, %ecx
+        movl 8(%esp), %esi
+        movl %esi, %eax
+        mull %ecx
+...
+
+This could be "reassociated" into:
+
+        movl $2155905153, %eax
+        movl 8(%esp), %ecx
+        mull %ecx
+
+to avoid the copy.  In fact, the existing two-address stuff would do this
+except that mul isn't a commutative 2-addr instruction.  I guess this has
+to be done at isel time based on the #uses of mul?
+
+//===---------------------------------------------------------------------===//
+
+Make sure the instruction which starts a loop does not cross a cacheline
+boundary. This requires knowing the exact length of each machine instruction.
+That is somewhat complicated, but doable. Example 256.bzip2:
+
+In the new trace, the hot loop has an instruction which crosses a cacheline
+boundary.  In addition to potential cache misses, this can't help decoding as I
+imagine there has to be some kind of complicated decoder reset and realignment
+to grab the bytes from the next cacheline.
+
+532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
+942  942 0x3d03 movl     %dh, (1809(%esp, %esi)                                                                          
+937  937 0x3d0a incl     %esi                           
+3    3   0x3d0b cmpb     %bl, %dl                                               
+27   27  0x3d0d jnz      0x000062db <main+11707>
+
+//===---------------------------------------------------------------------===//
+
+In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
+
+//===---------------------------------------------------------------------===//
+
+This could be a single 16-bit load.
+
+int f(char *p) {
+    if ((p[0] == 1) & (p[1] == 2)) return 1;
+    return 0;
+}
+
+//===---------------------------------------------------------------------===//
+
+We should inline lrintf and probably other libc functions.
+
+//===---------------------------------------------------------------------===//
+
+Start using the flags more.  For example, compile:
+
+int add_zf(int *x, int y, int a, int b) {
+     if ((*x += y) == 0)
+          return a;
+     else
+          return b;
+}
+
+to:
+       addl    %esi, (%rdi)
+       movl    %edx, %eax
+       cmovne  %ecx, %eax
+       ret
+instead of:
+
+_add_zf:
+        addl (%rdi), %esi
+        movl %esi, (%rdi)
+        testl %esi, %esi
+        cmove %edx, %ecx
+        movl %ecx, %eax
+        ret
+
+and:
+
+int add_zf(int *x, int y, int a, int b) {
+     if ((*x + y) < 0)
+          return a;
+     else
+          return b;
+}
+
+to:
+
+add_zf:
+        addl    (%rdi), %esi
+        movl    %edx, %eax
+        cmovns  %ecx, %eax
+        ret
+
+instead of:
+
+_add_zf:
+        addl (%rdi), %esi
+        testl %esi, %esi
+        cmovs %edx, %ecx
+        movl %ecx, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+This:
+#include <math.h>
+int foo(double X) { return isnan(X); }
+
+compiles to (-m64):
+
+_foo:
+        pxor %xmm1, %xmm1
+        ucomisd %xmm1, %xmm0
+        setp %al
+        movzbl %al, %eax
+        ret
+
+The pxor is not needed; we could compare the value against itself.
+
+//===---------------------------------------------------------------------===//
+
+These two functions have identical effects:
+
+unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
+unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
+
+We currently compile them to:
+
+_f:
+        movl 4(%esp), %eax
+        movl %eax, %ecx
+        incl %ecx
+        movl 8(%esp), %edx
+        cmpl %edx, %ecx
+        jne LBB1_2      #UnifiedReturnBlock
+LBB1_1: #cond_true
+        addl $2, %eax
+        ret
+LBB1_2: #UnifiedReturnBlock
+        movl %ecx, %eax
+        ret
+_f2:
+        movl 4(%esp), %eax
+        movl %eax, %ecx
+        incl %ecx
+        cmpl 8(%esp), %ecx
+        sete %cl
+        movzbl %cl, %ecx
+        leal 1(%ecx,%eax), %eax
+        ret
+
+both of which are inferior to GCC's:
+
+_f:
+        movl    4(%esp), %edx
+        leal    1(%edx), %eax
+        addl    $2, %edx
+        cmpl    8(%esp), %eax
+        cmove   %edx, %eax
+        ret
+_f2:
+        movl    4(%esp), %eax
+        addl    $1, %eax
+        xorl    %edx, %edx
+        cmpl    8(%esp), %eax
+        sete    %dl
+        addl    %edx, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void test(int X) {
+  if (X) abort();
+}
+
+is currently compiled to:
+
+_test:
+        subl $12, %esp
+        cmpl $0, 16(%esp)
+        jne LBB1_1
+        addl $12, %esp
+        ret
+LBB1_1:
+        call L_abort$stub
+
+It would be better to produce:
+
+_test:
+        subl $12, %esp
+        cmpl $0, 16(%esp)
+        jne L_abort$stub
+        addl $12, %esp
+        ret
+
+This can be applied to any no-return function call that takes no arguments etc.
+Alternatively, the stack save/restore logic could be shrink-wrapped, producing
+something like this:
+
+_test:
+        cmpl $0, 4(%esp)
+        jne LBB1_1
+        ret
+LBB1_1:
+        subl $12, %esp
+        call L_abort$stub
+
+Both are useful in different situations.  Finally, it could be shrink-wrapped
+and tail called, like this:
+
+_test:
+        cmpl $0, 4(%esp)
+        jne LBB1_1
+        ret
+LBB1_1:
+        pop %eax   # realign stack.
+        call L_abort$stub
+
+Though this probably isn't worth it.
+
+//===---------------------------------------------------------------------===//
+
+We need to teach the codegen to convert two-address INC instructions to LEA
+when the flags are dead.  For example, on X86-64, compile:
+
+int foo(int A, int B) {
+  return A+1;
+}
+
+to:
+
+_foo:
+        leal    1(%edi), %eax
+        ret
+
+instead of:
+
+_foo:
+        incl %edi
+        movl %edi, %eax
+        ret
+
+Another example is:
+
+;; X's live range extends beyond the shift, so the register allocator
+;; cannot coalesce it with Y.  Because of this, a copy needs to be
+;; emitted before the shift to save the register value before it is
+;; clobbered.  However, this copy is not needed if the register
+;; allocator turns the shift into an LEA.  This also occurs for ADD.
+
+; Check that the shift gets turned into an LEA.
+; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
+; RUN:   not grep {mov E.X, E.X}
+
+%G = external global int
+
+int %test1(int %X, int %Y) {
+        %Z = add int %X, %Y
+        volatile store int %Y, int* %G
+        volatile store int %Z, int* %G
+        ret int %X
+}
+
+int %test2(int %X) {
+        %Z = add int %X, 1  ;; inc
+        volatile store int %Z, int* %G
+        ret int %X
+}
+
+//===---------------------------------------------------------------------===//
+
+We use push/pop of stack space around calls in situations where we don't
+have to.  The call to f below produces:
+        subl $16, %esp      <<<<<
+        movl %eax, (%esp)
+        call L_f$stub
+        addl $16, %esp     <<<<<
+The stack push/pop can be moved into the prolog/epilog.  The backend does this
+because it is building the frame pointer, but that should not be sufficient;
+only the use of alloca should cause it to do this.
+(There are other issues shown by this code, but this is one.)
+
+typedef struct _range_t {
+    float fbias;
+    float fscale;
+    int ibias;
+    int iscale;
+    int ishift;
+    unsigned char lut[];
+} range_t;
+
+struct _decode_t {
+    int type:4;
+    int unit:4;
+    int alpha:8;
+    int N:8;
+    int bpc:8;
+    int bpp:16;
+    int skip:8;
+    int swap:8;
+    const range_t*const*range;
+};
+
+typedef struct _decode_t decode_t;
+
+extern int f(const decode_t* decode);
+
+int decode_byte (const decode_t* decode) {
+  if (decode->swap != 0)
+    return f(decode);
+  return 0;
+}
+
+
+//===---------------------------------------------------------------------===//
+
+This:
+#include <xmmintrin.h>
+unsigned test(float f) {
+ return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f ));
+}
+
+Compiles to:
+_test:
+        movss 4(%esp), %xmm0
+        movd %xmm0, %eax
+        ret
+
+It should compile to a move from the stack slot directly into eax.  DAGCombine
+has this xform, but it is currently disabled until the alignment fields of
+the load/store nodes are trustworthy.
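+
+Roughly what we would like to see once that is enabled (a sketch, not verified
+compiler output):
+
+_test:
+        movl 4(%esp), %eax
+        ret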
+
+//===---------------------------------------------------------------------===//
+
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg instead of a sub instruction.  Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
+_test:
+        movl $7, %eax
+        movsbl 4(%esp), %ecx
+        subl %ecx, %eax
+        ret
+
+We would use one fewer register if codegen'd as:
+
+        movsbl 4(%esp), %eax
+        neg %eax
+        add $7, %eax
+        ret
+
+Note that this isn't beneficial if the load can be folded into the sub.  In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+        movl $7, %eax
+        subl 4(%esp), %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+For code like:
+phi (undef, x)
+
+We get an implicit def on the undef side. If the phi is spilled, we then get:
+implicitdef xmm1
+store xmm1 -> stack
+
+It should be possible to teach the x86 backend to "fold" the store into the
+implicitdef, which just deletes the implicit def.
+
+These instructions should go away:
+#IMPLICIT_DEF %xmm1 
+movaps %xmm1, 192(%esp) 
+movaps %xmm1, 224(%esp) 
+movaps %xmm1, 176(%esp)
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
new file mode 100644
index 0000000..c7663be
--- /dev/null
+++ b/lib/Target/X86/X86.h
@@ -0,0 +1,66 @@
+//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the x86
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_X86_H
+#define TARGET_X86_H
+
+#include <iosfwd>
+
+namespace llvm {
+
+class X86TargetMachine;
+class FunctionPassManager;
+class FunctionPass;
+class MachineCodeEmitter;
+
+/// createX86ISelDag - This pass converts a legalized DAG into a 
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *createX86ISelDag(X86TargetMachine &TM, bool Fast);
+
+/// createX86FloatingPointStackifierPass - This function returns a pass which
+/// converts floating point register references and pseudo instructions into
+/// floating point stack references and physical instructions.
+///
+FunctionPass *createX86FloatingPointStackifierPass();
+
+/// createX86CodePrinterPass - Returns a pass that prints the X86
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description.
+///
+FunctionPass *createX86CodePrinterPass(std::ostream &o, X86TargetMachine &tm);
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified MCE object.
+FunctionPass *createX86CodeEmitterPass(X86TargetMachine &TM,
+                                       MachineCodeEmitter &MCE);
+
+/// createEmitX86CodeToMemory - Returns a pass that converts a register
+/// allocated function into raw machine code in a dynamically
+/// allocated chunk of memory.
+///
+FunctionPass *createEmitX86CodeToMemory();
+
+} // End llvm namespace
+
+// Defines symbolic names for X86 registers.  This defines a mapping from
+// register name to register number.
+//
+#include "X86GenRegisterNames.inc"
+
+// Defines symbolic names for the X86 instructions.
+//
+#include "X86GenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
new file mode 100644
index 0000000..98362c8
--- /dev/null
+++ b/lib/Target/X86/X86.td
@@ -0,0 +1,150 @@
+//===- X86.td - Target definition file for the Intel X86 arch ---*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel i386 architecture, referred to
+// here as the "X86" architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "../Target.td"
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget features.
+//===----------------------------------------------------------------------===//
+ 
+def FeatureMMX     : SubtargetFeature<"mmx","X86SSELevel", "MMX",
+                                      "Enable MMX instructions">;
+def FeatureSSE1    : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
+                                      "Enable SSE instructions",
+                                      [FeatureMMX]>;
+def FeatureSSE2    : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
+                                      "Enable SSE2 instructions",
+                                      [FeatureSSE1]>;
+def FeatureSSE3    : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
+                                      "Enable SSE3 instructions",
+                                      [FeatureSSE2]>;
+def FeatureSSSE3   : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
+                                      "Enable SSSE3 instructions",
+                                      [FeatureSSE3]>;
+def Feature3DNow   : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
+                                      "Enable 3DNow! instructions">;
+def Feature3DNowA  : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
+                                      "Enable 3DNow! Athlon instructions",
+                                      [Feature3DNow]>;
+def Feature64Bit   : SubtargetFeature<"64bit", "HasX86_64", "true",
+                                      "Support 64-bit instructions",
+                                      [FeatureSSE2]>;
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic",         []>;
+def : Proc<"i386",            []>;
+def : Proc<"i486",            []>;
+def : Proc<"pentium",         []>;
+def : Proc<"pentium-mmx",     [FeatureMMX]>;
+def : Proc<"i686",            []>;
+def : Proc<"pentiumpro",      []>;
+def : Proc<"pentium2",        [FeatureMMX]>;
+def : Proc<"pentium3",        [FeatureSSE1]>;
+def : Proc<"pentium-m",       [FeatureSSE2]>;
+def : Proc<"pentium4",        [FeatureSSE2]>;
+def : Proc<"x86-64",          [Feature64Bit]>;
+def : Proc<"yonah",           [FeatureSSE3]>;
+def : Proc<"prescott",        [FeatureSSE3]>;
+def : Proc<"nocona",          [FeatureSSE3]>;
+def : Proc<"core2",           [FeatureSSSE3]>;
+
+def : Proc<"k6",              [FeatureMMX]>;
+def : Proc<"k6-2",            [FeatureMMX,    Feature3DNow]>;
+def : Proc<"k6-3",            [FeatureMMX,    Feature3DNow]>;
+def : Proc<"athlon",          [FeatureMMX,    Feature3DNowA]>;
+def : Proc<"athlon-tbird",    [FeatureMMX,    Feature3DNowA]>;
+def : Proc<"athlon-4",        [FeatureSSE1,   Feature3DNowA]>;
+def : Proc<"athlon-xp",       [FeatureSSE1,   Feature3DNowA]>;
+def : Proc<"athlon-mp",       [FeatureSSE1,   Feature3DNowA]>;
+def : Proc<"k8",              [Feature3DNowA, Feature64Bit]>;
+def : Proc<"opteron",         [Feature3DNowA, Feature64Bit]>;
+def : Proc<"athlon64",        [Feature3DNowA, Feature64Bit]>;
+def : Proc<"athlon-fx",       [Feature3DNowA, Feature64Bit]>;
+
+def : Proc<"winchip-c6",      [FeatureMMX]>;
+def : Proc<"winchip2",        [FeatureMMX, Feature3DNow]>;
+def : Proc<"c3",              [FeatureMMX, Feature3DNow]>;
+def : Proc<"c3-2",            [FeatureSSE1]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "X86RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "X86InstrInfo.td"
+
+def X86InstrInfo : InstrInfo {
+
+  // Define how we want to layout our TargetSpecific information field... This
+  // should be kept up-to-date with the fields in the X86InstrInfo.h file.
+  let TSFlagsFields = ["FormBits",
+                       "hasOpSizePrefix",
+                       "hasAdSizePrefix",
+                       "Prefix",
+                       "hasREX_WPrefix",
+                       "ImmTypeBits",
+                       "FPFormBits",
+                       "Opcode"];
+  let TSFlagsShifts = [0,
+                       6,
+                       7,
+                       8,
+                       12,
+                       13,
+                       16,
+                       24];
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "X86CallingConv.td"
+
+
+//===----------------------------------------------------------------------===//
+// Assembly Printers
+//===----------------------------------------------------------------------===//
+
+// The X86 target supports two different syntaxes for emitting assembly code.
+// This is controlled by the -x86-asm-syntax={att|intel} option.
+def ATTAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "ATTAsmPrinter";
+  int Variant = 0;
+}
+def IntelAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "IntelAsmPrinter";
+  int Variant = 1;
+}
+
+
+def X86 : Target {
+  // Information about the instructions...
+  let InstructionSet = X86InstrInfo;
+
+  let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
+}
diff --git a/lib/Target/X86/X86ATTAsmPrinter.cpp b/lib/Target/X86/X86ATTAsmPrinter.cpp
new file mode 100755
index 0000000..e97babe
--- /dev/null
+++ b/lib/Target/X86/X86ATTAsmPrinter.cpp
@@ -0,0 +1,607 @@
+//===-- X86ATTAsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to AT&T format assembly
+// language. This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86ATTAsmPrinter.h"
+#include "X86.h"
+#include "X86COFF.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "X86TargetAsmInfo.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Module.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+static std::string computePICLabel(unsigned FnNum,
+                                   const TargetAsmInfo *TAI,
+                                   const X86Subtarget* Subtarget)  {
+  std::string label;
+  if (Subtarget->isTargetDarwin())
+    label =  "\"L" + utostr_32(FnNum) + "$pb\"";
+  else if (Subtarget->isTargetELF())
+    label = ".Lllvm$" + utostr_32(FnNum) + "$piclabel";
+  else
+    assert(0 && "Don't know how to print PIC label!\n");
+
+  return label;
+}
+
+/// getSectionForFunction - Return the section that we should emit the
+/// specified function body into.
+std::string X86ATTAsmPrinter::getSectionForFunction(const Function &F) const {
+  switch (F.getLinkage()) {
+  default: assert(0 && "Unknown linkage type!");
+  case Function::InternalLinkage: 
+  case Function::DLLExportLinkage:
+  case Function::ExternalLinkage:
+    return TAI->getTextSection();
+  case Function::WeakLinkage:
+  case Function::LinkOnceLinkage:
+    if (Subtarget->isTargetDarwin()) {
+      return ".section __TEXT,__textcoal_nt,coalesced,pure_instructions";
+    } else if (Subtarget->isTargetCygMing()) {
+      return "\t.section\t.text$linkonce." + CurrentFnName + ",\"ax\"";
+    } else {
+      return "\t.section\t.llvm.linkonce.t." + CurrentFnName +
+             ",\"ax\",@progbits";
+    }
+  }
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool X86ATTAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  if (TAI->doesSupportDebugInformation()) {
+    // Let PassManager know we need debug information and relay
+    // the MachineModuleInfo address on to DwarfWriter.
+    DW.SetModuleInfo(&getAnalysis<MachineModuleInfo>());
+  }
+
+  SetupMachineFunction(MF);
+  O << "\n\n";
+
+  // Print out constants referenced by the function
+  EmitConstantPool(MF.getConstantPool());
+
+  // Print out labels for the function.
+  const Function *F = MF.getFunction();
+  unsigned CC = F->getCallingConv();
+
+  // Populate function information map.  Actually, we don't want to populate
+  // non-stdcall or non-fastcall functions' information right now.
+  if (CC == CallingConv::X86_StdCall || CC == CallingConv::X86_FastCall)
+    FunctionInfoMap[F] = *MF.getInfo<X86MachineFunctionInfo>();
+
+  X86SharedAsmPrinter::decorateName(CurrentFnName, F);
+
+  SwitchToTextSection(getSectionForFunction(*F).c_str(), F);
+    
+  switch (F->getLinkage()) {
+  default: assert(0 && "Unknown linkage type!");
+  case Function::InternalLinkage:  // Symbols default to internal.
+    EmitAlignment(4, F);     // FIXME: This should be parameterized somewhere.
+    break;
+  case Function::DLLExportLinkage:
+    DLLExportedFns.insert(Mang->makeNameProper(F->getName(), ""));
+    //FALLS THROUGH
+  case Function::ExternalLinkage:
+    EmitAlignment(4, F);     // FIXME: This should be parameterized somewhere.
+    O << "\t.globl\t" << CurrentFnName << "\n";    
+    break;
+  case Function::LinkOnceLinkage:
+  case Function::WeakLinkage:
+    if (Subtarget->isTargetDarwin()) {
+      O << "\t.globl\t" << CurrentFnName << "\n";
+      O << "\t.weak_definition\t" << CurrentFnName << "\n";
+    } else if (Subtarget->isTargetCygMing()) {
+      EmitAlignment(4, F);     // FIXME: This should be parameterized somewhere.
+      O << "\t.globl " << CurrentFnName << "\n";
+      O << "\t.linkonce discard\n";
+    } else {
+      EmitAlignment(4, F);     // FIXME: This should be parameterized somewhere.
+      O << "\t.weak " << CurrentFnName << "\n";
+    }
+    break;
+  }
+  if (F->hasHiddenVisibility()) {
+    if (const char *Directive = TAI->getHiddenDirective())
+      O << Directive << CurrentFnName << "\n";
+  } else if (F->hasProtectedVisibility()) {
+    if (const char *Directive = TAI->getProtectedDirective())
+      O << Directive << CurrentFnName << "\n";
+  }
+
+  if (Subtarget->isTargetELF())
+    O << "\t.type " << CurrentFnName << ",@function\n";
+  else if (Subtarget->isTargetCygMing()) {
+    O << "\t.def\t " << CurrentFnName
+      << ";\t.scl\t" <<
+      (F->getLinkage() == Function::InternalLinkage ? COFF::C_STAT : COFF::C_EXT)
+      << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT)
+      << ";\t.endef\n";
+  }
+
+  O << CurrentFnName << ":\n";
+  // Add some workaround for linkonce linkage on Cygwin/MinGW
+  if (Subtarget->isTargetCygMing() &&
+      (F->getLinkage() == Function::LinkOnceLinkage ||
+       F->getLinkage() == Function::WeakLinkage))
+    O << "Lllvm$workaround$fake$stub$" << CurrentFnName << ":\n";
+
+  if (TAI->doesSupportDebugInformation()) {
+    // Emit pre-function debug information.
+    DW.BeginFunction(&MF);
+  }
+
+  // Print out code for the function.
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    // Print a label for the basic block.
+    if (I->pred_begin() != I->pred_end()) {
+      printBasicBlockLabel(I, true);
+      O << '\n';
+    }
+    for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+         II != E; ++II) {
+      // Print the assembly for the instruction.
+      O << "\t";
+      printMachineInstruction(II);
+    }
+  }
+
+  if (TAI->hasDotTypeDotSizeDirective())
+    O << "\t.size " << CurrentFnName << ", .-" << CurrentFnName << "\n";
+
+  if (TAI->doesSupportDebugInformation()) {
+    // Emit post-function debug information.
+    DW.EndFunction();
+  }
+
+  // Print out jump tables referenced by the function.
+  EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+  
+  // We didn't modify anything.
+  return false;
+}
+
+static inline bool printGOT(TargetMachine &TM, const X86Subtarget* ST) {
+  return ST->isPICStyleGOT() && TM.getRelocationModel() == Reloc::PIC_;
+}
+
+static inline bool printStub(TargetMachine &TM, const X86Subtarget* ST) {
+  return ST->isPICStyleStub() && TM.getRelocationModel() != Reloc::Static;
+}
+
+void X86ATTAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+                                    const char *Modifier, bool NotRIPRel) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  const MRegisterInfo &RI = *TM.getRegisterInfo();
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register: {
+    assert(MRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+           "Virtual registers should not make it this far!");
+    O << '%';
+    unsigned Reg = MO.getReg();
+    if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+      MVT::ValueType VT = (strcmp(Modifier+6,"64") == 0) ?
+        MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 :
+                    ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8));
+      Reg = getX86SubSuperRegister(Reg, VT);
+    }
+    for (const char *Name = RI.get(Reg).Name; *Name; ++Name)
+      O << (char)tolower(*Name);
+    return;
+  }
+
+  case MachineOperand::MO_Immediate:
+    if (!Modifier ||
+        (strcmp(Modifier, "debug") && strcmp(Modifier, "mem")))
+      O << '$';
+    O << MO.getImmedValue();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    printBasicBlockLabel(MO.getMachineBasicBlock());
+    return;
+  case MachineOperand::MO_JumpTableIndex: {
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    if (!isMemOp) O << '$';
+    O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << "_"
+      << MO.getJumpTableIndex();
+
+    if (TM.getRelocationModel() == Reloc::PIC_) {
+      if (Subtarget->isPICStyleStub())
+        O << "-\"" << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+          << "$pb\"";
+      else if (Subtarget->isPICStyleGOT())
+        O << "@GOTOFF";
+    }
+    
+    if (isMemOp && Subtarget->isPICStyleRIPRel() && !NotRIPRel)
+      O << "(%rip)";
+    return;
+  }
+  case MachineOperand::MO_ConstantPoolIndex: {
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    if (!isMemOp) O << '$';
+    O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+      << MO.getConstantPoolIndex();
+
+    if (TM.getRelocationModel() == Reloc::PIC_) {
+      if (Subtarget->isPICStyleStub())
+        O << "-\"" << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+          << "$pb\"";
+      else if (Subtarget->isPICStyleGOT())
+        O << "@GOTOFF";
+    }
+    
+    int Offset = MO.getOffset();
+    if (Offset > 0)
+      O << "+" << Offset;
+    else if (Offset < 0)
+      O << Offset;
+
+    if (isMemOp && Subtarget->isPICStyleRIPRel() && !NotRIPRel)
+      O << "(%rip)";
+    return;
+  }
+  case MachineOperand::MO_GlobalAddress: {
+    bool isCallOp = Modifier && !strcmp(Modifier, "call");
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    bool needCloseParen = false;
+
+    GlobalValue *GV = MO.getGlobal();
+    GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+    bool isThreadLocal = GVar && GVar->isThreadLocal();
+
+    std::string Name = Mang->getValueName(GV);
+    X86SharedAsmPrinter::decorateName(Name, GV);
+    
+    if (!isMemOp && !isCallOp)
+      O << '$';
+    else if (Name[0] == '$') {
+      // The name begins with a dollar-sign. In order to avoid having it look
+      // like an integer immediate to the assembler, enclose it in parens.
+      O << '(';
+      needCloseParen = true;
+    }
+
+    if (printStub(TM, Subtarget)) {
+      // Link-once, declared, or weakly-linked global variables need
+      // non-lazily-resolved stubs.
+      if (GV->isDeclaration() ||
+          GV->hasWeakLinkage() ||
+          GV->hasLinkOnceLinkage()) {
+        // Dynamically-resolved functions need a stub for the function.
+        if (isCallOp && isa<Function>(GV)) {
+          FnStubs.insert(Name);
+          O << TAI->getPrivateGlobalPrefix() << Name << "$stub";
+        } else {
+          GVStubs.insert(Name);
+          O << TAI->getPrivateGlobalPrefix() << Name << "$non_lazy_ptr";
+        }
+      } else {
+        if (GV->hasDLLImportLinkage())
+          O << "__imp_";          
+        O << Name;
+      }
+      
+      if (!isCallOp && TM.getRelocationModel() == Reloc::PIC_)
+        O << "-\"" << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+          << "$pb\"";
+    } else {
+      if (GV->hasDLLImportLinkage()) {
+        O << "__imp_";          
+      }       
+      O << Name;
+
+      if (isCallOp && isa<Function>(GV)) {
+        if (printGOT(TM, Subtarget)) {
+          // Assemble call via PLT for non-local symbols
+          if (!(GV->hasHiddenVisibility() || GV->hasProtectedVisibility()) ||
+              GV->isDeclaration())
+            O << "@PLT";
+        }
+        if (Subtarget->isTargetCygMing() && GV->isDeclaration())
+          // Save function name for later type emission
+          FnStubs.insert(Name);
+      }
+    }
+
+    if (GV->hasExternalWeakLinkage())
+      ExtWeakSymbols.insert(GV);
+    
+    int Offset = MO.getOffset();
+    if (Offset > 0)
+      O << "+" << Offset;
+    else if (Offset < 0)
+      O << Offset;
+
+    if (isThreadLocal) {
+      if (TM.getRelocationModel() == Reloc::PIC_)
+        O << "@TLSGD"; // general dynamic TLS model
+      else
+        if (GV->isDeclaration())
+          O << "@INDNTPOFF"; // initial exec TLS model
+        else
+          O << "@NTPOFF"; // local exec TLS model
+    } else if (isMemOp) {
+      if (printGOT(TM, Subtarget)) {
+        if (Subtarget->GVRequiresExtraLoad(GV, TM, false))
+          O << "@GOT";
+        else
+          O << "@GOTOFF";
+      } else if (Subtarget->isPICStyleRIPRel() && !NotRIPRel) {
+        if ((GV->isDeclaration() ||
+             GV->hasWeakLinkage() ||
+             GV->hasLinkOnceLinkage()) &&
+            TM.getRelocationModel() != Reloc::Static)
+          O << "@GOTPCREL";
+
+        if (needCloseParen) {
+          needCloseParen = false;
+          O << ')';
+        }
+
+        // Use rip when possible to reduce code size, except when an
+        // index or base register is also part of the address, e.g.
+        // foo(%rip)(%rcx,%rax,4) is not legal.
+        O << "(%rip)";
+      }
+    }
+
+    if (needCloseParen)
+      O << ')';
+
+    return;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    bool isCallOp = Modifier && !strcmp(Modifier, "call");
+    bool needCloseParen = false;
+    std::string Name(TAI->getGlobalPrefix());
+    Name += MO.getSymbolName();
+    if (isCallOp && printStub(TM, Subtarget)) {
+      FnStubs.insert(Name);
+      O << TAI->getPrivateGlobalPrefix() << Name << "$stub";
+      return;
+    }
+    if (!isCallOp)
+      O << '$';
+    else if (Name[0] == '$') {
+      // The name begins with a dollar-sign. In order to avoid having it look
+      // like an integer immediate to the assembler, enclose it in parens.
+      O << '(';
+      needCloseParen = true;
+    }
+
+    O << Name;
+
+    if (printGOT(TM, Subtarget)) {
+      std::string GOTName(TAI->getGlobalPrefix());
+      GOTName+="_GLOBAL_OFFSET_TABLE_";
+      if (Name == GOTName)
+        // HACK! Emit an extra offset from the PC when printing the GOT offset
+        // to compensate for the size of the popl instruction. The resulting
+        // code should look like:
+        //   call .piclabel
+        // piclabel:
+        //   popl %some_register
+        //   addl $_GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
+        O << " + [.-"
+          << computePICLabel(getFunctionNumber(), TAI, Subtarget) << "]";
+
+      if (isCallOp)
+        O << "@PLT";
+    }
+
+    if (needCloseParen)
+      O << ')';
+
+    if (!isCallOp && Subtarget->isPICStyleRIPRel())
+      O << "(%rip)";
+
+    return;
+  }
+  default:
+    O << "<unknown operand type>"; return;
+  }
+}
+
+void X86ATTAsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op) {
+  unsigned char value = MI->getOperand(Op).getImmedValue();
+  assert(value <= 7 && "Invalid ssecc argument!");
+  switch (value) {
+  case 0: O << "eq"; break;
+  case 1: O << "lt"; break;
+  case 2: O << "le"; break;
+  case 3: O << "unord"; break;
+  case 4: O << "neq"; break;
+  case 5: O << "nlt"; break;
+  case 6: O << "nle"; break;
+  case 7: O << "ord"; break;
+  }
+}
+
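+/// printMemReference - Print a memory reference in AT&T syntax, roughly
+/// Disp(Base,Index,Scale), e.g. "-8(%ebp,%ecx,4)", or a symbolic reference
+/// such as "_foo(%rip)" when the displacement is an address (illustrative
+/// examples).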
+void X86ATTAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
+                                         const char *Modifier){
+  assert(isMem(MI, Op) && "Invalid memory reference!");
+  MachineOperand BaseReg  = MI->getOperand(Op);
+  MachineOperand IndexReg = MI->getOperand(Op+2);
+  const MachineOperand &DispSpec = MI->getOperand(Op+3);
+
+  bool NotRIPRel = IndexReg.getReg() || BaseReg.getReg();
+  if (DispSpec.isGlobalAddress() ||
+      DispSpec.isConstantPoolIndex() ||
+      DispSpec.isJumpTableIndex()) {
+    printOperand(MI, Op+3, "mem", NotRIPRel);
+  } else {
+    int DispVal = DispSpec.getImmedValue();
+    if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
+      O << DispVal;
+  }
+
+  if (IndexReg.getReg() || BaseReg.getReg()) {
+    unsigned ScaleVal = MI->getOperand(Op+1).getImmedValue();
+    unsigned BaseRegOperand = 0, IndexRegOperand = 2;
+      
+    // There are cases where we can end up with ESP/RSP in the indexreg slot.
+    // If this happens, swap the base/index register to support assemblers that
+    // don't work when the index is *SP.
+    if (IndexReg.getReg() == X86::ESP || IndexReg.getReg() == X86::RSP) {
+      assert(ScaleVal == 1 && "Scale not supported for stack pointer!");
+      std::swap(BaseReg, IndexReg);
+      std::swap(BaseRegOperand, IndexRegOperand);
+    }
+    
+    O << "(";
+    if (BaseReg.getReg())
+      printOperand(MI, Op+BaseRegOperand, Modifier);
+
+    if (IndexReg.getReg()) {
+      O << ",";
+      printOperand(MI, Op+IndexRegOperand, Modifier);
+      if (ScaleVal != 1)
+        O << "," << ScaleVal;
+    }
+    O << ")";
+  }
+}
+
+void X86ATTAsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op) {
+  std::string label = computePICLabel(getFunctionNumber(), TAI, Subtarget);
+  O << label << "\n" << label << ":";
+}
+
+
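+/// printAsmMRegister - Print the register named by an inline asm operand
+/// modifier.  For example, given %eax the modes below map 'b' to %al,
+/// 'h' to %ah, 'w' to %ax and 'k' to %eax (illustrative examples).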
+bool X86ATTAsmPrinter::printAsmMRegister(const MachineOperand &MO,
+                                         const char Mode) {
+  const MRegisterInfo &RI = *TM.getRegisterInfo();
+  unsigned Reg = MO.getReg();
+  switch (Mode) {
+  default: return true;  // Unknown mode.
+  case 'b': // Print QImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i8);
+    break;
+  case 'h': // Print QImode high register
+    Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+    break;
+  case 'w': // Print HImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i16);
+    break;
+  case 'k': // Print SImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i32);
+    break;
+  }
+
+  O << '%';
+  for (const char *Name = RI.get(Reg).Name; *Name; ++Name)
+    O << (char)tolower(*Name);
+  return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86ATTAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                       unsigned AsmVariant, 
+                                       const char *ExtraCode) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+    
+    switch (ExtraCode[0]) {
+    default: return true;  // Unknown modifier.
+    case 'c': // Don't print "$" before a global var name or constant.
+      printOperand(MI, OpNo, "mem");
+      return false;
+    case 'b': // Print QImode register
+    case 'h': // Print QImode high register
+    case 'w': // Print HImode register
+    case 'k': // Print SImode register
+      if (MI->getOperand(OpNo).isReg())
+        return printAsmMRegister(MI->getOperand(OpNo), ExtraCode[0]);
+      printOperand(MI, OpNo);
+      return false;
+      
+    case 'P': // Don't print @PLT, but do print as memory.
+      printOperand(MI, OpNo, "mem");
+      return false;
+    }
+  }
+  
+  printOperand(MI, OpNo);
+  return false;
+}
+
+bool X86ATTAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                             unsigned OpNo,
+                                             unsigned AsmVariant, 
+                                             const char *ExtraCode) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+  printMemReference(MI, OpNo);
+  return false;
+}
+
+/// printMachineInstruction -- Print out a single X86 LLVM instruction
+/// MI in AT&T syntax to the current output stream.
+///
+void X86ATTAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+  ++EmittedInsts;
+
+  // See if a truncate instruction can be turned into a nop.
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::TRUNC_64to32:
+  case X86::TRUNC_64to16:
+  case X86::TRUNC_32to16:
+  case X86::TRUNC_32to8:
+  case X86::TRUNC_16to8:
+  case X86::TRUNC_32_to8:
+  case X86::TRUNC_16_to8: {
+    const MachineOperand &MO0 = MI->getOperand(0);
+    const MachineOperand &MO1 = MI->getOperand(1);
+    unsigned Reg0 = MO0.getReg();
+    unsigned Reg1 = MO1.getReg();
+    unsigned Opc = MI->getOpcode();
+    if (Opc == X86::TRUNC_64to32)
+      Reg1 = getX86SubSuperRegister(Reg1, MVT::i32);
+    else if (Opc == X86::TRUNC_32to16 || Opc == X86::TRUNC_64to16)
+      Reg1 = getX86SubSuperRegister(Reg1, MVT::i16);
+    else
+      Reg1 = getX86SubSuperRegister(Reg1, MVT::i8);
+    O << TAI->getCommentString() << " TRUNCATE ";
+    if (Reg0 != Reg1)
+      O << "\n\t";
+    break;
+  }
+  case X86::PsMOVZX64rr32:
+    O << TAI->getCommentString() << " ZERO-EXTEND " << "\n\t";
+    break;
+  }
+
+  // Call the autogenerated instruction printer routines.
+  printInstruction(MI);
+}
+
+// Include the auto-generated portion of the assembly writer.
+#include "X86GenAsmWriter.inc"
+
diff --git a/lib/Target/X86/X86ATTAsmPrinter.h b/lib/Target/X86/X86ATTAsmPrinter.h
new file mode 100755
index 0000000..a3bdce9
--- /dev/null
+++ b/lib/Target/X86/X86ATTAsmPrinter.h
@@ -0,0 +1,87 @@
+//===-- X86ATTAsmPrinter.h - Convert X86 LLVM code to AT&T assembly -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AT&T assembly code printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ATTASMPRINTER_H
+#define X86ATTASMPRINTER_H
+
+#include "X86AsmPrinter.h"
+#include "llvm/CodeGen/ValueTypes.h"
+
+namespace llvm {
+
+struct X86ATTAsmPrinter : public X86SharedAsmPrinter {
+ X86ATTAsmPrinter(std::ostream &O, X86TargetMachine &TM, const TargetAsmInfo *T)
+    : X86SharedAsmPrinter(O, TM, T) { }
+
+  virtual const char *getPassName() const {
+    return "X86 AT&T-Style Assembly Printer";
+  }
+
+  /// printInstruction - This method is automatically generated by tablegen
+  /// from the instruction set description.  This method returns true if the
+  /// machine instruction was sufficiently described to print it, otherwise it
+  /// returns false.
+  bool printInstruction(const MachineInstr *MI);
+
+  // These methods are used by the tablegen'erated instruction printer.
+  void printOperand(const MachineInstr *MI, unsigned OpNo,
+                    const char *Modifier = 0, bool NotRIPRel = false);
+  void printi8mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printi16mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printi32mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printi64mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printi128mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printf32mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printf64mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printf128mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printlea64_32mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo, "subreg64");
+  }
+  
+  bool printAsmMRegister(const MachineOperand &MO, const char Mode);
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode);
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode);
+  
+  void printMachineInstruction(const MachineInstr *MI);
+  void printSSECC(const MachineInstr *MI, unsigned Op);
+  void printMemReference(const MachineInstr *MI, unsigned Op,
+                         const char *Modifier=NULL);
+  void printPICLabel(const MachineInstr *MI, unsigned Op);
+  bool runOnMachineFunction(MachineFunction &F);
+  
+  /// getSectionForFunction - Return the section that we should emit the
+  /// specified function body into.
+  virtual std::string getSectionForFunction(const Function &F) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
new file mode 100644
index 0000000..59b9b1f
--- /dev/null
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -0,0 +1,409 @@
+//===-- X86AsmPrinter.cpp - Convert X86 LLVM IR to X86 assembly -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the shared superclass printer that converts from our
+// internal representation of machine-dependent LLVM code to Intel and AT&T
+// format assembly language.
+// This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "X86ATTAsmPrinter.h"
+#include "X86COFF.h"
+#include "X86IntelAsmPrinter.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Type.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+static X86MachineFunctionInfo calculateFunctionInfo(const Function *F,
+                                                    const TargetData *TD) {
+  X86MachineFunctionInfo Info;
+  uint64_t Size = 0;
+  
+  switch (F->getCallingConv()) {
+  case CallingConv::X86_StdCall:
+    Info.setDecorationStyle(StdCall);
+    break;
+  case CallingConv::X86_FastCall:
+    Info.setDecorationStyle(FastCall);
+    break;
+  default:
+    return Info;
+  }
+
+  for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+       AI != AE; ++AI)
+    // Size should be aligned to DWORD boundary
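+    // (e.g. an i8 or i16 argument still contributes 4 bytes to the count)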
+    Size += ((TD->getTypeSize(AI->getType()) + 3)/4)*4;
+  
+  // We're not supporting tooooo huge arguments :)
+  Info.setBytesToPopOnReturn((unsigned int)Size);
+  return Info;
+}
+
+
+/// decorateName - Query FunctionInfoMap and use this information for various
+/// name decoration.
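+/// For example, on Cygwin/MinGW a stdcall function taking 12 bytes of
+/// arguments would typically be emitted as "_func@12", and a fastcall one
+/// as "@func@12".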
+void X86SharedAsmPrinter::decorateName(std::string &Name,
+                                       const GlobalValue *GV) {
+  const Function *F = dyn_cast<Function>(GV);
+  if (!F) return;
+
+  // We don't want to decorate non-stdcall or non-fastcall functions right now
+  unsigned CC = F->getCallingConv();
+  if (CC != CallingConv::X86_StdCall && CC != CallingConv::X86_FastCall)
+    return;
+
+  // Only decorate names when we're targeting Cygwin/Mingw32
+  if (!Subtarget->isTargetCygMing())
+    return;
+    
+  FMFInfoMap::const_iterator info_item = FunctionInfoMap.find(F);
+
+  const X86MachineFunctionInfo *Info;
+  if (info_item == FunctionInfoMap.end()) {
+    // Calculate appropriate function info and populate map
+    FunctionInfoMap[F] = calculateFunctionInfo(F, TM.getTargetData());
+    Info = &FunctionInfoMap[F];
+  } else {
+    Info = &info_item->second;
+  }
+  
+  const FunctionType *FT = F->getFunctionType();
+  switch (Info->getDecorationStyle()) {
+  case None:
+    break;
+  case StdCall:
+    // "Pure" variadic functions do not receive @0 suffix.
+    if (!FT->isVarArg() || (FT->getNumParams() == 0) ||
+        (FT->getNumParams() == 1 && FT->isStructReturn())) 
+      Name += '@' + utostr_32(Info->getBytesToPopOnReturn());
+    break;
+  case FastCall:
+    // "Pure" variadic functions do not receive @0 suffix.
+    if (!FT->isVarArg() || (FT->getNumParams() == 0) ||
+        (FT->getNumParams() == 1 && FT->isStructReturn())) 
+      Name += '@' + utostr_32(Info->getBytesToPopOnReturn());
+
+    if (Name[0] == '_') {
+      Name[0] = '@';
+    } else {
+      Name = '@' + Name;
+    }    
+    break;
+  default:
+    assert(0 && "Unsupported DecorationStyle");
+  }
+}
+
+/// doInitialization
+bool X86SharedAsmPrinter::doInitialization(Module &M) {
+  if (TAI->doesSupportDebugInformation()) {
+    // Emit initial debug information.
+    DW.BeginModule(&M);
+  }
+
+  AsmPrinter::doInitialization(M);
+
+  // Darwin wants symbols to be quoted if they have complex names.
+  if (Subtarget->isTargetDarwin())
+    Mang->setUseQuotes(true);
+
+  return false;
+}
+
+bool X86SharedAsmPrinter::doFinalization(Module &M) {
+  // Note: this code is not shared by the Intel printer as it is too different
+  // from how MASM does things.  When making changes here don't forget to look
+  // at X86IntelAsmPrinter::doFinalization().
+  const TargetData *TD = TM.getTargetData();
+  
+  // Print out module-level global variables here.
+  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+    if (!I->hasInitializer())
+      continue;   // External globals require no code
+    
+    // Check to see if this is a special global used by LLVM, if so, emit it.
+    if (EmitSpecialLLVMGlobal(I)) {
+      if (Subtarget->isTargetDarwin() &&
+          TM.getRelocationModel() == Reloc::Static) {
+        if (I->getName() == "llvm.global_ctors")
+          O << ".reference .constructors_used\n";
+        else if (I->getName() == "llvm.global_dtors")
+          O << ".reference .destructors_used\n";
+      }
+      continue;
+    }
+    
+    std::string name = Mang->getValueName(I);
+    Constant *C = I->getInitializer();
+    const Type *Type = C->getType();
+    unsigned Size = TD->getTypeSize(Type);
+    unsigned Align = TD->getPreferredAlignmentLog(I);
+
+    if (I->hasHiddenVisibility()) {
+      if (const char *Directive = TAI->getHiddenDirective())
+        O << Directive << name << "\n";
+    } else if (I->hasProtectedVisibility()) {
+      if (const char *Directive = TAI->getProtectedDirective())
+        O << Directive << name << "\n";
+    }
+    
+    if (Subtarget->isTargetELF())
+      O << "\t.type " << name << ",@object\n";
+    
+    if (C->isNullValue()) {
+      if (I->hasExternalLinkage()) {
+        if (const char *Directive = TAI->getZeroFillDirective()) {
+          O << "\t.globl\t" << name << "\n";
+          O << Directive << "__DATA__, __common, " << name << ", "
+            << Size << ", " << Align << "\n";
+          continue;
+        }
+      }
+      
+      if (!I->hasSection() && !I->isThreadLocal() &&
+          (I->hasInternalLinkage() || I->hasWeakLinkage() ||
+           I->hasLinkOnceLinkage())) {
+        if (Size == 0) Size = 1;   // .comm Foo, 0 is undefined, avoid it.
+        if (!NoZerosInBSS && TAI->getBSSSection())
+          SwitchToDataSection(TAI->getBSSSection(), I);
+        else
+          SwitchToDataSection(TAI->getDataSection(), I);
+        if (TAI->getLCOMMDirective() != NULL) {
+          if (I->hasInternalLinkage()) {
+            O << TAI->getLCOMMDirective() << name << "," << Size;
+            if (Subtarget->isTargetDarwin())
+              O << "," << Align;
+          } else
+            O << TAI->getCOMMDirective()  << name << "," << Size;
+        } else {
+          if (!Subtarget->isTargetCygMing()) {
+            if (I->hasInternalLinkage())
+              O << "\t.local\t" << name << "\n";
+          }
+          O << TAI->getCOMMDirective()  << name << "," << Size;
+          if (TAI->getCOMMDirectiveTakesAlignment())
+            O << "," << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+        }
+        O << "\t\t" << TAI->getCommentString() << " " << I->getName() << "\n";
+        continue;
+      }
+    }
+
+    switch (I->getLinkage()) {
+    case GlobalValue::LinkOnceLinkage:
+    case GlobalValue::WeakLinkage:
+      if (Subtarget->isTargetDarwin()) {
+        O << "\t.globl " << name << "\n"
+          << "\t.weak_definition " << name << "\n";
+        SwitchToDataSection(".section __DATA,__const_coal,coalesced", I);
+      } else if (Subtarget->isTargetCygMing()) {
+        std::string SectionName(".section\t.data$linkonce." +
+                                name +
+                                ",\"aw\"");
+        SwitchToDataSection(SectionName.c_str(), I);
+        O << "\t.globl " << name << "\n"
+          << "\t.linkonce same_size\n";
+      } else {
+        std::string SectionName("\t.section\t.llvm.linkonce.d." +
+                                name +
+                                ",\"aw\",@progbits");
+        SwitchToDataSection(SectionName.c_str(), I);
+        O << "\t.weak " << name << "\n";
+      }
+      break;
+    case GlobalValue::AppendingLinkage:
+      // FIXME: appending linkage variables should go into a section of
+      // their name or something.  For now, just emit them as external.
+    case GlobalValue::DLLExportLinkage:
+      DLLExportedGVs.insert(Mang->makeNameProper(I->getName(),""));
+      // FALL THROUGH
+    case GlobalValue::ExternalLinkage:
+      // If external or appending, declare as a global symbol
+      O << "\t.globl " << name << "\n";
+      // FALL THROUGH
+    case GlobalValue::InternalLinkage: {
+      if (I->isConstant()) {
+        const ConstantArray *CVA = dyn_cast<ConstantArray>(C);
+        if (TAI->getCStringSection() && CVA && CVA->isCString()) {
+          SwitchToDataSection(TAI->getCStringSection(), I);
+          break;
+        }
+      }
+      // FIXME: special handling for ".ctors" & ".dtors" sections
+      if (I->hasSection() &&
+          (I->getSection() == ".ctors" ||
+           I->getSection() == ".dtors")) {
+        std::string SectionName = ".section " + I->getSection();
+        
+        if (Subtarget->isTargetCygMing()) {
+          SectionName += ",\"aw\"";
+        } else {
+          assert(!Subtarget->isTargetDarwin());
+          SectionName += ",\"aw\",@progbits";
+        }
+
+        SwitchToDataSection(SectionName.c_str());
+      } else {
+        if (C->isNullValue() && !NoZerosInBSS && TAI->getBSSSection())
+          SwitchToDataSection(I->isThreadLocal() ? TAI->getTLSBSSSection() :
+                              TAI->getBSSSection(), I);
+        else if (!I->isConstant())
+          SwitchToDataSection(I->isThreadLocal() ? TAI->getTLSDataSection() :
+                              TAI->getDataSection(), I);
+        else if (I->isThreadLocal())
+          SwitchToDataSection(TAI->getTLSDataSection());
+        else {
+          // Read-only data.
+          bool HasReloc = C->ContainsRelocations();
+          if (HasReloc &&
+              Subtarget->isTargetDarwin() &&
+              TM.getRelocationModel() != Reloc::Static)
+            SwitchToDataSection("\t.const_data\n");
+          else if (!HasReloc && Size == 4 &&
+                   TAI->getFourByteConstantSection())
+            SwitchToDataSection(TAI->getFourByteConstantSection(), I);
+          else if (!HasReloc && Size == 8 &&
+                   TAI->getEightByteConstantSection())
+            SwitchToDataSection(TAI->getEightByteConstantSection(), I);
+          else if (!HasReloc && Size == 16 &&
+                   TAI->getSixteenByteConstantSection())
+            SwitchToDataSection(TAI->getSixteenByteConstantSection(), I);
+          else if (TAI->getReadOnlySection())
+            SwitchToDataSection(TAI->getReadOnlySection(), I);
+          else
+            SwitchToDataSection(TAI->getDataSection(), I);
+        }
+      }
+      
+      break;
+    }
+    default:
+      assert(0 && "Unknown linkage type!");
+    }
+
+    EmitAlignment(Align, I);
+    O << name << ":\t\t\t\t" << TAI->getCommentString() << " " << I->getName()
+      << "\n";
+    if (TAI->hasDotTypeDotSizeDirective())
+      O << "\t.size " << name << ", " << Size << "\n";
+    // If the initializer is an extern weak symbol, remember to emit the weak
+    // reference!
+    if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+      if (GV->hasExternalWeakLinkage())
+        ExtWeakSymbols.insert(GV);
+
+    EmitGlobalConstant(C);
+  }
+  
+  // Output linker support code for dllexported globals
+  if (DLLExportedGVs.begin() != DLLExportedGVs.end()) {
+    SwitchToDataSection(".section .drectve");
+  }
+
+  for (std::set<std::string>::iterator i = DLLExportedGVs.begin(),
+         e = DLLExportedGVs.end();
+         i != e; ++i) {
+    O << "\t.ascii \" -export:" << *i << ",data\"\n";
+  }    
+
+  if (DLLExportedFns.begin() != DLLExportedFns.end()) {
+    SwitchToDataSection(".section .drectve");
+  }
+
+  for (std::set<std::string>::iterator i = DLLExportedFns.begin(),
+         e = DLLExportedFns.end();
+         i != e; ++i) {
+    O << "\t.ascii \" -export:" << *i << "\"\n";
+  }    
+
+  if (Subtarget->isTargetDarwin()) {
+    SwitchToDataSection("");
+
+    // Output stubs for dynamically-linked functions
+    unsigned j = 1;
+    for (std::set<std::string>::iterator i = FnStubs.begin(), e = FnStubs.end();
+         i != e; ++i, ++j) {
+      SwitchToDataSection(".section __IMPORT,__jump_table,symbol_stubs,"
+                          "self_modifying_code+pure_instructions,5", 0);
+      O << "L" << *i << "$stub:\n";
+      O << "\t.indirect_symbol " << *i << "\n";
+      O << "\thlt ; hlt ; hlt ; hlt ; hlt\n";
+    }
+
+    O << "\n";
+
+    // Output stubs for external and common global variables.
+    if (GVStubs.begin() != GVStubs.end())
+      SwitchToDataSection(
+                    ".section __IMPORT,__pointers,non_lazy_symbol_pointers");
+    for (std::set<std::string>::iterator i = GVStubs.begin(), e = GVStubs.end();
+         i != e; ++i) {
+      O << "L" << *i << "$non_lazy_ptr:\n";
+      O << "\t.indirect_symbol " << *i << "\n";
+      O << "\t.long\t0\n";
+    }
+
+    // Emit final debug information.
+    DW.EndModule();
+
+    // Funny Darwin hack: This flag tells the linker that no global symbols
+    // contain code that falls through to other global symbols (e.g. the obvious
+    // implementation of multiple entry points).  If this doesn't occur, the
+    // linker can safely perform dead code stripping.  Since LLVM never
+    // generates code that does this, it is always safe to set.
+    O << "\t.subsections_via_symbols\n";
+  } else if (Subtarget->isTargetCygMing()) {
+    // Emit type information for external functions
+    for (std::set<std::string>::iterator i = FnStubs.begin(), e = FnStubs.end();
+         i != e; ++i) {
+      O << "\t.def\t " << *i
+        << ";\t.scl\t" << COFF::C_EXT
+        << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT)
+        << ";\t.endef\n";
+    }
+    
+    // Emit final debug information.
+    DW.EndModule();    
+  } else if (Subtarget->isTargetELF()) {
+    // Emit final debug information.
+    DW.EndModule();
+  }
+
+  AsmPrinter::doFinalization(M);
+  return false; // success
+}
+
+/// createX86CodePrinterPass - Returns a pass that prints the X86 assembly code
+/// for a MachineFunction to the given output stream, using the given target
+/// machine description.
+///
+FunctionPass *llvm::createX86CodePrinterPass(std::ostream &o,
+                                             X86TargetMachine &tm) {
+  const X86Subtarget *Subtarget = &tm.getSubtarget<X86Subtarget>();
+
+  if (Subtarget->isFlavorIntel()) {
+    return new X86IntelAsmPrinter(o, tm, tm.getTargetAsmInfo());
+  } else {
+    return new X86ATTAsmPrinter(o, tm, tm.getTargetAsmInfo());
+  }
+}
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
new file mode 100755
index 0000000..45be89e
--- /dev/null
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -0,0 +1,97 @@
+//===-- X86AsmPrinter.h - Convert X86 LLVM code to Intel assembly ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the shared superclass printer that converts from our
+// internal representation of machine-dependent LLVM code to Intel and AT&T
+// format assembly language.  This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ASMPRINTER_H
+#define X86ASMPRINTER_H
+
+#include "X86.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Support/Compiler.h"
+#include <set>
+
+
+namespace llvm {
+
+struct VISIBILITY_HIDDEN X86SharedAsmPrinter : public AsmPrinter {
+  DwarfWriter DW;
+
+  X86SharedAsmPrinter(std::ostream &O, X86TargetMachine &TM,
+                      const TargetAsmInfo *T)
+    : AsmPrinter(O, TM, T), DW(O, this, T) {
+    Subtarget = &TM.getSubtarget<X86Subtarget>();
+  }
+
+  // We have to propagate some information about the MachineFunction to the
+  // AsmPrinter. That is fine while we're printing the function itself, since
+  // we have access to the MachineFunction and can get the appropriate
+  // MachineFunctionInfo. Unfortunately, it is not possible when we're printing
+  // a reference to a Function (e.g. when calling it): there is no way to get
+  // the corresponding MachineFunction, which may not even have been created
+  // yet. That's why we collect all necessary information in an additional
+  // structure.
+  //
+  // This structure is used, e.g., for name decoration of stdcall and fastcall
+  // functions, since the decoration depends on the arguments' size.
+  typedef std::map<const Function*, X86MachineFunctionInfo> FMFInfoMap;
+  FMFInfoMap FunctionInfoMap;
+
+  void decorateName(std::string& Name, const GlobalValue* GV);
+
+  bool doInitialization(Module &M);
+  bool doFinalization(Module &M);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+    if (Subtarget->isTargetDarwin() ||
+        Subtarget->isTargetELF() ||
+        Subtarget->isTargetCygMing()) {
+      AU.addRequired<MachineModuleInfo>();
+    }
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  const X86Subtarget *Subtarget;
+
+  // Necessary for Darwin to print out the appropriate types of linker stubs
+  std::set<std::string> FnStubs, GVStubs, LinkOnceStubs;
+
+  // Necessary for dllexport support
+  std::set<std::string> DLLExportedFns, DLLExportedGVs;
+
+  inline static bool isScale(const MachineOperand &MO) {
+    return MO.isImmediate() &&
+          (MO.getImmedValue() == 1 || MO.getImmedValue() == 2 ||
+          MO.getImmedValue() == 4 || MO.getImmedValue() == 8);
+  }
+
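+  // isMem - Return true if operand Op of MI begins a memory reference: either
+  // a lone frame index, or the usual four-operand group of base register (Op),
+  // scale immediate (Op+1), index register (Op+2) and displacement (Op+3).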
+  inline static bool isMem(const MachineInstr *MI, unsigned Op) {
+    if (MI->getOperand(Op).isFrameIndex()) return true;
+    return Op+4 <= MI->getNumOperands() &&
+      MI->getOperand(Op  ).isRegister() && isScale(MI->getOperand(Op+1)) &&
+      MI->getOperand(Op+2).isRegister() &&
+      (MI->getOperand(Op+3).isImmediate() ||
+       MI->getOperand(Op+3).isGlobalAddress() ||
+       MI->getOperand(Op+3).isConstantPoolIndex() ||
+       MI->getOperand(Op+3).isJumpTableIndex());
+  }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86COFF.h b/lib/Target/X86/X86COFF.h
new file mode 100644
index 0000000..75892ef
--- /dev/null
+++ b/lib/Target/X86/X86COFF.h
@@ -0,0 +1,95 @@
+//===--- X86COFF.h - Some definitions from the COFF documentation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Anton Korobeynikov and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file just defines some symbols found in COFF documentation. They are
+// used to emit function type information for COFF targets (Cygwin/Mingw32).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86COFF_H
+#define X86COFF_H
+
+namespace COFF {
+/// Storage class tells where and what the symbol represents
+enum StorageClass {
+  C_EFCN =   -1,  ///< Physical end of function
+  C_NULL    = 0,  ///< No symbol
+  C_AUTO    = 1,  ///< Automatic variable
+  C_EXT     = 2,  ///< External symbol
+  C_STAT    = 3,  ///< Static
+  C_REG     = 4,  ///< Register variable
+  C_EXTDEF  = 5,  ///< External definition
+  C_LABEL   = 6,  ///< Label
+  C_ULABEL  = 7,  ///< Undefined label
+  C_MOS     = 8,  ///< Member of structure
+  C_ARG     = 9,  ///< Function argument
+  C_STRTAG  = 10, ///< Structure tag
+  C_MOU     = 11, ///< Member of union
+  C_UNTAG   = 12, ///< Union tag
+  C_TPDEF   = 13, ///< Type definition
+  C_USTATIC = 14, ///< Undefined static
+  C_ENTAG   = 15, ///< Enumeration tag
+  C_MOE     = 16, ///< Member of enumeration
+  C_REGPARM = 17, ///< Register parameter
+  C_FIELD   = 18, ///< Bit field
+
+  C_BLOCK  = 100, ///< ".bb" or ".eb" - beginning or end of block
+  C_FCN    = 101, ///< ".bf" or ".ef" - beginning or end of function
+  C_EOS    = 102, ///< End of structure
+  C_FILE   = 103, ///< File name
+  C_LINE   = 104, ///< Line number, reformatted as symbol
+  C_ALIAS  = 105, ///< Duplicate tag
+  C_HIDDEN = 106  ///< External symbol in dmert public lib
+};
+
+/// The type of the symbol. This is made up of a base type and a derived type.
+/// For example, "pointer to int" has derived type "pointer to T" and base
+/// type "int".
+enum SymbolType {
+  T_NULL   = 0,  ///< No type info
+  T_ARG    = 1,  ///< Void function argument (only used by compiler)
+  T_VOID   = 1,  ///< The same as above. Just named differently in some specs.
+  T_CHAR   = 2,  ///< Character
+  T_SHORT  = 3,  ///< Short integer
+  T_INT    = 4,  ///< Integer
+  T_LONG   = 5,  ///< Long integer
+  T_FLOAT  = 6,  ///< Floating point
+  T_DOUBLE = 7,  ///< Double word
+  T_STRUCT = 8,  ///< Structure
+  T_UNION  = 9,  ///< Union
+  T_ENUM   = 10, ///< Enumeration
+  T_MOE    = 11, ///< Member of enumeration
+  T_UCHAR  = 12, ///< Unsigned character
+  T_USHORT = 13, ///< Unsigned short
+  T_UINT   = 14, ///< Unsigned integer
+  T_ULONG  = 15  ///< Unsigned long
+};
+
+/// Derived type of symbol
+enum SymbolDerivedType {
+  DT_NON = 0, ///< No derived type
+  DT_PTR = 1, ///< Pointer to T
+  DT_FCN = 2, ///< Function returning T
+  DT_ARY = 3  ///< Array of T
+};
+
+/// Masks for extracting parts of type
+enum SymbolTypeMasks {
+  N_BTMASK = 017, ///< Mask for base type
+  N_TMASK  = 060  ///< Mask for derived type
+};
+
+/// Offsets of parts of type
+enum Shifts {
+  N_BTSHFT = 4 ///< Type is formed as (base | (derived << N_BTSHFT))
+};
+
+}
+
+#endif // X86COFF_H
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
new file mode 100644
index 0000000..39811bd7
--- /dev/null
+++ b/lib/Target/X86/X86CallingConv.td
@@ -0,0 +1,172 @@
+//===- X86CallingConv.td - Calling Conventions for X86 32/64 ----*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the X86-32 and X86-64
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<X86Subtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Return-value conventions common to all X86 CC's.
+def RetCC_X86Common : CallingConv<[
+  // Scalar values are returned in AX first, then DX.
+  CCIfType<[i8] , CCAssignToReg<[AL]>>,
+  CCIfType<[i16], CCAssignToReg<[AX]>>,
+  CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>,
+  CCIfType<[i64], CCAssignToReg<[RAX, RDX]>>,
+  
+  // Vector types are returned in XMM0 and XMM1, when they fit.  If the target
+  // doesn't have XMM registers, it won't have vector types.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+            CCAssignToReg<[XMM0,XMM1]>>,
+
+  // MMX vector types are always returned in MM0. If the target doesn't have
+  // MM0, it doesn't support these vector types.
+  CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToReg<[MM0]>>
+]>;
+
+// X86-32 C return-value convention.
+def RetCC_X86_32_C : CallingConv<[
+  // The X86-32 calling convention returns FP values in ST0, otherwise it is the
+  // same as the common X86 calling conv.
+  CCIfType<[f32], CCAssignToReg<[ST0]>>,
+  CCIfType<[f64], CCAssignToReg<[ST0]>>,
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 FastCC return-value convention.
+def RetCC_X86_32_Fast : CallingConv<[
+  // The X86-32 fastcc returns FP values in XMM0 if the target has SSE2,
+  // otherwise it is the same as the C calling convention.
+  CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0]>>>,
+  CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0]>>>,
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-64 C return-value convention.
+def RetCC_X86_64_C : CallingConv<[
+  // The X86-64 calling convention always returns FP values in XMM0.
+  CCIfType<[f32], CCAssignToReg<[XMM0]>>,
+  CCIfType<[f64], CCAssignToReg<[XMM0]>>,
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+
+
+// This is the root return-value convention for the X86-32 backend.
+def RetCC_X86_32 : CallingConv<[
+  // If FastCC, use RetCC_X86_32_Fast.
+  CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+  // Otherwise, use RetCC_X86_32_C.
+  CCDelegateTo<RetCC_X86_32_C>
+]>;
+
+// This is the root return-value convention for the X86-64 backend.
+def RetCC_X86_64 : CallingConv<[
+  // Always just the same as C calling conv for X86-64.
+  CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// This is the return-value convention used for the entire X86 backend.
+def RetCC_X86 : CallingConv<[
+  CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
+  CCDelegateTo<RetCC_X86_32>
+]>;
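+
+// Informal example of how the dispatch above composes: an i32 returned from a
+// C-convention function on X86-32 goes RetCC_X86 -> RetCC_X86_32 ->
+// RetCC_X86_32_C -> RetCC_X86Common and lands in EAX (with EDX holding the
+// second half when an i64 is returned as two i32 parts).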
+
+//===----------------------------------------------------------------------===//
+// X86-64 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+def CC_X86_64_C : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+  
+  CCIfStruct<CCStructAssign<[RDI, RSI, RDX, RCX, R8, R9 ]>>,
+
+  // The first 6 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
+  CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+  
+  // The first 8 FP/Vector arguments are passed in XMM registers.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+              CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>,
+
+  // The first 6 MMX vector arguments are passed in GPRs.
+  CCIfType<[v8i8, v4i16, v2i32, v1i64],
+              CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+  
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+  // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+  CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+]>;
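+
+// Informal reading of the table above: for a call such as f(int, double, int),
+// the first int is assigned EDI, the double XMM0, and the second int ESI,
+// since the integer and FP/vector register pools are consumed independently.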
+
+
+//===----------------------------------------------------------------------===//
+// X86 C Calling Convention
+//===----------------------------------------------------------------------===//
+
+/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
+/// values are spilled on the stack, and the first 4 vector values go in XMM
+/// regs.
+def CC_X86_32_Common : CallingConv<[
+  // Integer/Float values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+  
+  // Doubles get 8-byte slots that are 4-byte aligned.
+  CCIfType<[f64], CCAssignToStack<8, 4>>,
+  
+  // The first 4 vector arguments are passed in XMM registers.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+              CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+  // Other vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+  // __m64 vectors get 8-byte stack slots that are 8-byte aligned. They are
+  // passed in the parameter area.
+  CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_X86_32_C : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+  
+  // The first 3 integer arguments, if marked 'inreg' and if the call is not
+  // a vararg call, are passed in integer registers.
+  CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>,
+  
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+
+def CC_X86_32_FastCall : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+  
+  // The first 2 integer arguments are passed in ECX/EDX
+  CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+  
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
new file mode 100644
index 0000000..8b22634
--- /dev/null
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -0,0 +1,824 @@
+//===-- X86/X86CodeEmitter.cpp - Convert X86 code to machine code ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the X86 machine instructions into
+// relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-emitter"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "X86Relocations.h"
+#include "X86.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+  class VISIBILITY_HIDDEN Emitter : public MachineFunctionPass {
+    const X86InstrInfo  *II;
+    const TargetData    *TD;
+    TargetMachine       &TM;
+    MachineCodeEmitter  &MCE;
+    bool Is64BitMode;
+  public:
+    static char ID;
+    explicit Emitter(TargetMachine &tm, MachineCodeEmitter &mce)
+      : MachineFunctionPass((intptr_t)&ID), II(0), TD(0), TM(tm), 
+      MCE(mce), Is64BitMode(false) {}
+    Emitter(TargetMachine &tm, MachineCodeEmitter &mce,
+            const X86InstrInfo &ii, const TargetData &td, bool is64)
+      : MachineFunctionPass((intptr_t)&ID), II(&ii), TD(&td), TM(tm), 
+      MCE(mce), Is64BitMode(is64) {}
+
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "X86 Machine Code Emitter";
+    }
+
+    void emitInstruction(const MachineInstr &MI);
+
+  private:
+    void emitPCRelativeBlockAddress(MachineBasicBlock *MBB);
+    void emitPCRelativeValue(intptr_t Address);
+    void emitGlobalAddressForCall(GlobalValue *GV, bool DoesntNeedStub);
+    void emitGlobalAddressForPtr(GlobalValue *GV, unsigned Reloc,
+                                 int Disp = 0, unsigned PCAdj = 0);
+    void emitExternalSymbolAddress(const char *ES, unsigned Reloc);
+    void emitConstPoolAddress(unsigned CPI, unsigned Reloc, int Disp = 0,
+                              unsigned PCAdj = 0);
+    void emitJumpTableAddress(unsigned JTI, unsigned Reloc, unsigned PCAdj = 0);
+
+    void emitDisplacementField(const MachineOperand *RelocOp, int DispVal,
+                               unsigned PCAdj = 0);
+
+    void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField);
+    void emitSIBByte(unsigned SS, unsigned Index, unsigned Base);
+    void emitConstant(uint64_t Val, unsigned Size);
+
+    void emitMemModRMByte(const MachineInstr &MI,
+                          unsigned Op, unsigned RegOpcodeField,
+                          unsigned PCAdj = 0);
+
+    unsigned getX86RegNum(unsigned RegNo);
+    bool isX86_64ExtendedReg(const MachineOperand &MO);
+    unsigned determineREX(const MachineInstr &MI);
+  };
+  char Emitter::ID = 0;
+}
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified MCE object.
+FunctionPass *llvm::createX86CodeEmitterPass(X86TargetMachine &TM,
+                                             MachineCodeEmitter &MCE) {
+  return new Emitter(TM, MCE);
+}
+
+bool Emitter::runOnMachineFunction(MachineFunction &MF) {
+  assert((MF.getTarget().getRelocationModel() == Reloc::Default ||
+          MF.getTarget().getRelocationModel() == Reloc::Static) &&
+         "JIT relocation model must be set to static or default!");
+  II = ((X86TargetMachine&)MF.getTarget()).getInstrInfo();
+  TD = ((X86TargetMachine&)MF.getTarget()).getTargetData();
+  Is64BitMode =
+    ((X86TargetMachine&)MF.getTarget()).getSubtarget<X86Subtarget>().is64Bit();
+
+  do {
+    MCE.startFunction(MF);
+    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); 
+         MBB != E; ++MBB) {
+      MCE.StartMachineBasicBlock(MBB);
+      for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+           I != E; ++I)
+        emitInstruction(*I);
+    }
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+/// emitPCRelativeValue - Emit a PC relative address.
+///
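+/// The emitted value is Address minus the address of the end of the 4-byte
+/// displacement field, since x86 PC-relative displacements are measured from
+/// the next instruction.
+///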
+void Emitter::emitPCRelativeValue(intptr_t Address) {
+  MCE.emitWordLE(Address-MCE.getCurrentPCValue()-4);
+}
+
+/// emitPCRelativeBlockAddress - This method keeps track of the information
+/// necessary to resolve the address of this block later and emits a dummy
+/// value.
+///
+void Emitter::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) {
+  // Remember where this reference was and where it is to so we can
+  // deal with it later.
+  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                             X86::reloc_pcrel_word, MBB));
+  MCE.emitWordLE(0);
+}
+
+/// emitGlobalAddressForCall - Emit the specified address to the code stream
+/// assuming this is part of a function call, which is PC relative.
+///
+void Emitter::emitGlobalAddressForCall(GlobalValue *GV, bool DoesntNeedStub) {
+  MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(),
+                                      X86::reloc_pcrel_word, GV, 0,
+                                      DoesntNeedStub));
+  MCE.emitWordLE(0);
+}
+
+/// emitGlobalAddressForPtr - Emit the specified address to the code stream
+/// assuming this is part of a "take the address of a global" instruction.
+///
+void Emitter::emitGlobalAddressForPtr(GlobalValue *GV, unsigned Reloc,
+                                      int Disp /* = 0 */,
+                                      unsigned PCAdj /* = 0 */) {
+  MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                                             GV, PCAdj));
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitWordLE(0);
+  MCE.emitWordLE(Disp); // The relocated value will be added to the displacement
+}
+
+/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+void Emitter::emitExternalSymbolAddress(const char *ES, unsigned Reloc) {
+  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                                 Reloc, ES));
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitWordLE(0);
+  MCE.emitWordLE(0);
+}
+
+/// emitConstPoolAddress - Arrange for the address of a constant pool entry
+/// to be emitted to the current location in the function, and allow it to be PC
+/// relative.
+void Emitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc,
+                                   int Disp /* = 0 */,
+                                   unsigned PCAdj /* = 0 */) {
+  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                                    Reloc, CPI, PCAdj));
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitWordLE(0);
+  MCE.emitWordLE(Disp); // The relocated value will be added to the displacement
+}
+
+/// emitJumpTableAddress - Arrange for the address of a jump table to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+void Emitter::emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+                                   unsigned PCAdj /* = 0 */) {
+  MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+                                                    Reloc, JTI, PCAdj));
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitWordLE(0);
+  MCE.emitWordLE(0); // The relocated value will be added to the displacement
+}
+
+/// N86 namespace - Native X86 Register numbers... used by X86 backend.
+///
+namespace N86 {
+  enum {
+    EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
+  };
+}
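+
+// Note that the extended registers R8-R15 and XMM8-XMM15 reuse these same
+// low three bits in getX86RegNum() below; the fourth bit is supplied
+// separately via the REX prefix (see determineREX).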
+
+// getX86RegNum - This function maps LLVM register identifiers to their X86
+// specific numbering, which is used in various places encoding instructions.
+//
+unsigned Emitter::getX86RegNum(unsigned RegNo) {
+  switch(RegNo) {
+  case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
+  case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX;
+  case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX;
+  case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX;
+  case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH:
+    return N86::ESP;
+  case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH:
+    return N86::EBP;
+  case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH:
+    return N86::ESI;
+  case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH:
+    return N86::EDI;
+
+  case X86::R8:  case X86::R8D:  case X86::R8W:  case X86::R8B:
+    return N86::EAX;
+  case X86::R9:  case X86::R9D:  case X86::R9W:  case X86::R9B:
+    return N86::ECX;
+  case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
+    return N86::EDX;
+  case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
+    return N86::EBX;
+  case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
+    return N86::ESP;
+  case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
+    return N86::EBP;
+  case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
+    return N86::ESI;
+  case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
+    return N86::EDI;
+
+  case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3:
+  case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7:
+    return RegNo-X86::ST0;
+
+  case X86::XMM0:  case X86::XMM1:  case X86::XMM2:  case X86::XMM3:
+  case X86::XMM4:  case X86::XMM5:  case X86::XMM6:  case X86::XMM7:
+    return II->getRegisterInfo().getDwarfRegNum(RegNo) -
+           II->getRegisterInfo().getDwarfRegNum(X86::XMM0);
+  case X86::XMM8:  case X86::XMM9:  case X86::XMM10: case X86::XMM11:
+  case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
+    return II->getRegisterInfo().getDwarfRegNum(RegNo) -
+           II->getRegisterInfo().getDwarfRegNum(X86::XMM8);
+
+  default:
+    assert(MRegisterInfo::isVirtualRegister(RegNo) &&
+           "Unknown physical register!");
+    assert(0 && "Register allocator hasn't allocated reg correctly yet!");
+    return 0;
+  }
+}
+
+inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+                                      unsigned RM) {
+  assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+  return RM | (RegOpcode << 3) | (Mod << 6);
+}
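+
+// For reference, the byte produced above is laid out as mod[7:6],
+// reg/opcode[5:3], r/m[2:0]; e.g. ModRMByte(3, 0, N86::ECX) yields 0xC1, the
+// register-direct form with /0 and ECX in the r/m field.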
+
+void Emitter::emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeFld){
+  MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
+}
+
+void Emitter::emitSIBByte(unsigned SS, unsigned Index, unsigned Base) {
+  // SIB byte is in the same format as the ModRMByte...
+  MCE.emitByte(ModRMByte(SS, Index, Base));
+}
+
+void Emitter::emitConstant(uint64_t Val, unsigned Size) {
+  // Output the constant in little endian byte order...
+  for (unsigned i = 0; i != Size; ++i) {
+    MCE.emitByte(Val & 255);
+    Val >>= 8;
+  }
+}
+
+/// isDisp8 - Return true if this signed displacement fits in an 8-bit
+/// sign-extended field. 
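+/// (For example, 127 and -128 fit, 128 does not.)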
+static bool isDisp8(int Value) {
+  return Value == (signed char)Value;
+}
+
+void Emitter::emitDisplacementField(const MachineOperand *RelocOp,
+                                    int DispVal, unsigned PCAdj) {
+  // If this is a simple integer displacement that doesn't require a relocation,
+  // emit it now.
+  if (!RelocOp) {
+    emitConstant(DispVal, 4);
+    return;
+  }
+  
+  // Otherwise, this is something that requires a relocation.  Emit it as such
+  // now.
+  if (RelocOp->isGlobalAddress()) {
+    // In 64-bit static small code model, we could potentially emit absolute.
+    // But it's probably not beneficial.
+    //  89 05 00 00 00 00    	mov    %eax,0(%rip)  # PC-relative
+    //	89 04 25 00 00 00 00 	mov    %eax,0x0      # Absolute
+    unsigned rt= Is64BitMode ? X86::reloc_pcrel_word : X86::reloc_absolute_word;
+    emitGlobalAddressForPtr(RelocOp->getGlobal(), rt,
+                            RelocOp->getOffset(), PCAdj);
+  } else if (RelocOp->isConstantPoolIndex()) {
+    // Must be in 64-bit mode.
+    emitConstPoolAddress(RelocOp->getConstantPoolIndex(), X86::reloc_pcrel_word,
+                         RelocOp->getOffset(), PCAdj);
+  } else if (RelocOp->isJumpTableIndex()) {
+    // Must be in 64-bit mode.
+    emitJumpTableAddress(RelocOp->getJumpTableIndex(), X86::reloc_pcrel_word,
+                         PCAdj);
+  } else {
+    assert(0 && "Unknown value to relocate!");
+  }
+}
+
+void Emitter::emitMemModRMByte(const MachineInstr &MI,
+                               unsigned Op, unsigned RegOpcodeField,
+                               unsigned PCAdj) {
+  const MachineOperand &Op3 = MI.getOperand(Op+3);
+  int DispVal = 0;
+  const MachineOperand *DispForReloc = 0;
+  
+  // Figure out what sort of displacement we have to handle here.
+  if (Op3.isGlobalAddress()) {
+    DispForReloc = &Op3;
+  } else if (Op3.isConstantPoolIndex()) {
+    if (Is64BitMode) {
+      DispForReloc = &Op3;
+    } else {
+      DispVal += MCE.getConstantPoolEntryAddress(Op3.getConstantPoolIndex());
+      DispVal += Op3.getOffset();
+    }
+  } else if (Op3.isJumpTableIndex()) {
+    if (Is64BitMode) {
+      DispForReloc = &Op3;
+    } else {
+      DispVal += MCE.getJumpTableEntryAddress(Op3.getJumpTableIndex());
+    }
+  } else {
+    DispVal = Op3.getImm();
+  }
+
+  const MachineOperand &Base     = MI.getOperand(Op);
+  const MachineOperand &Scale    = MI.getOperand(Op+1);
+  const MachineOperand &IndexReg = MI.getOperand(Op+2);
+
+  unsigned BaseReg = Base.getReg();
+
+  // Is a SIB byte needed?
+  if (IndexReg.getReg() == 0 &&
+      (BaseReg == 0 || getX86RegNum(BaseReg) != N86::ESP)) {
+    if (BaseReg == 0) {  // Just a displacement?
+      // Emit special case [disp32] encoding
+      MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
+      
+      emitDisplacementField(DispForReloc, DispVal, PCAdj);
+    } else {
+      unsigned BaseRegNo = getX86RegNum(BaseReg);
+      if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
+        // Emit simple indirect register encoding... [EAX] f.e.
+        MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo));
+      } else if (!DispForReloc && isDisp8(DispVal)) {
+        // Emit the disp8 encoding... [REG+disp8]
+        MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo));
+        emitConstant(DispVal, 1);
+      } else {
+        // Emit the most general non-SIB encoding: [REG+disp32]
+        MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
+        emitDisplacementField(DispForReloc, DispVal, PCAdj);
+      }
+    }
+
+  } else {  // We need a SIB byte, so start by outputting the ModR/M byte first
+    assert(IndexReg.getReg() != X86::ESP &&
+           IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+    bool ForceDisp32 = false;
+    bool ForceDisp8  = false;
+    if (BaseReg == 0) {
+      // If there is no base register, we emit the special case SIB byte with
+      // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
+      MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+      ForceDisp32 = true;
+    } else if (DispForReloc) {
+      // Emit the normal disp32 encoding.
+      MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+      ForceDisp32 = true;
+    } else if (DispVal == 0 && getX86RegNum(BaseReg) != N86::EBP) {
+      // Emit no displacement ModR/M byte
+      MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+    } else if (isDisp8(DispVal)) {
+      // Emit the disp8 encoding...
+      MCE.emitByte(ModRMByte(1, RegOpcodeField, 4));
+      ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
+    } else {
+      // Emit the normal disp32 encoding...
+      MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+    }
+
+    // Calculate what the SS field value should be...
+    static const unsigned SSTable[] = { ~0, 0, 1, ~0, 2, ~0, ~0, ~0, 3 };
+    unsigned SS = SSTable[Scale.getImm()];
+
+    if (BaseReg == 0) {
+      // Handle the SIB byte for the case where there is no base.  The
+      // displacement has already been output.
+      assert(IndexReg.getReg() && "Index register must be specified!");
+      emitSIBByte(SS, getX86RegNum(IndexReg.getReg()), 5);
+    } else {
+      unsigned BaseRegNo = getX86RegNum(BaseReg);
+      unsigned IndexRegNo;
+      if (IndexReg.getReg())
+        IndexRegNo = getX86RegNum(IndexReg.getReg());
+      else
+        IndexRegNo = 4;   // For example [ESP+1*<noreg>+4]
+      emitSIBByte(SS, IndexRegNo, BaseRegNo);
+    }
+
+    // Do we need to output a displacement?
+    if (ForceDisp8) {
+      emitConstant(DispVal, 1);
+    } else if (DispVal != 0 || ForceDisp32) {
+      emitDisplacementField(DispForReloc, DispVal, PCAdj);
+    }
+  }
+}
+
+static unsigned sizeOfImm(const TargetInstrDescriptor *Desc) {
+  switch (Desc->TSFlags & X86II::ImmMask) {
+  case X86II::Imm8:   return 1;
+  case X86II::Imm16:  return 2;
+  case X86II::Imm32:  return 4;
+  case X86II::Imm64:  return 8;
+  default: assert(0 && "Immediate size not set!");
+    return 0;
+  }
+}
+
+/// isX86_64ExtendedReg - Is the MachineOperand an x86-64 extended register?
+/// e.g. r8, xmm8, etc.
+bool Emitter::isX86_64ExtendedReg(const MachineOperand &MO) {
+  if (!MO.isRegister()) return false;
+  unsigned RegNo = MO.getReg();
+  int DWNum = II->getRegisterInfo().getDwarfRegNum(RegNo);
+  if (DWNum >= II->getRegisterInfo().getDwarfRegNum(X86::R8) &&
+      DWNum <= II->getRegisterInfo().getDwarfRegNum(X86::R15))
+    return true;
+  if (DWNum >= II->getRegisterInfo().getDwarfRegNum(X86::XMM8) &&
+      DWNum <= II->getRegisterInfo().getDwarfRegNum(X86::XMM15))
+    return true;
+  return false;
+}
+
+inline static bool isX86_64TruncToByte(unsigned oc) {
+  return (oc == X86::TRUNC_64to8 || oc == X86::TRUNC_32to8 ||
+          oc == X86::TRUNC_16to8);
+}
+
+
+inline static bool isX86_64NonExtLowByteReg(unsigned reg) {
+  return (reg == X86::SPL || reg == X86::BPL ||
+          reg == X86::SIL || reg == X86::DIL);
+}
+
+/// determineREX - Determine if the MachineInstr has to be encoded with an
+/// X86-64 REX prefix which specifies 1) 64-bit instructions, 2) non-default
+/// operand size, and 3) use of X86-64 extended registers.
+unsigned Emitter::determineREX(const MachineInstr &MI) {
+  unsigned REX = 0;
+  const TargetInstrDescriptor *Desc = MI.getInstrDescriptor();
+  unsigned Opcode = Desc->Opcode;
+
+  // Pseudo instructions do not need REX prefix byte.
+  if ((Desc->TSFlags & X86II::FormMask) == X86II::Pseudo)
+    return 0;
+  if (Desc->TSFlags & X86II::REX_W)
+    REX |= 1 << 3;
+
+  unsigned NumOps = Desc->numOperands;
+  if (NumOps) {
+    bool isTwoAddr = NumOps > 1 &&
+      Desc->getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+    // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+    bool isTrunc8 = isX86_64TruncToByte(Opcode);
+    unsigned i = isTwoAddr ? 1 : 0;
+    for (unsigned e = NumOps; i != e; ++i) {
+      const MachineOperand& MO = MI.getOperand(i);
+      if (MO.isRegister()) {
+        unsigned Reg = MO.getReg();
+        // Truncations to byte are actually movb instructions; the real source
+        // operand is the low byte of the register.
+        if (isTrunc8 && i == 1)
+          Reg = getX86SubSuperRegister(Reg, MVT::i8);
+        if (isX86_64NonExtLowByteReg(Reg))
+          REX |= 0x40;
+      }
+    }
+
+    switch (Desc->TSFlags & X86II::FormMask) {
+    case X86II::MRMInitReg:
+      if (isX86_64ExtendedReg(MI.getOperand(0)))
+        REX |= (1 << 0) | (1 << 2);
+      break;
+    case X86II::MRMSrcReg: {
+      if (isX86_64ExtendedReg(MI.getOperand(0)))
+        REX |= 1 << 2;
+      i = isTwoAddr ? 2 : 1;
+      for (unsigned e = NumOps; i != e; ++i) {
+        const MachineOperand& MO = MI.getOperand(i);
+        if (isX86_64ExtendedReg(MO))
+          REX |= 1 << 0;
+      }
+      break;
+    }
+    case X86II::MRMSrcMem: {
+      if (isX86_64ExtendedReg(MI.getOperand(0)))
+        REX |= 1 << 2;
+      unsigned Bit = 0;
+      i = isTwoAddr ? 2 : 1;
+      for (; i != NumOps; ++i) {
+        const MachineOperand& MO = MI.getOperand(i);
+        if (MO.isRegister()) {
+          if (isX86_64ExtendedReg(MO))
+            REX |= 1 << Bit;
+          Bit++;
+        }
+      }
+      break;
+    }
+    case X86II::MRM0m: case X86II::MRM1m:
+    case X86II::MRM2m: case X86II::MRM3m:
+    case X86II::MRM4m: case X86II::MRM5m:
+    case X86II::MRM6m: case X86II::MRM7m:
+    case X86II::MRMDestMem: {
+      unsigned e = isTwoAddr ? 5 : 4;
+      i = isTwoAddr ? 1 : 0;
+      if (NumOps > e && isX86_64ExtendedReg(MI.getOperand(e)))
+        REX |= 1 << 2;
+      unsigned Bit = 0;
+      for (; i != e; ++i) {
+        const MachineOperand& MO = MI.getOperand(i);
+        if (MO.isRegister()) {
+          if (isX86_64ExtendedReg(MO))
+            REX |= 1 << Bit;
+          Bit++;
+        }
+      }
+      break;
+    }
+    default: {
+      if (isX86_64ExtendedReg(MI.getOperand(0)))
+        REX |= 1 << 0;
+      i = isTwoAddr ? 2 : 1;
+      for (unsigned e = NumOps; i != e; ++i) {
+        const MachineOperand& MO = MI.getOperand(i);
+        if (isX86_64ExtendedReg(MO))
+          REX |= 1 << 2;
+      }
+      break;
+    }
+    }
+  }
+  return REX;
+}
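+
+// The value computed above is OR'd with 0x40 by the caller to form the final
+// REX prefix byte 0100WRXB: bit 3 is REX.W (64-bit operand size), bit 2 REX.R
+// (extends the ModRM reg field), bit 1 REX.X (extends the SIB index), and
+// bit 0 REX.B (extends the ModRM r/m or SIB base field).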
+
+void Emitter::emitInstruction(const MachineInstr &MI) {
+  NumEmitted++;  // Keep track of the # of mi's emitted
+
+  const TargetInstrDescriptor *Desc = MI.getInstrDescriptor();
+  unsigned Opcode = Desc->Opcode;
+
+  // Emit the repeat opcode prefix as needed.
+  if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP) MCE.emitByte(0xF3);
+
+  // Emit the operand size opcode prefix as needed.
+  if (Desc->TSFlags & X86II::OpSize) MCE.emitByte(0x66);
+
+  // Emit the address size opcode prefix as needed.
+  if (Desc->TSFlags & X86II::AdSize) MCE.emitByte(0x67);
+
+  bool Need0FPrefix = false;
+  switch (Desc->TSFlags & X86II::Op0Mask) {
+  case X86II::TB:
+    Need0FPrefix = true;   // Two-byte opcode prefix
+    break;
+  case X86II::T8:
+    MCE.emitByte(0x0F);
+    MCE.emitByte(0x38);
+    break;
+  case X86II::TA:
+    MCE.emitByte(0x0F);
+    MCE.emitByte(0x3A);
+    break;
+  case X86II::REP: break; // already handled.
+  case X86II::XS:   // F3 0F
+    MCE.emitByte(0xF3);
+    Need0FPrefix = true;
+    break;
+  case X86II::XD:   // F2 0F
+    MCE.emitByte(0xF2);
+    Need0FPrefix = true;
+    break;
+  case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
+  case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
+    MCE.emitByte(0xD8+
+                 (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8)
+                                   >> X86II::Op0Shift));
+    break; // Two-byte opcode prefix
+  default: assert(0 && "Invalid prefix!");
+  case 0: break;  // No prefix!
+  }
+
+  if (Is64BitMode) {
+    // REX prefix
+    unsigned REX = determineREX(MI);
+    if (REX)
+      MCE.emitByte(0x40 | REX);
+  }
+
+  // 0x0F escape code must be emitted just before the opcode.
+  if (Need0FPrefix)
+    MCE.emitByte(0x0F);
+
+  // If this is a two-address instruction, skip one of the register operands.
+  unsigned NumOps = Desc->numOperands;
+  unsigned CurOp = 0;
+  if (NumOps > 1 && Desc->getOperandConstraint(1, TOI::TIED_TO) != -1)
+    CurOp++;
+
+  unsigned char BaseOpcode = II->getBaseOpcodeFor(Desc);
+  switch (Desc->TSFlags & X86II::FormMask) {
+  default: assert(0 && "Unknown FormMask value in X86 MachineCodeEmitter!");
+  case X86II::Pseudo:
+#ifndef NDEBUG
+    switch (Opcode) {
+    default: 
+      assert(0 && "pseudo instructions should be removed before code emission");
+    case TargetInstrInfo::INLINEASM:
+      assert(0 && "JIT does not support inline asm!\n");
+    case TargetInstrInfo::LABEL:
+      assert(0 && "JIT does not support meta labels!\n");
+    case X86::IMPLICIT_USE:
+    case X86::IMPLICIT_DEF:
+    case X86::IMPLICIT_DEF_GR8:
+    case X86::IMPLICIT_DEF_GR16:
+    case X86::IMPLICIT_DEF_GR32:
+    case X86::IMPLICIT_DEF_GR64:
+    case X86::IMPLICIT_DEF_FR32:
+    case X86::IMPLICIT_DEF_FR64:
+    case X86::IMPLICIT_DEF_VR64:
+    case X86::IMPLICIT_DEF_VR128:
+    case X86::FP_REG_KILL:
+      break;
+    }
+#endif
+    CurOp = NumOps;
+    break;
+
+  case X86II::RawFrm:
+    MCE.emitByte(BaseOpcode);
+    if (CurOp != NumOps) {
+      const MachineOperand &MO = MI.getOperand(CurOp++);
+      if (MO.isMachineBasicBlock()) {
+        emitPCRelativeBlockAddress(MO.getMachineBasicBlock());
+      } else if (MO.isGlobalAddress()) {
+        bool NeedStub = Is64BitMode ||
+                        Opcode == X86::TAILJMPd ||
+                        Opcode == X86::TAILJMPr || Opcode == X86::TAILJMPm;
+        emitGlobalAddressForCall(MO.getGlobal(), !NeedStub);
+      } else if (MO.isExternalSymbol()) {
+        emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word);
+      } else if (MO.isImmediate()) {
+        emitConstant(MO.getImm(), sizeOfImm(Desc));
+      } else {
+        assert(0 && "Unknown RawFrm operand!");
+      }
+    }
+    break;
+
+  case X86II::AddRegFrm:
+    MCE.emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++).getReg()));
+    
+    if (CurOp != NumOps) {
+      const MachineOperand &MO1 = MI.getOperand(CurOp++);
+      unsigned Size = sizeOfImm(Desc);
+      if (MO1.isImmediate())
+        emitConstant(MO1.getImm(), Size);
+      else {
+        unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : X86::reloc_absolute_word;
+        if (Opcode == X86::MOV64ri)
+          rt = X86::reloc_absolute_dword;  // FIXME: add X86II flag?
+        if (MO1.isGlobalAddress())
+          emitGlobalAddressForPtr(MO1.getGlobal(), rt, MO1.getOffset());
+        else if (MO1.isExternalSymbol())
+          emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+        else if (MO1.isConstantPoolIndex())
+          emitConstPoolAddress(MO1.getConstantPoolIndex(), rt);
+        else if (MO1.isJumpTableIndex())
+          emitJumpTableAddress(MO1.getJumpTableIndex(), rt);
+      }
+    }
+    break;
+
+  case X86II::MRMDestReg: {
+    MCE.emitByte(BaseOpcode);
+    emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+                     getX86RegNum(MI.getOperand(CurOp+1).getReg()));
+    CurOp += 2;
+    if (CurOp != NumOps)
+      emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc));
+    break;
+  }
+  case X86II::MRMDestMem: {
+    MCE.emitByte(BaseOpcode);
+    emitMemModRMByte(MI, CurOp, getX86RegNum(MI.getOperand(CurOp+4).getReg()));
+    CurOp += 5;
+    if (CurOp != NumOps)
+      emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc));
+    break;
+  }
+
+  case X86II::MRMSrcReg:
+    MCE.emitByte(BaseOpcode);
+    emitRegModRMByte(MI.getOperand(CurOp+1).getReg(),
+                     getX86RegNum(MI.getOperand(CurOp).getReg()));
+    CurOp += 2;
+    if (CurOp != NumOps)
+      emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc));
+    break;
+
+  case X86II::MRMSrcMem: {
+    unsigned PCAdj = (CurOp+5 != NumOps) ? sizeOfImm(Desc) : 0;
+
+    MCE.emitByte(BaseOpcode);
+    emitMemModRMByte(MI, CurOp+1, getX86RegNum(MI.getOperand(CurOp).getReg()),
+                     PCAdj);
+    CurOp += 5;
+    if (CurOp != NumOps)
+      emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc));
+    break;
+  }
+
+  case X86II::MRM0r: case X86II::MRM1r:
+  case X86II::MRM2r: case X86II::MRM3r:
+  case X86II::MRM4r: case X86II::MRM5r:
+  case X86II::MRM6r: case X86II::MRM7r:
+    MCE.emitByte(BaseOpcode);
+    emitRegModRMByte(MI.getOperand(CurOp++).getReg(),
+                     (Desc->TSFlags & X86II::FormMask)-X86II::MRM0r);
+
+    if (CurOp != NumOps) {
+      const MachineOperand &MO1 = MI.getOperand(CurOp++);
+      unsigned Size = sizeOfImm(Desc);
+      if (MO1.isImmediate())
+        emitConstant(MO1.getImm(), Size);
+      else {
+        unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+          : X86::reloc_absolute_word;
+        if (Opcode == X86::MOV64ri32)
+          rt = X86::reloc_absolute_word;  // FIXME: add X86II flag?
+        if (MO1.isGlobalAddress())
+          emitGlobalAddressForPtr(MO1.getGlobal(), rt, MO1.getOffset());
+        else if (MO1.isExternalSymbol())
+          emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+        else if (MO1.isConstantPoolIndex())
+          emitConstPoolAddress(MO1.getConstantPoolIndex(), rt);
+        else if (MO1.isJumpTableIndex())
+          emitJumpTableAddress(MO1.getJumpTableIndex(), rt);
+      }
+    }
+    break;
+
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m: {
+    unsigned PCAdj = (CurOp+4 != NumOps) ?
+      (MI.getOperand(CurOp+4).isImmediate() ? sizeOfImm(Desc) : 4) : 0;
+
+    MCE.emitByte(BaseOpcode);
+    emitMemModRMByte(MI, CurOp, (Desc->TSFlags & X86II::FormMask)-X86II::MRM0m,
+                     PCAdj);
+    CurOp += 4;
+
+    if (CurOp != NumOps) {
+      const MachineOperand &MO = MI.getOperand(CurOp++);
+      unsigned Size = sizeOfImm(Desc);
+      if (MO.isImmediate())
+        emitConstant(MO.getImm(), Size);
+      else {
+        unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+          : X86::reloc_absolute_word;
+        if (Opcode == X86::MOV64mi32)
+          rt = X86::reloc_absolute_word;  // FIXME: add X86II flag?
+        if (MO.isGlobalAddress())
+          emitGlobalAddressForPtr(MO.getGlobal(), rt, MO.getOffset());
+        else if (MO.isExternalSymbol())
+          emitExternalSymbolAddress(MO.getSymbolName(), rt);
+        else if (MO.isConstantPoolIndex())
+          emitConstPoolAddress(MO.getConstantPoolIndex(), rt);
+        else if (MO.isJumpTableIndex())
+          emitJumpTableAddress(MO.getJumpTableIndex(), rt);
+      }
+    }
+    break;
+  }
+
+  case X86II::MRMInitReg:
+    MCE.emitByte(BaseOpcode);
+    // Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
+    emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+                     getX86RegNum(MI.getOperand(CurOp).getReg()));
+    ++CurOp;
+    break;
+  }
+
+  assert(((Desc->Flags & M_VARIABLE_OPS) != 0 || CurOp == NumOps) &&
+         "Unknown encoding!");
+}
diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp
new file mode 100644
index 0000000..f8f8d48
--- /dev/null
+++ b/lib/Target/X86/X86ELFWriterInfo.cpp
@@ -0,0 +1,18 @@
+//===-- X86ELFWriterInfo.cpp - ELF Writer Info for the X86 backend --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bill Wendling and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF writer information for the X86 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ELFWriterInfo.h"
+using namespace llvm;
+
+X86ELFWriterInfo::X86ELFWriterInfo() : TargetELFWriterInfo(EM_386) {}
+X86ELFWriterInfo::~X86ELFWriterInfo() {}
diff --git a/lib/Target/X86/X86ELFWriterInfo.h b/lib/Target/X86/X86ELFWriterInfo.h
new file mode 100644
index 0000000..eb564fb
--- /dev/null
+++ b/lib/Target/X86/X86ELFWriterInfo.h
@@ -0,0 +1,29 @@
+//===-- X86ELFWriterInfo.h - ELF Writer Info for X86 ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bill Wendling and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF writer information for the X86 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ELF_WRITER_INFO_H
+#define X86_ELF_WRITER_INFO_H
+
+#include "llvm/Target/TargetELFWriterInfo.h"
+
+namespace llvm {
+
+  class X86ELFWriterInfo : public TargetELFWriterInfo {
+  public:
+    X86ELFWriterInfo();
+    virtual ~X86ELFWriterInfo();
+  };
+
+} // end llvm namespace
+
+#endif // X86_ELF_WRITER_INFO_H
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
new file mode 100644
index 0000000..c293a32
--- /dev/null
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -0,0 +1,882 @@
+//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which converts floating point instructions from
+// virtual registers into register stack instructions.  This pass uses live
+// variable information to indicate where the FPn registers are used and their
+// lifetimes.
+//
+// This pass is hampered by the lack of decent CFG manipulation routines for
+// machine code.  In particular, this wants to be able to split critical edges
+// as necessary, traverse the machine basic block CFG in depth-first order, and
+// allow there to be multiple machine basic blocks for each LLVM basicblock
+// (needed for critical edge splitting).
+//
+// In particular, this pass currently barfs on critical edges.  Because of this,
+// it requires the instruction selector to insert FP_REG_KILL instructions on
+// the exits of any basic block that has critical edges going from it, or that
+// branches to a critical basic block.
+//
+// FIXME: this is not implemented yet.  The stackifier pass only works on local
+// basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumFXCH, "Number of fxch instructions inserted");
+STATISTIC(NumFP  , "Number of floating point instructions");
+
+namespace {
+  struct VISIBILITY_HIDDEN FPS : public MachineFunctionPass {
+    static char ID;
+    FPS() : MachineFunctionPass((intptr_t)&ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const { return "X86 FP Stackifier"; }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<LiveVariables>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  private:
+    const TargetInstrInfo *TII; // Machine instruction info.
+    LiveVariables     *LV;      // Live variable info for current function...
+    MachineBasicBlock *MBB;     // Current basic block
+    unsigned Stack[8];          // FP<n> Registers in each stack slot...
+    unsigned RegMap[8];         // Track which stack slot contains each register
+    unsigned StackTop;          // The current top of the FP stack.
+
+    void dumpStack() const {
+      cerr << "Stack contents:";
+      for (unsigned i = 0; i != StackTop; ++i) {
+        cerr << " FP" << Stack[i];
+        assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
+      }
+      cerr << "\n";
+    }
+  private:
+    // getSlot - Return the stack slot number a particular register number is
+    // in...
+    unsigned getSlot(unsigned RegNo) const {
+      assert(RegNo < 8 && "Regno out of range!");
+      return RegMap[RegNo];
+    }
+
+    // getStackEntry - Return the X86::FP<n> register in register ST(i)
+    unsigned getStackEntry(unsigned STi) const {
+      assert(STi < StackTop && "Access past stack top!");
+      return Stack[StackTop-1-STi];
+    }
+
+    // getSTReg - Return the X86::ST(i) register which contains the specified
+    // FP<RegNo> register
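+    // (e.g. with StackTop == 4 and FP3 held in slot 2, getSTReg(3) returns
+    // X86::ST1, i.e. the value one below the top of the x87 stack).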
+    unsigned getSTReg(unsigned RegNo) const {
+      return StackTop - 1 - getSlot(RegNo) + llvm::X86::ST0;
+    }
+
+    // pushReg - Push the specified FP<n> register onto the stack
+    void pushReg(unsigned Reg) {
+      assert(Reg < 8 && "Register number out of range!");
+      assert(StackTop < 8 && "Stack overflow!");
+      Stack[StackTop] = Reg;
+      RegMap[Reg] = StackTop++;
+    }
+
+    bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; }
+    void moveToTop(unsigned RegNo, MachineBasicBlock::iterator &I) {
+      if (!isAtTop(RegNo)) {
+        unsigned STReg = getSTReg(RegNo);
+        unsigned RegOnTop = getStackEntry(0);
+
+        // Swap the slots the regs are in
+        std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+        // Swap stack slot contents
+        assert(RegMap[RegOnTop] < StackTop);
+        std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+
+        // Emit an fxch to update the runtime processors version of the state
+        BuildMI(*MBB, I, TII->get(X86::XCH_F)).addReg(STReg);
+        NumFXCH++;
+      }
+    }
+
+    void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) {
+      unsigned STReg = getSTReg(RegNo);
+      pushReg(AsReg);   // New register on top of stack
+
+      BuildMI(*MBB, I, TII->get(X86::LD_Frr)).addReg(STReg);
+    }
+
+    // popStackAfter - Pop the current value off of the top of the FP stack
+    // after the specified instruction.
+    void popStackAfter(MachineBasicBlock::iterator &I);
+
+    // freeStackSlotAfter - Free the specified register from the register stack,
+    // so that it is no longer in a register.  If the register is currently at
+    // the top of the stack, we just pop the current instruction, otherwise we
+    // store the current top-of-stack into the specified slot, then pop the top
+    // of stack.
+    void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg);
+
+    bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+    void handleZeroArgFP(MachineBasicBlock::iterator &I);
+    void handleOneArgFP(MachineBasicBlock::iterator &I);
+    void handleOneArgFPRW(MachineBasicBlock::iterator &I);
+    void handleTwoArgFP(MachineBasicBlock::iterator &I);
+    void handleCompareFP(MachineBasicBlock::iterator &I);
+    void handleCondMovFP(MachineBasicBlock::iterator &I);
+    void handleSpecialFP(MachineBasicBlock::iterator &I);
+  };
+  char FPS::ID = 0;
+}
+
+FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
+
+/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP
+/// register references into FP stack references.
+///
+bool FPS::runOnMachineFunction(MachineFunction &MF) {
+  // We only need to run this pass if there are any FP registers used in this
+  // function.  If it is all integer, there is nothing for us to do!
+  bool FPIsUsed = false;
+
+  assert(X86::FP6 == X86::FP0+6 && "Register enums aren't sorted right!");
+  for (unsigned i = 0; i <= 6; ++i)
+    if (MF.isPhysRegUsed(X86::FP0+i)) {
+      FPIsUsed = true;
+      break;
+    }
+
+  // Early exit.
+  if (!FPIsUsed) return false;
+
+  TII = MF.getTarget().getInstrInfo();
+  LV = &getAnalysis<LiveVariables>();
+  StackTop = 0;
+
+  // Process the function in depth first order so that we process at least one
+  // of the predecessors for every reachable block in the function.
+  std::set<MachineBasicBlock*> Processed;
+  MachineBasicBlock *Entry = MF.begin();
+
+  bool Changed = false;
+  for (df_ext_iterator<MachineBasicBlock*, std::set<MachineBasicBlock*> >
+         I = df_ext_begin(Entry, Processed), E = df_ext_end(Entry, Processed);
+       I != E; ++I)
+    Changed |= processBasicBlock(MF, **I);
+
+  return Changed;
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// transforming FP instructions into their stack form.
+///
+bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
+  bool Changed = false;
+  MBB = &BB;
+
+  for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+    MachineInstr *MI = I;
+    unsigned Flags = MI->getInstrDescriptor()->TSFlags;
+    if ((Flags & X86II::FPTypeMask) == X86II::NotFP)
+      continue;  // Efficiently ignore non-fp insts!
+
+    MachineInstr *PrevMI = 0;
+    if (I != BB.begin())
+        PrevMI = prior(I);
+
+    ++NumFP;  // Keep track of # of pseudo instrs
+    DOUT << "\nFPInst:\t" << *MI;
+
+    // Get dead variables list now because the MI pointer may be deleted as part
+    // of processing!
+    SmallVector<unsigned, 8> DeadRegs;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && MO.isDead())
+        DeadRegs.push_back(MO.getReg());
+    }
+
+    switch (Flags & X86II::FPTypeMask) {
+    case X86II::ZeroArgFP:  handleZeroArgFP(I); break;
+    case X86II::OneArgFP:   handleOneArgFP(I);  break;  // fstp ST(0)
+    case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0))
+    case X86II::TwoArgFP:   handleTwoArgFP(I);  break;
+    case X86II::CompareFP:  handleCompareFP(I); break;
+    case X86II::CondMovFP:  handleCondMovFP(I); break;
+    case X86II::SpecialFP:  handleSpecialFP(I); break;
+    default: assert(0 && "Unknown FP Type!");
+    }
+
+    // Check to see if any of the values defined by this instruction are dead
+    // after definition.  If so, pop them.
+    for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
+      unsigned Reg = DeadRegs[i];
+      if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+        DOUT << "Register FP#" << Reg-X86::FP0 << " is dead!\n";
+        freeStackSlotAfter(I, Reg-X86::FP0);
+      }
+    }
+
+    // Print out all of the instructions expanded to if -debug
+    DEBUG(
+      MachineBasicBlock::iterator PrevI(PrevMI);
+      if (I == PrevI) {
+        cerr << "Just deleted pseudo instruction\n";
+      } else {
+        MachineBasicBlock::iterator Start = I;
+        // Rewind to first instruction newly inserted.
+        while (Start != BB.begin() && prior(Start) != PrevI) --Start;
+        cerr << "Inserted instructions:\n\t";
+        Start->print(*cerr.stream(), &MF.getTarget());
+        while (++Start != next(I));
+      }
+      dumpStack();
+    );
+
+    Changed = true;
+  }
+
+  assert(StackTop == 0 && "Stack not empty at end of basic block?");
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+  struct TableEntry {
+    unsigned from;
+    unsigned to;
+    bool operator<(const TableEntry &TE) const { return from < TE.from; }
+    friend bool operator<(const TableEntry &TE, unsigned V) {
+      return TE.from < V;
+    }
+    friend bool operator<(unsigned V, const TableEntry &TE) {
+      return V < TE.from;
+    }
+  };
+}
+
+static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) {
+  for (unsigned i = 0; i != NumEntries-1; ++i)
+    if (!(Table[i] < Table[i+1])) return false;
+  return true;
+}
+
+static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) {
+  const TableEntry *I = std::lower_bound(Table, Table+N, Opcode);
+  if (I != Table+N && I->from == Opcode)
+    return I->to;
+  return -1;
+}
+
+#define ARRAY_SIZE(TABLE)  \
+   (sizeof(TABLE)/sizeof(TABLE[0]))
+
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE)                                              \
+  { static bool TABLE##Checked = false;                                   \
+    if (!TABLE##Checked) {                                                \
+       assert(TableIsSorted(TABLE, ARRAY_SIZE(TABLE)) &&                  \
+              "All lookup tables must be sorted for efficient access!");  \
+       TABLE##Checked = true;                                             \
+    }                                                                     \
+  }
+#endif
+
+//===----------------------------------------------------------------------===//
+// Register File -> Register Stack Mapping Methods
+//===----------------------------------------------------------------------===//
+
+// OpcodeTable - Sorted map of register instructions to their stack version.
+// The first element is a register file pseudo instruction, the second is the
+// concrete X86 instruction which uses the register stack.
+//
+static const TableEntry OpcodeTable[] = {
+  { X86::ABS_Fp32     , X86::ABS_F     },
+  { X86::ABS_Fp64     , X86::ABS_F     },
+  { X86::ADD_Fp32m    , X86::ADD_F32m  },
+  { X86::ADD_Fp64m    , X86::ADD_F64m  },
+  { X86::ADD_Fp64m32  , X86::ADD_F32m  },
+  { X86::ADD_FpI16m32 , X86::ADD_FI16m },
+  { X86::ADD_FpI16m64 , X86::ADD_FI16m },
+  { X86::ADD_FpI32m32 , X86::ADD_FI32m },
+  { X86::ADD_FpI32m64 , X86::ADD_FI32m },
+  { X86::CHS_Fp32     , X86::CHS_F     },
+  { X86::CHS_Fp64     , X86::CHS_F     },
+  { X86::CMOVBE_Fp32  , X86::CMOVBE_F  },
+  { X86::CMOVBE_Fp64  , X86::CMOVBE_F  },
+  { X86::CMOVB_Fp32   , X86::CMOVB_F   },
+  { X86::CMOVB_Fp64   , X86::CMOVB_F   },
+  { X86::CMOVE_Fp32   , X86::CMOVE_F   },
+  { X86::CMOVE_Fp64   , X86::CMOVE_F   },
+  { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F },
+  { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F },
+  { X86::CMOVNB_Fp32  , X86::CMOVNB_F  },
+  { X86::CMOVNB_Fp64  , X86::CMOVNB_F  },
+  { X86::CMOVNE_Fp32  , X86::CMOVNE_F  },
+  { X86::CMOVNE_Fp64  , X86::CMOVNE_F  },
+  { X86::CMOVNP_Fp32  , X86::CMOVNP_F  },
+  { X86::CMOVNP_Fp64  , X86::CMOVNP_F  },
+  { X86::CMOVP_Fp32   , X86::CMOVP_F   },
+  { X86::CMOVP_Fp64   , X86::CMOVP_F   },
+  { X86::COS_Fp32     , X86::COS_F     },
+  { X86::COS_Fp64     , X86::COS_F     },
+  { X86::DIVR_Fp32m   , X86::DIVR_F32m },
+  { X86::DIVR_Fp64m   , X86::DIVR_F64m },
+  { X86::DIVR_Fp64m32 , X86::DIVR_F32m },
+  { X86::DIVR_FpI16m32, X86::DIVR_FI16m},
+  { X86::DIVR_FpI16m64, X86::DIVR_FI16m},
+  { X86::DIVR_FpI32m32, X86::DIVR_FI32m},
+  { X86::DIVR_FpI32m64, X86::DIVR_FI32m},
+  { X86::DIV_Fp32m    , X86::DIV_F32m  },
+  { X86::DIV_Fp64m    , X86::DIV_F64m  },
+  { X86::DIV_Fp64m32  , X86::DIV_F32m  },
+  { X86::DIV_FpI16m32 , X86::DIV_FI16m },
+  { X86::DIV_FpI16m64 , X86::DIV_FI16m },
+  { X86::DIV_FpI32m32 , X86::DIV_FI32m },
+  { X86::DIV_FpI32m64 , X86::DIV_FI32m },
+  { X86::ILD_Fp16m32  , X86::ILD_F16m  },
+  { X86::ILD_Fp16m64  , X86::ILD_F16m  },
+  { X86::ILD_Fp32m32  , X86::ILD_F32m  },
+  { X86::ILD_Fp32m64  , X86::ILD_F32m  },
+  { X86::ILD_Fp64m32  , X86::ILD_F64m  },
+  { X86::ILD_Fp64m64  , X86::ILD_F64m  },
+  { X86::ISTT_Fp16m32 , X86::ISTT_FP16m},
+  { X86::ISTT_Fp16m64 , X86::ISTT_FP16m},
+  { X86::ISTT_Fp32m32 , X86::ISTT_FP32m},
+  { X86::ISTT_Fp32m64 , X86::ISTT_FP32m},
+  { X86::ISTT_Fp64m32 , X86::ISTT_FP64m},
+  { X86::ISTT_Fp64m64 , X86::ISTT_FP64m},
+  { X86::IST_Fp16m32  , X86::IST_F16m  },
+  { X86::IST_Fp16m64  , X86::IST_F16m  },
+  { X86::IST_Fp32m32  , X86::IST_F32m  },
+  { X86::IST_Fp32m64  , X86::IST_F32m  },
+  { X86::IST_Fp64m32  , X86::IST_FP64m },
+  { X86::IST_Fp64m64  , X86::IST_FP64m },
+  { X86::LD_Fp032     , X86::LD_F0     },
+  { X86::LD_Fp064     , X86::LD_F0     },
+  { X86::LD_Fp132     , X86::LD_F1     },
+  { X86::LD_Fp164     , X86::LD_F1     },
+  { X86::LD_Fp32m     , X86::LD_F32m   },
+  { X86::LD_Fp64m     , X86::LD_F64m   },
+  { X86::MUL_Fp32m    , X86::MUL_F32m  },
+  { X86::MUL_Fp64m    , X86::MUL_F64m  },
+  { X86::MUL_Fp64m32  , X86::MUL_F32m  },
+  { X86::MUL_FpI16m32 , X86::MUL_FI16m },
+  { X86::MUL_FpI16m64 , X86::MUL_FI16m },
+  { X86::MUL_FpI32m32 , X86::MUL_FI32m },
+  { X86::MUL_FpI32m64 , X86::MUL_FI32m },
+  { X86::SIN_Fp32     , X86::SIN_F     },
+  { X86::SIN_Fp64     , X86::SIN_F     },
+  { X86::SQRT_Fp32    , X86::SQRT_F    },
+  { X86::SQRT_Fp64    , X86::SQRT_F    },
+  { X86::ST_Fp32m     , X86::ST_F32m   },
+  { X86::ST_Fp64m     , X86::ST_F64m   },
+  { X86::ST_Fp64m32   , X86::ST_F32m   },
+  { X86::SUBR_Fp32m   , X86::SUBR_F32m },
+  { X86::SUBR_Fp64m   , X86::SUBR_F64m },
+  { X86::SUBR_Fp64m32 , X86::SUBR_F32m },
+  { X86::SUBR_FpI16m32, X86::SUBR_FI16m},
+  { X86::SUBR_FpI16m64, X86::SUBR_FI16m},
+  { X86::SUBR_FpI32m32, X86::SUBR_FI32m},
+  { X86::SUBR_FpI32m64, X86::SUBR_FI32m},
+  { X86::SUB_Fp32m    , X86::SUB_F32m  },
+  { X86::SUB_Fp64m    , X86::SUB_F64m  },
+  { X86::SUB_Fp64m32  , X86::SUB_F32m  },
+  { X86::SUB_FpI16m32 , X86::SUB_FI16m },
+  { X86::SUB_FpI16m64 , X86::SUB_FI16m },
+  { X86::SUB_FpI32m32 , X86::SUB_FI32m },
+  { X86::SUB_FpI32m64 , X86::SUB_FI32m },
+  { X86::TST_Fp32     , X86::TST_F     },
+  { X86::TST_Fp64     , X86::TST_F     },
+  { X86::UCOM_FpIr32  , X86::UCOM_FIr  },
+  { X86::UCOM_FpIr64  , X86::UCOM_FIr  },
+  { X86::UCOM_Fpr32   , X86::UCOM_Fr   },
+  { X86::UCOM_Fpr64   , X86::UCOM_Fr   },
+};
+
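+// getConcreteOpcode - Map a register-file FP pseudo opcode to its concrete
+// x87 stack opcode; for example, X86::ABS_Fp32 maps to X86::ABS_F.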
+static unsigned getConcreteOpcode(unsigned Opcode) {
+  ASSERT_SORTED(OpcodeTable);
+  int Opc = Lookup(OpcodeTable, ARRAY_SIZE(OpcodeTable), Opcode);
+  assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!");
+  return Opc;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Methods
+//===----------------------------------------------------------------------===//
+
+// PopTable - Sorted map of instructions to their popping version.  The first
+// element is an instruction, the second is the version which pops.
+//
+static const TableEntry PopTable[] = {
+  { X86::ADD_FrST0 , X86::ADD_FPrST0  },
+
+  { X86::DIVR_FrST0, X86::DIVR_FPrST0 },
+  { X86::DIV_FrST0 , X86::DIV_FPrST0  },
+
+  { X86::IST_F16m  , X86::IST_FP16m   },
+  { X86::IST_F32m  , X86::IST_FP32m   },
+
+  { X86::MUL_FrST0 , X86::MUL_FPrST0  },
+
+  { X86::ST_F32m   , X86::ST_FP32m    },
+  { X86::ST_F64m   , X86::ST_FP64m    },
+  { X86::ST_Frr    , X86::ST_FPrr     },
+
+  { X86::SUBR_FrST0, X86::SUBR_FPrST0 },
+  { X86::SUB_FrST0 , X86::SUB_FPrST0  },
+
+  { X86::UCOM_FIr  , X86::UCOM_FIPr   },
+
+  { X86::UCOM_FPr  , X86::UCOM_FPPr   },
+  { X86::UCOM_Fr   , X86::UCOM_FPr    },
+};
+
+/// popStackAfter - Pop the current value off of the top of the FP stack after
+/// the specified instruction.  This attempts to be sneaky and combine the pop
+/// into the instruction itself if possible.  The iterator is left pointing to
+/// the last instruction, be it a new pop instruction inserted, or the old
+/// instruction if it was modified in place.
+///
+void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
+  ASSERT_SORTED(PopTable);
+  assert(StackTop > 0 && "Cannot pop empty stack!");
+  RegMap[Stack[--StackTop]] = ~0;     // Update state
+
+  // Check to see if there is a popping version of this instruction...
+  int Opcode = Lookup(PopTable, ARRAY_SIZE(PopTable), I->getOpcode());
+  if (Opcode != -1) {
+    I->setInstrDescriptor(TII->get(Opcode));
+    if (Opcode == X86::UCOM_FPPr)
+      I->RemoveOperand(0);
+  } else {    // Insert an explicit pop
+    I = BuildMI(*MBB, ++I, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
+  }
+}
+
+/// freeStackSlotAfter - Free the specified register from the register stack,
+/// so that it is no longer in a register.  If the register is currently at
+/// the top of the stack, we just pop the value, otherwise we store the
+/// current top-of-stack into the specified slot, then pop the top of stack.
+void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
+  if (getStackEntry(0) == FPRegNo) {  // already at the top of stack? easy.
+    popStackAfter(I);
+    return;
+  }
+
+  // Otherwise, store the top of stack into the dead slot, killing the operand
+  // without having to add in an explicit xchg then pop.
+  //
+  unsigned STReg    = getSTReg(FPRegNo);
+  unsigned OldSlot  = getSlot(FPRegNo);
+  unsigned TopReg   = Stack[StackTop-1];
+  Stack[OldSlot]    = TopReg;
+  RegMap[TopReg]    = OldSlot;
+  RegMap[FPRegNo]   = ~0;
+  Stack[--StackTop] = ~0;
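+  // Emit "fstp st(i)": store the old top-of-stack value into the freed slot
+  // and pop the stack.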
+  I = BuildMI(*MBB, ++I, TII->get(X86::ST_FPrr)).addReg(STReg);
+}
+
+
+static unsigned getFPReg(const MachineOperand &MO) {
+  assert(MO.isRegister() && "Expected an FP register!");
+  unsigned Reg = MO.getReg();
+  assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!");
+  return Reg - X86::FP0;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+/// handleZeroArgFP - ST(0) = fld0    ST(0) = flds <mem>
+///
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned DestReg = getFPReg(MI->getOperand(0));
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(0);   // Remove the explicit ST(0) operand
+  MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));
+  
+  // Result gets pushed on the stack.
+  pushReg(DestReg);
+}
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned NumOps = MI->getInstrDescriptor()->numOperands;
+  assert((NumOps == 5 || NumOps == 1) &&
+         "Can only handle fst* & ftst instructions!");
+
+  // Is this the last use of the source register?
+  unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
+  bool KillsSrc = LV->KillsRegister(MI, X86::FP0+Reg);
+
+  // FISTP64m is strange because there isn't a non-popping version.
+  // If we have one _and_ we don't want to pop the operand, duplicate the value
+  // on the stack instead of moving it.  This ensures that popping the value is
+  // always ok.
+  // Ditto FISTTP16m, FISTTP32m, FISTTP64m.
+  //
+  if (!KillsSrc &&
+      (MI->getOpcode() == X86::IST_Fp64m32 ||
+       MI->getOpcode() == X86::ISTT_Fp16m32 ||
+       MI->getOpcode() == X86::ISTT_Fp32m32 ||
+       MI->getOpcode() == X86::ISTT_Fp64m32 ||
+       MI->getOpcode() == X86::IST_Fp64m64 ||
+       MI->getOpcode() == X86::ISTT_Fp16m64 ||
+       MI->getOpcode() == X86::ISTT_Fp32m64 ||
+       MI->getOpcode() == X86::ISTT_Fp64m64)) {
+    duplicateToTop(Reg, 7 /*temp register*/, I);
+  } else {
+    moveToTop(Reg, I);            // Move to the top of the stack...
+  }
+  
+  // Convert from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(NumOps-1);    // Remove explicit ST(0) operand
+  MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  if (MI->getOpcode() == X86::IST_FP64m ||
+      MI->getOpcode() == X86::ISTT_FP16m ||
+      MI->getOpcode() == X86::ISTT_FP32m ||
+      MI->getOpcode() == X86::ISTT_FP64m) {
+    assert(StackTop > 0 && "Stack empty??");
+    --StackTop;
+  } else if (KillsSrc) { // Last use of operand?
+    popStackAfter(I);
+  }
+}
+
+
+/// handleOneArgFPRW: Handle instructions that read from the top of stack and
+/// replace the value with a newly computed value.  These instructions may have
+/// non-fp operands after their FP operands.
+///
+///  Examples:
+///     R1 = fchs R2
+///     R1 = fadd R2, [mem]
+///
+void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned NumOps = MI->getInstrDescriptor()->numOperands;
+  assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!");
+
+  // Is this the last use of the source register?
+  unsigned Reg = getFPReg(MI->getOperand(1));
+  bool KillsSrc = LV->KillsRegister(MI, X86::FP0+Reg);
+
+  if (KillsSrc) {
+    // If this is the last use of the source register, just make sure it's on
+    // the top of the stack.
+    moveToTop(Reg, I);
+    assert(StackTop > 0 && "Stack cannot be empty!");
+    --StackTop;
+    pushReg(getFPReg(MI->getOperand(0)));
+  } else {
+    // If this is not the last use of the source register, _copy_ it to the top
+    // of the stack.
+    duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I);
+  }
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(1);   // Drop the source operand.
+  MI->RemoveOperand(0);   // Drop the destination operand.
+  MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define tables of various ways to map pseudo instructions
+//
+
+// ForwardST0Table - Map: A = B op C  into: ST(0) = ST(0) op ST(i)
+static const TableEntry ForwardST0Table[] = {
+  { X86::ADD_Fp32  , X86::ADD_FST0r },
+  { X86::ADD_Fp64  , X86::ADD_FST0r },
+  { X86::DIV_Fp32  , X86::DIV_FST0r },
+  { X86::DIV_Fp64  , X86::DIV_FST0r },
+  { X86::MUL_Fp32  , X86::MUL_FST0r },
+  { X86::MUL_Fp64  , X86::MUL_FST0r },
+  { X86::SUB_Fp32  , X86::SUB_FST0r },
+  { X86::SUB_Fp64  , X86::SUB_FST0r },
+};
+
+// ReverseST0Table - Map: A = B op C  into: ST(0) = ST(i) op ST(0)
+static const TableEntry ReverseST0Table[] = {
+  { X86::ADD_Fp32  , X86::ADD_FST0r  },   // commutative
+  { X86::ADD_Fp64  , X86::ADD_FST0r  },   // commutative
+  { X86::DIV_Fp32  , X86::DIVR_FST0r },
+  { X86::DIV_Fp64  , X86::DIVR_FST0r },
+  { X86::MUL_Fp32  , X86::MUL_FST0r  },   // commutative
+  { X86::MUL_Fp64  , X86::MUL_FST0r  },   // commutative
+  { X86::SUB_Fp32  , X86::SUBR_FST0r },
+  { X86::SUB_Fp64  , X86::SUBR_FST0r },
+};
+
+// ForwardSTiTable - Map: A = B op C  into: ST(i) = ST(0) op ST(i)
+static const TableEntry ForwardSTiTable[] = {
+  { X86::ADD_Fp32  , X86::ADD_FrST0  },   // commutative
+  { X86::ADD_Fp64  , X86::ADD_FrST0  },   // commutative
+  { X86::DIV_Fp32  , X86::DIVR_FrST0 },
+  { X86::DIV_Fp64  , X86::DIVR_FrST0 },
+  { X86::MUL_Fp32  , X86::MUL_FrST0  },   // commutative
+  { X86::MUL_Fp64  , X86::MUL_FrST0  },   // commutative
+  { X86::SUB_Fp32  , X86::SUBR_FrST0 },
+  { X86::SUB_Fp64  , X86::SUBR_FrST0 },
+};
+
+// ReverseSTiTable - Map: A = B op C  into: ST(i) = ST(i) op ST(0)
+static const TableEntry ReverseSTiTable[] = {
+  { X86::ADD_Fp32  , X86::ADD_FrST0 },
+  { X86::ADD_Fp64  , X86::ADD_FrST0 },
+  { X86::DIV_Fp32  , X86::DIV_FrST0 },
+  { X86::DIV_Fp64  , X86::DIV_FrST0 },
+  { X86::MUL_Fp32  , X86::MUL_FrST0 },
+  { X86::MUL_Fp64  , X86::MUL_FrST0 },
+  { X86::SUB_Fp32  , X86::SUB_FrST0 },
+  { X86::SUB_Fp64  , X86::SUB_FrST0 },
+};
+
+
+/// handleTwoArgFP - Handle instructions like FADD and friends, which are
+/// virtual instructions that need to be simplified and possibly transformed.
+///
+/// Result: ST(0) = fsub  ST(0), ST(i)
+///         ST(i) = fsub  ST(0), ST(i)
+///         ST(0) = fsubr ST(0), ST(i)
+///         ST(i) = fsubr ST(0), ST(i)
+///
+void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
+  ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+  ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+  MachineInstr *MI = I;
+
+  unsigned NumOperands = MI->getInstrDescriptor()->numOperands;
+  assert(NumOperands == 3 && "Illegal TwoArgFP instruction!");
+  unsigned Dest = getFPReg(MI->getOperand(0));
+  unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+  unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+  bool KillsOp0 = LV->KillsRegister(MI, X86::FP0+Op0);
+  bool KillsOp1 = LV->KillsRegister(MI, X86::FP0+Op1);
+
+  unsigned TOS = getStackEntry(0);
+
+  // One of our operands must be on the top of the stack.  If neither is yet, we
+  // need to move one.
+  if (Op0 != TOS && Op1 != TOS) {   // No operand at TOS?
+    // We can choose to move either operand to the top of the stack.  If one of
+    // the operands is killed by this instruction, we want that one so that we
+    // can update right on top of the old version.
+    if (KillsOp0) {
+      moveToTop(Op0, I);         // Move dead operand to TOS.
+      TOS = Op0;
+    } else if (KillsOp1) {
+      moveToTop(Op1, I);
+      TOS = Op1;
+    } else {
+      // All of the operands are live after this instruction executes, so we
+      // cannot update on top of any operand.  Because of this, we must
+      // duplicate one of the stack elements to the top.  It doesn't matter
+      // which one we pick.
+      //
+      duplicateToTop(Op0, Dest, I);
+      Op0 = TOS = Dest;
+      KillsOp0 = true;
+    }
+  } else if (!KillsOp0 && !KillsOp1) {
+    // If we DO have one of our operands at the top of the stack, but we don't
+    // have a dead operand, we must duplicate one of the operands to a new slot
+    // on the stack.
+    duplicateToTop(Op0, Dest, I);
+    Op0 = TOS = Dest;
+    KillsOp0 = true;
+  }
+
+  // Now we know that one of our operands is on the top of the stack, and at
+  // least one of our operands is killed by this instruction.
+  assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) &&
+         "Stack conditions not set up right!");
+
+  // We decide which form to use based on what is on the top of the stack, and
+  // which operand is killed by this instruction.
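+  // "Forward" means the top of stack is the first source operand (B in
+  // "A = B op C"); "updateST0" means the result is written to ST(0) rather
+  // than to ST(i).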
+  const TableEntry *InstTable;
+  bool isForward = TOS == Op0;
+  bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0);
+  if (updateST0) {
+    if (isForward)
+      InstTable = ForwardST0Table;
+    else
+      InstTable = ReverseST0Table;
+  } else {
+    if (isForward)
+      InstTable = ForwardSTiTable;
+    else
+      InstTable = ReverseSTiTable;
+  }
+
+  int Opcode = Lookup(InstTable, ARRAY_SIZE(ForwardST0Table), MI->getOpcode());
+  assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");
+
+  // NotTOS - The register which is not on the top of stack...
+  unsigned NotTOS = (TOS == Op0) ? Op1 : Op0;
+
+  // Replace the old instruction with a new instruction
+  MBB->remove(I++);
+  I = BuildMI(*MBB, I, TII->get(Opcode)).addReg(getSTReg(NotTOS));
+
+  // If both operands are killed, pop one off of the stack in addition to
+  // overwriting the other one.
+  if (KillsOp0 && KillsOp1 && Op0 != Op1) {
+    assert(!updateST0 && "Should have updated other operand!");
+    popStackAfter(I);   // Pop the top of stack
+  }
+
+  // Update stack information so that we know the destination register is now on
+  // the stack.
+  unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS);
+  assert(UpdatedSlot < StackTop && Dest < 7);
+  Stack[UpdatedSlot]   = Dest;
+  RegMap[Dest]         = UpdatedSlot;
+  delete MI;   // Remove the old instruction
+}
+
+/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP
+/// register arguments and no explicit destinations.
+///
+void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
+  ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+  ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+  MachineInstr *MI = I;
+
+  unsigned NumOperands = MI->getInstrDescriptor()->numOperands;
+  assert(NumOperands == 2 && "Illegal FUCOM* instruction!");
+  unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+  unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+  bool KillsOp0 = LV->KillsRegister(MI, X86::FP0+Op0);
+  bool KillsOp1 = LV->KillsRegister(MI, X86::FP0+Op1);
+
+  // Make sure the first operand is on the top of stack, the other one can be
+  // anywhere.
+  moveToTop(Op0, I);
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->getOperand(0).setReg(getSTReg(Op1));
+  MI->RemoveOperand(1);
+  MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  // If any of the operands are killed by this instruction, free them.
+  if (KillsOp0) freeStackSlotAfter(I, Op0);
+  if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1);
+}
+
+/// handleCondMovFP - Handle two address conditional move instructions.  These
+/// instructions move a st(i) register to st(0) iff a condition is true.  These
+/// instructions require that the first operand is at the top of the stack, but
+/// otherwise don't modify the stack at all.
+void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+
+  unsigned Op0 = getFPReg(MI->getOperand(0));
+  unsigned Op1 = getFPReg(MI->getOperand(2));
+  bool KillsOp1 = LV->KillsRegister(MI, X86::FP0+Op1);
+
+  // The first operand *must* be on the top of the stack.
+  moveToTop(Op0, I);
+
+  // Change the second operand to the stack register that the operand is in.
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(0);
+  MI->RemoveOperand(1);
+  MI->getOperand(0).setReg(getSTReg(Op1));
+  MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));
+  
+  // If we kill the second operand, make sure to pop it from the stack.
+  if (Op0 != Op1 && KillsOp1) {
+    // Get this value off of the register stack.
+    freeStackSlotAfter(I, Op1);
+  }
+}
+
+
+/// handleSpecialFP - Handle special instructions which behave unlike other
+/// floating point instructions.  This is primarily intended for use by pseudo
+/// instructions.
+///
+void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  switch (MI->getOpcode()) {
+  default: assert(0 && "Unknown SpecialFP instruction!");
+  case X86::FpGETRESULT32:  // Appears immediately after a call returning FP type!
+  case X86::FpGETRESULT64:  // Appears immediately after a call returning FP type!
+    assert(StackTop == 0 && "Stack should be empty after a call!");
+    pushReg(getFPReg(MI->getOperand(0)));
+    break;
+  case X86::FpSETRESULT32:
+  case X86::FpSETRESULT64:
+    assert(StackTop == 1 && "Stack should have one element on it to return!");
+    --StackTop;   // "Forget" we have something on the top of stack!
+    break;
+  case X86::MOV_Fp3232:
+  case X86::MOV_Fp3264:
+  case X86::MOV_Fp6432:
+  case X86::MOV_Fp6464: {
+    unsigned SrcReg = getFPReg(MI->getOperand(1));
+    unsigned DestReg = getFPReg(MI->getOperand(0));
+
+    if (LV->KillsRegister(MI, X86::FP0+SrcReg)) {
+      // If the input operand is killed, we can just change the owner of the
+      // incoming stack slot into the result.
+      unsigned Slot = getSlot(SrcReg);
+      assert(Slot < 7 && DestReg < 7 && "FpMOV operands invalid!");
+      Stack[Slot] = DestReg;
+      RegMap[DestReg] = Slot;
+
+    } else {
+      // For FMOV we just duplicate the specified value to a new stack slot.
+      // This could be made better, but would require substantial changes.
+      duplicateToTop(SrcReg, DestReg, I);
+    }
+    break;
+  }
+  }
+
+  I = MBB->erase(I);  // Remove the pseudo instruction
+  --I;
+}
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
new file mode 100644
index 0000000..8b1690c
--- /dev/null
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -0,0 +1,1342 @@
+//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for X86,
+// converting from a legalized dag to an X86 dag.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-isel"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include <queue>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumFPKill   , "Number of FP_REG_KILL instructions added");
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
+
+//===----------------------------------------------------------------------===//
+//                      Pattern Matcher Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses
+  /// SDOperand's instead of register numbers for the leaves of the matched
+  /// tree.
+  struct X86ISelAddressMode {
+    enum {
+      RegBase,
+      FrameIndexBase
+    } BaseType;
+
+    struct {            // This is really a union, discriminated by BaseType!
+      SDOperand Reg;
+      int FrameIndex;
+    } Base;
+
+    bool isRIPRel;     // RIP relative?
+    unsigned Scale;
+    SDOperand IndexReg; 
+    unsigned Disp;
+    GlobalValue *GV;
+    Constant *CP;
+    const char *ES;
+    int JT;
+    unsigned Align;    // CP alignment.
+
+    X86ISelAddressMode()
+      : BaseType(RegBase), isRIPRel(false), Scale(1), IndexReg(), Disp(0),
+        GV(0), CP(0), ES(0), JT(-1), Align(0) {
+    }
+  };
+}
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  /// ISel - X86 specific code to select X86 machine instructions for
+  /// SelectionDAG operations.
+  ///
+  class VISIBILITY_HIDDEN X86DAGToDAGISel : public SelectionDAGISel {
+    /// ContainsFPCode - Every instruction we select that uses or defines a FP
+    /// register should set this to true.
+    bool ContainsFPCode;
+
+    /// FastISel - Enable fast(er) instruction selection.
+    ///
+    bool FastISel;
+
+    /// TM - Keep a reference to X86TargetMachine.
+    ///
+    X86TargetMachine &TM;
+
+    /// X86Lowering - This object fully describes how to lower LLVM code to an
+    /// X86-specific SelectionDAG.
+    X86TargetLowering X86Lowering;
+
+    /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const X86Subtarget *Subtarget;
+
+    /// GlobalBaseReg - keeps track of the virtual register mapped onto the
+    /// global base register.
+    unsigned GlobalBaseReg;
+
+  public:
+    X86DAGToDAGISel(X86TargetMachine &tm, bool fast)
+      : SelectionDAGISel(X86Lowering),
+        ContainsFPCode(false), FastISel(fast), TM(tm),
+        X86Lowering(*TM.getTargetLowering()),
+        Subtarget(&TM.getSubtarget<X86Subtarget>()) {}
+
+    virtual bool runOnFunction(Function &Fn) {
+      // Make sure we re-emit a set of the global base reg if necessary
+      GlobalBaseReg = 0;
+      return SelectionDAGISel::runOnFunction(Fn);
+    }
+   
+    virtual const char *getPassName() const {
+      return "X86 DAG->DAG Instruction Selection";
+    }
+
+    /// InstructionSelectBasicBlock - This callback is invoked by
+    /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+    virtual void InstructionSelectBasicBlock(SelectionDAG &DAG);
+
+    virtual void EmitFunctionEntryCode(Function &Fn, MachineFunction &MF);
+
+    virtual bool CanBeFoldedBy(SDNode *N, SDNode *U, SDNode *Root);
+
+// Include the pieces autogenerated from the target description.
+#include "X86GenDAGISel.inc"
+
+  private:
+    SDNode *Select(SDOperand N);
+
+    bool MatchAddress(SDOperand N, X86ISelAddressMode &AM,
+                      bool isRoot = true, unsigned Depth = 0);
+    bool SelectAddr(SDOperand Op, SDOperand N, SDOperand &Base,
+                    SDOperand &Scale, SDOperand &Index, SDOperand &Disp);
+    bool SelectLEAAddr(SDOperand Op, SDOperand N, SDOperand &Base,
+                       SDOperand &Scale, SDOperand &Index, SDOperand &Disp);
+    bool SelectScalarSSELoad(SDOperand Op, SDOperand Pred,
+                             SDOperand N, SDOperand &Base, SDOperand &Scale,
+                             SDOperand &Index, SDOperand &Disp,
+                             SDOperand &InChain, SDOperand &OutChain);
+    bool TryFoldLoad(SDOperand P, SDOperand N,
+                     SDOperand &Base, SDOperand &Scale,
+                     SDOperand &Index, SDOperand &Disp);
+    void InstructionSelectPreprocess(SelectionDAG &DAG);
+
+    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+    /// inline asm expressions.
+    virtual bool SelectInlineAsmMemoryOperand(const SDOperand &Op,
+                                              char ConstraintCode,
+                                              std::vector<SDOperand> &OutOps,
+                                              SelectionDAG &DAG);
+    
+    void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI);
+
+    inline void getAddressOperands(X86ISelAddressMode &AM, SDOperand &Base, 
+                                   SDOperand &Scale, SDOperand &Index,
+                                   SDOperand &Disp) {
+      Base  = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
+        CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, TLI.getPointerTy()) :
+        AM.Base.Reg;
+      Scale = getI8Imm(AM.Scale);
+      Index = AM.IndexReg;
+      // These are 32-bit even in 64-bit mode since RIP relative offset
+      // is 32-bit.
+      if (AM.GV)
+        Disp = CurDAG->getTargetGlobalAddress(AM.GV, MVT::i32, AM.Disp);
+      else if (AM.CP)
+        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Align, AM.Disp);
+      else if (AM.ES)
+        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32);
+      else if (AM.JT != -1)
+        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32);
+      else
+        Disp = getI32Imm(AM.Disp);
+    }
+
+    /// getI8Imm - Return a target constant with the specified value, of type
+    /// i8.
+    inline SDOperand getI8Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i8);
+    }
+
+    /// getI16Imm - Return a target constant with the specified value, of type
+    /// i16.
+    inline SDOperand getI16Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i16);
+    }
+
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDOperand getI32Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+    }
+
+    /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC
+    /// base register.  Return the virtual register that holds this value.
+    SDNode *getGlobalBaseReg();
+
+#ifndef NDEBUG
+    unsigned Indent;
+#endif
+  };
+}
+
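+/// findFlagUse - Return a user of N's flag result (its last value), or NULL
+/// if that flag value has no use.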
+static SDNode *findFlagUse(SDNode *N) {
+  unsigned FlagResNo = N->getNumValues()-1;
+  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+    SDNode *User = *I;
+    for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
+      SDOperand Op = User->getOperand(i);
+      if (Op.Val == N && Op.ResNo == FlagResNo)
+        return User;
+    }
+  }
+  return NULL;
+}
+
+static void findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
+                          SDNode *Root, SDNode *Skip, bool &found,
+                          std::set<SDNode *> &Visited) {
+  if (found ||
+      Use->getNodeId() > Def->getNodeId() ||
+      !Visited.insert(Use).second)
+    return;
+
+  for (unsigned i = 0, e = Use->getNumOperands(); !found && i != e; ++i) {
+    SDNode *N = Use->getOperand(i).Val;
+    if (N == Skip)
+      continue;
+    if (N == Def) {
+      if (Use == ImmedUse)
+        continue; // Immediate use is ok.
+      if (Use == Root) {
+        assert(Use->getOpcode() == ISD::STORE ||
+               Use->getOpcode() == X86ISD::CMP);
+        continue;
+      }
+      found = true;
+      break;
+    }
+    findNonImmUse(N, Def, ImmedUse, Root, Skip, found, Visited);
+  }
+}
+
+/// isNonImmUse - Start searching from Root up the DAG to check if Def can
+/// be reached. Return true if that's the case. However, ignore direct uses
+/// by ImmedUse (which would be U in the example illustrated in
+/// CanBeFoldedBy) and by Root (which can happen in the store case).
+/// FIXME: to be really generic, we should allow direct use by any node
+/// that is being folded. But realistically since we only fold loads which
+/// have one non-chain use, we only need to watch out for load/op/store
+/// and load/op/cmp case where the root (store / cmp) may reach the load via
+/// its chain operand.
+static inline bool isNonImmUse(SDNode *Root, SDNode *Def, SDNode *ImmedUse,
+                               SDNode *Skip = NULL) {
+  std::set<SDNode *> Visited;
+  bool found = false;
+  findNonImmUse(Root, Def, ImmedUse, Root, Skip, found, Visited);
+  return found;
+}
+
+
+bool X86DAGToDAGISel::CanBeFoldedBy(SDNode *N, SDNode *U, SDNode *Root) {
+  if (FastISel) return false;
+
+  // If the use U can somehow reach N through another path, then U can't fold
+  // N or it will create a cycle. e.g. In the following diagram, U can reach N
+  // through X. If N is folded into U, then X is both a predecessor and
+  // a successor of U.
+  //
+  //         [ N ]
+  //         ^  ^
+  //         |  |
+  //        /   \---
+  //      /        [X]
+  //      |         ^
+  //     [U]--------|
+
+  if (isNonImmUse(Root, N, U))
+    return false;
+
+  // If U produces a flag, then it gets (even more) interesting. Since it
+  // would have been "glued" together with its flag use, we need to check if
+  // it might reach N:
+  //
+  //       [ N ]
+  //        ^ ^
+  //        | |
+  //       [U] \--
+  //        ^   [TF]
+  //        |    ^
+  //        |    |
+  //         \  /
+  //          [FU]
+  //
+  // If FU (flag use) indirectly reaches N (the load), and U folds N (call it
+  // NU), then TF is a predecessor of FU and a successor of NU. But since
+  // NU and FU are flagged together, this effectively creates a cycle.
+  bool HasFlagUse = false;
+  MVT::ValueType VT = Root->getValueType(Root->getNumValues()-1);
+  while ((VT == MVT::Flag && !Root->use_empty())) {
+    SDNode *FU = findFlagUse(Root);
+    if (FU == NULL)
+      break;
+    else {
+      Root = FU;
+      HasFlagUse = true;
+    }
+    VT = Root->getValueType(Root->getNumValues()-1);
+  }
+
+  if (HasFlagUse)
+    return !isNonImmUse(Root, N, Root, U);
+  return true;
+}
+
+/// MoveBelowTokenFactor - Replace TokenFactor operand with load's chain operand
+/// and move load below the TokenFactor. Replace store's chain operand with
+/// load's chain result.
+static void MoveBelowTokenFactor(SelectionDAG &DAG, SDOperand Load,
+                                 SDOperand Store, SDOperand TF) {
+  std::vector<SDOperand> Ops;
+  for (unsigned i = 0, e = TF.Val->getNumOperands(); i != e; ++i)
+    if (Load.Val == TF.Val->getOperand(i).Val)
+      Ops.push_back(Load.Val->getOperand(0));
+    else
+      Ops.push_back(TF.Val->getOperand(i));
+  DAG.UpdateNodeOperands(TF, &Ops[0], Ops.size());
+  DAG.UpdateNodeOperands(Load, TF, Load.getOperand(1), Load.getOperand(2));
+  DAG.UpdateNodeOperands(Store, Load.getValue(1), Store.getOperand(1),
+                         Store.getOperand(2), Store.getOperand(3));
+}
+
+/// InstructionSelectPreprocess - Preprocess the DAG to allow the instruction
+/// selector to pick more load-modify-store instructions. This is a common
+/// case:
+///
+///     [Load chain]
+///         ^
+///         |
+///       [Load]
+///       ^    ^
+///       |    |
+///      /      \-
+///     /         |
+/// [TokenFactor] [Op]
+///     ^          ^
+///     |          |
+///      \        /
+///       \      /
+///       [Store]
+///
+/// The fact that the store's chain operand != load's chain will prevent the
+/// (store (op (load))) instruction from being selected. We can transform it to:
+///
+///     [Load chain]
+///         ^
+///         |
+///    [TokenFactor]
+///         ^
+///         |
+///       [Load]
+///       ^    ^
+///       |    |
+///       |     \- 
+///       |       | 
+///       |     [Op]
+///       |       ^
+///       |       |
+///       \      /
+///        \    /
+///       [Store]
+void X86DAGToDAGISel::InstructionSelectPreprocess(SelectionDAG &DAG) {
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+         E = DAG.allnodes_end(); I != E; ++I) {
+    if (!ISD::isNON_TRUNCStore(I))
+      continue;
+    SDOperand Chain = I->getOperand(0);
+    if (Chain.Val->getOpcode() != ISD::TokenFactor)
+      continue;
+
+    SDOperand N1 = I->getOperand(1);
+    SDOperand N2 = I->getOperand(2);
+    if (MVT::isFloatingPoint(N1.getValueType()) ||
+        MVT::isVector(N1.getValueType()) ||
+        !N1.hasOneUse())
+      continue;
+
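+    // RModW ("read-modify-write") becomes true when the stored value is an op
+    // whose load operand reads the same address (N2) that this store writes.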
+    bool RModW = false;
+    SDOperand Load;
+    unsigned Opcode = N1.Val->getOpcode();
+    switch (Opcode) {
+      case ISD::ADD:
+      case ISD::MUL:
+      case ISD::AND:
+      case ISD::OR:
+      case ISD::XOR:
+      case ISD::ADDC:
+      case ISD::ADDE: {
+        SDOperand N10 = N1.getOperand(0);
+        SDOperand N11 = N1.getOperand(1);
+        if (ISD::isNON_EXTLoad(N10.Val))
+          RModW = true;
+        else if (ISD::isNON_EXTLoad(N11.Val)) {
+          RModW = true;
+          std::swap(N10, N11);
+        }
+        RModW = RModW && N10.Val->isOperand(Chain.Val) && N10.hasOneUse() &&
+          (N10.getOperand(1) == N2) &&
+          (N10.Val->getValueType(0) == N1.getValueType());
+        if (RModW)
+          Load = N10;
+        break;
+      }
+      case ISD::SUB:
+      case ISD::SHL:
+      case ISD::SRA:
+      case ISD::SRL:
+      case ISD::ROTL:
+      case ISD::ROTR:
+      case ISD::SUBC:
+      case ISD::SUBE:
+      case X86ISD::SHLD:
+      case X86ISD::SHRD: {
+        SDOperand N10 = N1.getOperand(0);
+        if (ISD::isNON_EXTLoad(N10.Val))
+          RModW = N10.Val->isOperand(Chain.Val) && N10.hasOneUse() &&
+            (N10.getOperand(1) == N2) &&
+            (N10.Val->getValueType(0) == N1.getValueType());
+        if (RModW)
+          Load = N10;
+        break;
+      }
+    }
+
+    if (RModW) {
+      MoveBelowTokenFactor(DAG, Load, SDOperand(I, 0), Chain);
+      ++NumLoadMoved;
+    }
+  }
+}
+
+/// InstructionSelectBasicBlock - This callback is invoked by SelectionDAGISel
+/// when it has created a SelectionDAG for us to codegen.
+void X86DAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
+  DEBUG(BB->dump());
+  MachineFunction::iterator FirstMBB = BB;
+
+  if (!FastISel)
+    InstructionSelectPreprocess(DAG);
+
+  // Codegen the basic block.
+#ifndef NDEBUG
+  DOUT << "===== Instruction selection begins:\n";
+  Indent = 0;
+#endif
+  DAG.setRoot(SelectRoot(DAG.getRoot()));
+#ifndef NDEBUG
+  DOUT << "===== Instruction selection ends:\n";
+#endif
+
+  DAG.RemoveDeadNodes();
+
+  // Emit machine code to BB. 
+  ScheduleAndEmitDAG(DAG);
+  
+  // If we are emitting FP stack code, scan the basic block to determine if this
+  // block defines any FP values.  If so, put an FP_REG_KILL instruction before
+  // the terminator of the block.
+  if (!Subtarget->hasSSE2()) {
+    // Note that FP stack instructions *are* used in SSE code when returning
+    // values, but these are not live out of the basic block, so we don't need
+    // an FP_REG_KILL in this case either.
+    bool ContainsFPCode = false;
+    
+    // Scan all of the machine instructions in these MBBs, checking for FP
+    // stores.
+    MachineFunction::iterator MBBI = FirstMBB;
+    do {
+      for (MachineBasicBlock::iterator I = MBBI->begin(), E = MBBI->end();
+           !ContainsFPCode && I != E; ++I) {
+        if (I->getNumOperands() != 0 && I->getOperand(0).isRegister()) {
+          const TargetRegisterClass *clas;
+          for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) {
+            if (I->getOperand(op).isRegister() && I->getOperand(op).isDef() &&
+                MRegisterInfo::isVirtualRegister(I->getOperand(op).getReg()) &&
+                ((clas = RegMap->getRegClass(I->getOperand(op).getReg())) ==
+                   X86::RFP32RegisterClass ||
+                 clas == X86::RFP64RegisterClass)) {
+              ContainsFPCode = true;
+              break;
+            }
+          }
+        }
+      }
+    } while (!ContainsFPCode && &*(MBBI++) != BB);
+    
+    // Check PHI nodes in successor blocks.  These PHI's will be lowered to have
+    // a copy of the input value in this block.
+    if (!ContainsFPCode) {
+      // Final check, check LLVM BB's that are successors to the LLVM BB
+      // corresponding to BB for FP PHI nodes.
+      const BasicBlock *LLVMBB = BB->getBasicBlock();
+      const PHINode *PN;
+      for (succ_const_iterator SI = succ_begin(LLVMBB), E = succ_end(LLVMBB);
+           !ContainsFPCode && SI != E; ++SI) {
+        for (BasicBlock::const_iterator II = SI->begin();
+             (PN = dyn_cast<PHINode>(II)); ++II) {
+          if (PN->getType()->isFloatingPoint()) {
+            ContainsFPCode = true;
+            break;
+          }
+        }
+      }
+    }
+
+    // Finally, if we found any FP code, emit the FP_REG_KILL instruction.
+    if (ContainsFPCode) {
+      BuildMI(*BB, BB->getFirstTerminator(),
+              TM.getInstrInfo()->get(X86::FP_REG_KILL));
+      ++NumFPKill;
+    }
+  }
+}
+
+/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
+/// the main function.
+void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
+                                             MachineFrameInfo *MFI) {
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  if (Subtarget->isTargetCygMing())
+    BuildMI(BB, TII->get(X86::CALLpcrel32)).addExternalSymbol("__main");
+
+  // Switch the FPU to 64-bit precision mode for better compatibility and speed.
+  int CWFrameIdx = MFI->CreateStackObject(2, 2);
+  addFrameReference(BuildMI(BB, TII->get(X86::FNSTCW16m)), CWFrameIdx);
+
+  // Set the high part to be 64-bit precision.
+  addFrameReference(BuildMI(BB, TII->get(X86::MOV8mi)),
+                    CWFrameIdx, 1).addImm(2);
+
+  // Reload the modified control word now.
+  addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx);
+}
+
+void X86DAGToDAGISel::EmitFunctionEntryCode(Function &Fn, MachineFunction &MF) {
+  // If this is main, emit special code for main.
+  MachineBasicBlock *BB = MF.begin();
+  if (Fn.hasExternalLinkage() && Fn.getName() == "main")
+    EmitSpecialCodeForMain(BB, MF.getFrameInfo());
+}
+
+/// MatchAddress - Add the specified node to the specified addressing mode,
+/// returning true if it cannot be done.  This just pattern matches for the
+/// addressing mode.
+bool X86DAGToDAGISel::MatchAddress(SDOperand N, X86ISelAddressMode &AM,
+                                   bool isRoot, unsigned Depth) {
+  if (Depth > 5) {
+    // Default, generate it as a register.
+    AM.BaseType = X86ISelAddressMode::RegBase;
+    AM.Base.Reg = N;
+    return false;
+  }
+  
+  // RIP relative addressing: %rip + 32-bit displacement!
+  if (AM.isRIPRel) {
+    if (!AM.ES && AM.JT != -1 && N.getOpcode() == ISD::Constant) {
+      int64_t Val = cast<ConstantSDNode>(N)->getSignExtended();
+      if (isInt32(AM.Disp + Val)) {
+        AM.Disp += Val;
+        return false;
+      }
+    }
+    return true;
+  }
+
+  int id = N.Val->getNodeId();
+  bool Available = isSelected(id);
+
+  switch (N.getOpcode()) {
+  default: break;
+  case ISD::Constant: {
+    int64_t Val = cast<ConstantSDNode>(N)->getSignExtended();
+    if (isInt32(AM.Disp + Val)) {
+      AM.Disp += Val;
+      return false;
+    }
+    break;
+  }
+
+  case X86ISD::Wrapper: {
+    bool is64Bit = Subtarget->is64Bit();
+    // Under X86-64 non-small code model, GV (and friends) are 64-bits.
+    if (is64Bit && TM.getCodeModel() != CodeModel::Small)
+      break;
+    if (AM.GV != 0 || AM.CP != 0 || AM.ES != 0 || AM.JT != -1)
+      break;
+    // If the value is not yet available in a register, or it is but both the
+    // base and index components have already been picked (so the register
+    // cannot fit in the addressing mode), duplicate the GlobalAddress or
+    // ConstantPool as the displacement.
+    if (!Available || (AM.Base.Reg.Val && AM.IndexReg.Val)) {
+      bool isStatic = TM.getRelocationModel() == Reloc::Static;
+      SDOperand N0 = N.getOperand(0);
+      if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+        GlobalValue *GV = G->getGlobal();
+        bool isAbs32 = !is64Bit || isStatic;
+        if (isAbs32 || isRoot) {
+          AM.GV = GV;
+          AM.Disp += G->getOffset();
+          AM.isRIPRel = !isAbs32;
+          return false;
+        }
+      } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+        if (!is64Bit || isStatic || isRoot) {
+          AM.CP = CP->getConstVal();
+          AM.Align = CP->getAlignment();
+          AM.Disp += CP->getOffset();
+          AM.isRIPRel = !isStatic;
+          return false;
+        }
+      } else if (ExternalSymbolSDNode *S =dyn_cast<ExternalSymbolSDNode>(N0)) {
+        if (isStatic || isRoot) {
+          AM.ES = S->getSymbol();
+          AM.isRIPRel = !isStatic;
+          return false;
+        }
+      } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+        if (isStatic || isRoot) {
+          AM.JT = J->getIndex();
+          AM.isRIPRel = !isStatic;
+          return false;
+        }
+      }
+    }
+    break;
+  }
+
+  case ISD::FrameIndex:
+    if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base.Reg.Val == 0) {
+      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+      return false;
+    }
+    break;
+
+  case ISD::SHL:
+    if (!Available && AM.IndexReg.Val == 0 && AM.Scale == 1)
+      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.Val->getOperand(1))) {
+        unsigned Val = CN->getValue();
+        if (Val == 1 || Val == 2 || Val == 3) {
+          AM.Scale = 1 << Val;
+          SDOperand ShVal = N.Val->getOperand(0);
+
+          // Okay, we know that we have a scale by now.  However, if the scaled
+          // value is an add of something and a constant, we can fold the
+          // constant into the disp field here.
+          if (ShVal.Val->getOpcode() == ISD::ADD && ShVal.hasOneUse() &&
+              isa<ConstantSDNode>(ShVal.Val->getOperand(1))) {
+            AM.IndexReg = ShVal.Val->getOperand(0);
+            ConstantSDNode *AddVal =
+              cast<ConstantSDNode>(ShVal.Val->getOperand(1));
+            uint64_t Disp = AM.Disp + (AddVal->getValue() << Val);
+            if (isInt32(Disp))
+              AM.Disp = Disp;
+            else
+              AM.IndexReg = ShVal;
+          } else {
+            AM.IndexReg = ShVal;
+          }
+          return false;
+        }
+      }
+    break;
+
+  case ISD::MUL:
+    // X*[3,5,9] -> X+X*[2,4,8]
+    if (!Available &&
+        AM.BaseType == X86ISelAddressMode::RegBase &&
+        AM.Base.Reg.Val == 0 &&
+        AM.IndexReg.Val == 0) {
+      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.Val->getOperand(1)))
+        if (CN->getValue() == 3 || CN->getValue() == 5 || CN->getValue() == 9) {
+          AM.Scale = unsigned(CN->getValue())-1;
+
+          SDOperand MulVal = N.Val->getOperand(0);
+          SDOperand Reg;
+
+          // Okay, we know that we have a scale by now.  However, if the scaled
+          // value is an add of something and a constant, we can fold the
+          // constant into the disp field here.
+          if (MulVal.Val->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
+              isa<ConstantSDNode>(MulVal.Val->getOperand(1))) {
+            Reg = MulVal.Val->getOperand(0);
+            ConstantSDNode *AddVal =
+              cast<ConstantSDNode>(MulVal.Val->getOperand(1));
+            uint64_t Disp = AM.Disp + AddVal->getValue() * CN->getValue();
+            if (isInt32(Disp))
+              AM.Disp = Disp;
+            else
+              Reg = N.Val->getOperand(0);
+          } else {
+            Reg = N.Val->getOperand(0);
+          }
+
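+          // Use the same register as both base and index so the address
+          // computes Reg + Reg*Scale, i.e. Reg * (Scale+1).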
+          AM.IndexReg = AM.Base.Reg = Reg;
+          return false;
+        }
+    }
+    break;
+
+  case ISD::ADD:
+    if (!Available) {
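+      // Try folding the two operands in either order, restoring the partially
+      // matched addressing mode if an attempt fails.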
+      X86ISelAddressMode Backup = AM;
+      if (!MatchAddress(N.Val->getOperand(0), AM, false, Depth+1) &&
+          !MatchAddress(N.Val->getOperand(1), AM, false, Depth+1))
+        return false;
+      AM = Backup;
+      if (!MatchAddress(N.Val->getOperand(1), AM, false, Depth+1) &&
+          !MatchAddress(N.Val->getOperand(0), AM, false, Depth+1))
+        return false;
+      AM = Backup;
+    }
+    break;
+
+  case ISD::OR:
+    // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+    if (!Available) {
+      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+        X86ISelAddressMode Backup = AM;
+        // Start with the LHS as an addr mode.
+        if (!MatchAddress(N.getOperand(0), AM, false) &&
+            // Address could not have picked a GV address for the displacement.
+            AM.GV == NULL &&
+            // On x86-64, the resultant disp must fit in 32-bits.
+            isInt32(AM.Disp + CN->getSignExtended()) &&
+            // Check to see if the LHS & C is zero.
+            CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getValue())) {
+          AM.Disp += CN->getValue();
+          return false;
+        }
+        AM = Backup;
+      }
+    }
+    break;
+  }
+
+  // Is the base register already occupied?
+  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base.Reg.Val) {
+    // If so, check to see if the scale index register is set.
+    if (AM.IndexReg.Val == 0) {
+      AM.IndexReg = N;
+      AM.Scale = 1;
+      return false;
+    }
+
+    // Otherwise, we cannot select it.
+    return true;
+  }
+
+  // Default, generate it as a register.
+  AM.BaseType = X86ISelAddressMode::RegBase;
+  AM.Base.Reg = N;
+  return false;
+}
+
+/// SelectAddr - returns true if it is able to pattern match an addressing
+/// mode.  It returns the operands which make up the maximal addressing mode
+/// it can match by reference.
+bool X86DAGToDAGISel::SelectAddr(SDOperand Op, SDOperand N, SDOperand &Base,
+                                 SDOperand &Scale, SDOperand &Index,
+                                 SDOperand &Disp) {
+  X86ISelAddressMode AM;
+  if (MatchAddress(N, AM))
+    return false;
+
+  MVT::ValueType VT = N.getValueType();
+  if (AM.BaseType == X86ISelAddressMode::RegBase) {
+    if (!AM.Base.Reg.Val)
+      AM.Base.Reg = CurDAG->getRegister(0, VT);
+  }
+
+  if (!AM.IndexReg.Val)
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  getAddressOperands(AM, Base, Scale, Index, Disp);
+  return true;
+}
+
+/// isZeroNode - Returns true if Elt is a constant zero or a floating point
+/// constant +0.0.
+static inline bool isZeroNode(SDOperand Elt) {
+  return ((isa<ConstantSDNode>(Elt) &&
+           cast<ConstantSDNode>(Elt)->getValue() == 0) ||
+          (isa<ConstantFPSDNode>(Elt) &&
+           cast<ConstantFPSDNode>(Elt)->isExactlyValue(0.0)));
+}
+
+
+/// SelectScalarSSELoad - Match a scalar SSE load.  In particular, we want to
+/// match a load whose top elements are either undef or zeros.  The load flavor
+/// is derived from the type of N, which is either v4f32 or v2f64.
+bool X86DAGToDAGISel::SelectScalarSSELoad(SDOperand Op, SDOperand Pred,
+                                          SDOperand N, SDOperand &Base,
+                                          SDOperand &Scale, SDOperand &Index,
+                                          SDOperand &Disp, SDOperand &InChain,
+                                          SDOperand &OutChain) {
+  if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    InChain = N.getOperand(0).getValue(1);
+    if (ISD::isNON_EXTLoad(InChain.Val) &&
+        InChain.getValue(0).hasOneUse() &&
+        N.hasOneUse() &&
+        CanBeFoldedBy(N.Val, Pred.Val, Op.Val)) {
+      LoadSDNode *LD = cast<LoadSDNode>(InChain);
+      if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
+        return false;
+      OutChain = LD->getChain();
+      return true;
+    }
+  }
+
+  // Also handle the case where we explicitly require zeros in the top
+  // elements.  This is a vector shuffle from the zero vector.
+  if (N.getOpcode() == ISD::VECTOR_SHUFFLE && N.Val->hasOneUse() &&
+      N.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
+      N.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR && 
+      N.getOperand(1).Val->hasOneUse() &&
+      ISD::isNON_EXTLoad(N.getOperand(1).getOperand(0).Val) &&
+      N.getOperand(1).getOperand(0).hasOneUse()) {
+    // Check to see if the BUILD_VECTOR is building a zero vector.
+    SDOperand BV = N.getOperand(0);
+    for (unsigned i = 0, e = BV.getNumOperands(); i != e; ++i)
+      if (!isZeroNode(BV.getOperand(i)) &&
+          BV.getOperand(i).getOpcode() != ISD::UNDEF)
+        return false;  // Not a zero/undef vector.
+    // Check to see if the shuffle mask is 4/L/L/L or 2/L, where L is something
+    // from the LHS.
+    unsigned VecWidth = BV.getNumOperands();
+    SDOperand ShufMask = N.getOperand(2);
+    assert(ShufMask.getOpcode() == ISD::BUILD_VECTOR && "Invalid shuf mask!");
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ShufMask.getOperand(0))) {
+      if (C->getValue() == VecWidth) {
+        for (unsigned i = 1; i != VecWidth; ++i) {
+          if (ShufMask.getOperand(i).getOpcode() == ISD::UNDEF) {
+            // ok.
+          } else {
+            ConstantSDNode *C = cast<ConstantSDNode>(ShufMask.getOperand(i));
+            if (C->getValue() >= VecWidth) return false;
+          }
+        }
+      }
+      
+      // Okay, this is a zero extending load.  Fold it.
+      LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(1).getOperand(0));
+      if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
+        return false;
+      OutChain = LD->getChain();
+      InChain = SDOperand(LD, 1);
+      return true;
+    }
+  }
+  return false;
+}
+
+
+/// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing
+/// mode it matches can be cost effectively emitted as an LEA instruction.
+bool X86DAGToDAGISel::SelectLEAAddr(SDOperand Op, SDOperand N,
+                                    SDOperand &Base, SDOperand &Scale,
+                                    SDOperand &Index, SDOperand &Disp) {
+  X86ISelAddressMode AM;
+  if (MatchAddress(N, AM))
+    return false;
+
+  MVT::ValueType VT = N.getValueType();
+  unsigned Complexity = 0;
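+  // Complexity roughly counts the address components accumulated below; the
+  // LEA form is only used when Complexity > 2 (see the check at the end).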
+  if (AM.BaseType == X86ISelAddressMode::RegBase) {
+    if (AM.Base.Reg.Val)
+      Complexity = 1;
+    else
+      AM.Base.Reg = CurDAG->getRegister(0, VT);
+  } else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+    Complexity = 4;
+
+  if (AM.IndexReg.Val)
+    Complexity++;
+  else
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or to
+  // use a simple shift.
+  if (AM.Scale > 1)
+    Complexity++;
+
+  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
+  // to a LEA. This is determined with some experimentation but is by no means
+  // optimal (especially for code size consideration). LEA is nice because of
+  // its three-address nature. Tweak the cost function again when we can run
+  // convertToThreeAddress() at register allocation time.
+  if (AM.GV || AM.CP || AM.ES || AM.JT != -1) {
+    // For X86-64, we should always use lea to materialize RIP relative
+    // addresses.
+    if (Subtarget->is64Bit())
+      Complexity = 4;
+    else
+      Complexity += 2;
+  }
+
+  if (AM.Disp && (AM.Base.Reg.Val || AM.IndexReg.Val))
+    Complexity++;
+
+  if (Complexity > 2) {
+    getAddressOperands(AM, Base, Scale, Index, Disp);
+    return true;
+  }
+  return false;
+}
+
+bool X86DAGToDAGISel::TryFoldLoad(SDOperand P, SDOperand N,
+                                  SDOperand &Base, SDOperand &Scale,
+                                  SDOperand &Index, SDOperand &Disp) {
+  if (ISD::isNON_EXTLoad(N.Val) &&
+      N.hasOneUse() &&
+      CanBeFoldedBy(N.Val, P.Val, P.Val))
+    return SelectAddr(P, N.getOperand(1), Base, Scale, Index, Disp);
+  return false;
+}
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// base address to use for accessing globals into a register.
+///
+SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
+  assert(!Subtarget->is64Bit() && "X86-64 PIC uses RIP relative addressing");
+  if (!GlobalBaseReg) {
+    // Insert the set of GlobalBaseReg into the first MBB of the function
+    MachineBasicBlock &FirstMBB = BB->getParent()->front();
+    MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+    SSARegMap *RegMap = BB->getParent()->getSSARegMap();
+    unsigned PC = RegMap->createVirtualRegister(X86::GR32RegisterClass);
+    
+    const TargetInstrInfo *TII = TM.getInstrInfo();
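+    // MovePCtoStack followed by POP32r materializes the current PC into the
+    // PC virtual register.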
+    BuildMI(FirstMBB, MBBI, TII->get(X86::MovePCtoStack));
+    BuildMI(FirstMBB, MBBI, TII->get(X86::POP32r), PC);
+    
+    // If we're using vanilla 'GOT' PIC style, we should use relative
+    // addressing not to pc, but to the _GLOBAL_OFFSET_TABLE_ external symbol.
+    if (TM.getRelocationModel() == Reloc::PIC_ &&
+        Subtarget->isPICStyleGOT()) {
+      GlobalBaseReg = RegMap->createVirtualRegister(X86::GR32RegisterClass);
+      BuildMI(FirstMBB, MBBI, TII->get(X86::ADD32ri), GlobalBaseReg).
+        addReg(PC).
+        addExternalSymbol("_GLOBAL_OFFSET_TABLE_");
+    } else {
+      GlobalBaseReg = PC;
+    }
+    
+  }
+  return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).Val;
+}
+
+static SDNode *FindCallStartFromCall(SDNode *Node) {
+  if (Node->getOpcode() == ISD::CALLSEQ_START) return Node;
+  assert(Node->getOperand(0).getValueType() == MVT::Other &&
+         "Node doesn't have a token chain argument!");
+  return FindCallStartFromCall(Node->getOperand(0).Val);
+}
+
+SDNode *X86DAGToDAGISel::Select(SDOperand N) {
+  SDNode *Node = N.Val;
+  MVT::ValueType NVT = Node->getValueType(0);
+  unsigned Opc, MOpc;
+  unsigned Opcode = Node->getOpcode();
+
+#ifndef NDEBUG
+  DOUT << std::string(Indent, ' ') << "Selecting: ";
+  DEBUG(Node->dump(CurDAG));
+  DOUT << "\n";
+  Indent += 2;
+#endif
+
+  if (Opcode >= ISD::BUILTIN_OP_END && Opcode < X86ISD::FIRST_NUMBER) {
+#ifndef NDEBUG
+    DOUT << std::string(Indent-2, ' ') << "== ";
+    DEBUG(Node->dump(CurDAG));
+    DOUT << "\n";
+    Indent -= 2;
+#endif
+    return NULL;   // Already selected.
+  }
+
+  switch (Opcode) {
+    default: break;
+    case X86ISD::GlobalBaseReg: 
+      return getGlobalBaseReg();
+
+    case ISD::ADD: {
+      // Turn ADD X, c to MOV32ri X+c. This cannot be done with tblgen'd
+      // code and is matched first so as to prevent it from being turned into
+      // LEA32r X+c.
+      // In 64-bit mode, use LEA to take advantage of RIP-relative addressing.
+      MVT::ValueType PtrVT = TLI.getPointerTy();
+      SDOperand N0 = N.getOperand(0);
+      SDOperand N1 = N.getOperand(1);
+      if (N.Val->getValueType(0) == PtrVT &&
+          N0.getOpcode() == X86ISD::Wrapper &&
+          N1.getOpcode() == ISD::Constant) {
+        unsigned Offset = (unsigned)cast<ConstantSDNode>(N1)->getValue();
+        SDOperand C(0, 0);
+        // TODO: handle ExternalSymbolSDNode.
+        if (GlobalAddressSDNode *G =
+            dyn_cast<GlobalAddressSDNode>(N0.getOperand(0))) {
+          C = CurDAG->getTargetGlobalAddress(G->getGlobal(), PtrVT,
+                                             G->getOffset() + Offset);
+        } else if (ConstantPoolSDNode *CP =
+                   dyn_cast<ConstantPoolSDNode>(N0.getOperand(0))) {
+          C = CurDAG->getTargetConstantPool(CP->getConstVal(), PtrVT,
+                                            CP->getAlignment(),
+                                            CP->getOffset()+Offset);
+        }
+
+        if (C.Val) {
+          if (Subtarget->is64Bit()) {
+            SDOperand Ops[] = { CurDAG->getRegister(0, PtrVT), getI8Imm(1),
+                                CurDAG->getRegister(0, PtrVT), C };
+            return CurDAG->SelectNodeTo(N.Val, X86::LEA64r, MVT::i64, Ops, 4);
+          } else
+            return CurDAG->SelectNodeTo(N.Val, X86::MOV32ri, PtrVT, C);
+        }
+      }
+
+      // Other cases are handled by auto-generated code.
+      break;
+    }
+
+    case ISD::MULHU:
+    case ISD::MULHS: {
+      if (Opcode == ISD::MULHU)
+        switch (NVT) {
+        default: assert(0 && "Unsupported VT!");
+        case MVT::i8:  Opc = X86::MUL8r;  MOpc = X86::MUL8m;  break;
+        case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
+        case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
+        case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
+        }
+      else
+        switch (NVT) {
+        default: assert(0 && "Unsupported VT!");
+        case MVT::i8:  Opc = X86::IMUL8r;  MOpc = X86::IMUL8m;  break;
+        case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
+        case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
+        case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
+        }
+
+      unsigned LoReg, HiReg;
+      switch (NVT) {
+      default: assert(0 && "Unsupported VT!");
+      case MVT::i8:  LoReg = X86::AL;  HiReg = X86::AH;  break;
+      case MVT::i16: LoReg = X86::AX;  HiReg = X86::DX;  break;
+      case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; break;
+      case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break;
+      }
+
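+      // The selected sequence copies the first operand into the low register,
+      // issues the one-operand MUL/IMUL against the second operand (folded
+      // from memory when possible), and then reads the high half of the
+      // product back out of AH/DX/EDX/RDX.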
+      SDOperand N0 = Node->getOperand(0);
+      SDOperand N1 = Node->getOperand(1);
+
+      bool foldedLoad = false;
+      SDOperand Tmp0, Tmp1, Tmp2, Tmp3;
+      foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3);
+      // MULHU and MULHS are commutative.
+      if (!foldedLoad) {
+        foldedLoad = TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3);
+        if (foldedLoad) {
+          N0 = Node->getOperand(1);
+          N1 = Node->getOperand(0);
+        }
+      }
+
+      SDOperand Chain;
+      if (foldedLoad) {
+        Chain = N1.getOperand(0);
+        AddToISelQueue(Chain);
+      } else
+        Chain = CurDAG->getEntryNode();
+
+      SDOperand InFlag(0, 0);
+      AddToISelQueue(N0);
+      Chain  = CurDAG->getCopyToReg(Chain, CurDAG->getRegister(LoReg, NVT),
+                                    N0, InFlag);
+      InFlag = Chain.getValue(1);
+
+      if (foldedLoad) {
+        AddToISelQueue(Tmp0);
+        AddToISelQueue(Tmp1);
+        AddToISelQueue(Tmp2);
+        AddToISelQueue(Tmp3);
+        SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Chain, InFlag };
+        SDNode *CNode =
+          CurDAG->getTargetNode(MOpc, MVT::Other, MVT::Flag, Ops, 6);
+        Chain  = SDOperand(CNode, 0);
+        InFlag = SDOperand(CNode, 1);
+      } else {
+        AddToISelQueue(N1);
+        InFlag =
+          SDOperand(CurDAG->getTargetNode(Opc, MVT::Flag, N1, InFlag), 0);
+      }
+
+      SDOperand Result = CurDAG->getCopyFromReg(Chain, HiReg, NVT, InFlag);
+      ReplaceUses(N.getValue(0), Result);
+      if (foldedLoad)
+        ReplaceUses(N1.getValue(1), Result.getValue(1));
+
+#ifndef NDEBUG
+      DOUT << std::string(Indent-2, ' ') << "=> ";
+      DEBUG(Result.Val->dump(CurDAG));
+      DOUT << "\n";
+      Indent -= 2;
+#endif
+      return NULL;
+    }
+      
+    case ISD::SDIV:
+    case ISD::UDIV:
+    case ISD::SREM:
+    case ISD::UREM: {
+      bool isSigned = Opcode == ISD::SDIV || Opcode == ISD::SREM;
+      bool isDiv    = Opcode == ISD::SDIV || Opcode == ISD::UDIV;
+      if (!isSigned)
+        switch (NVT) {
+        default: assert(0 && "Unsupported VT!");
+        case MVT::i8:  Opc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
+        case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
+        case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
+        case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
+        }
+      else
+        switch (NVT) {
+        default: assert(0 && "Unsupported VT!");
+        case MVT::i8:  Opc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
+        case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+        case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+        case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+        }
+
+      unsigned LoReg, HiReg;
+      unsigned ClrOpcode, SExtOpcode;
+      switch (NVT) {
+      default: assert(0 && "Unsupported VT!");
+      case MVT::i8:
+        LoReg = X86::AL;  HiReg = X86::AH;
+        ClrOpcode  = 0;
+        SExtOpcode = X86::CBW;
+        break;
+      case MVT::i16:
+        LoReg = X86::AX;  HiReg = X86::DX;
+        ClrOpcode  = X86::MOV16r0;
+        SExtOpcode = X86::CWD;
+        break;
+      case MVT::i32:
+        LoReg = X86::EAX; HiReg = X86::EDX;
+        ClrOpcode  = X86::MOV32r0;
+        SExtOpcode = X86::CDQ;
+        break;
+      case MVT::i64:
+        LoReg = X86::RAX; HiReg = X86::RDX;
+        ClrOpcode  = X86::MOV64r0;
+        SExtOpcode = X86::CQO;
+        break;
+      }
+
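+      // The dividend goes into the low register and is sign-extended
+      // (CBW/CWD/CDQ/CQO) or zero-extended into the high register; after
+      // DIV/IDIV the quotient sits in the low register and the remainder in
+      // the high register, and we copy out whichever one this node computes.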
+      SDOperand N0 = Node->getOperand(0);
+      SDOperand N1 = Node->getOperand(1);
+      SDOperand InFlag(0, 0);
+      if (NVT == MVT::i8 && !isSigned) {
+        // Special case for div8, just use a move with zero extension to AX to
+        // clear the upper 8 bits (AH).
+        SDOperand Tmp0, Tmp1, Tmp2, Tmp3, Move, Chain;
+        if (TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3)) {
+          SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, N0.getOperand(0) };
+          AddToISelQueue(N0.getOperand(0));
+          AddToISelQueue(Tmp0);
+          AddToISelQueue(Tmp1);
+          AddToISelQueue(Tmp2);
+          AddToISelQueue(Tmp3);
+          Move =
+            SDOperand(CurDAG->getTargetNode(X86::MOVZX16rm8, MVT::i16, MVT::Other,
+                                            Ops, 5), 0);
+          Chain = Move.getValue(1);
+          ReplaceUses(N0.getValue(1), Chain);
+        } else {
+          AddToISelQueue(N0);
+          Move =
+            SDOperand(CurDAG->getTargetNode(X86::MOVZX16rr8, MVT::i16, N0), 0);
+          Chain = CurDAG->getEntryNode();
+        }
+        Chain  = CurDAG->getCopyToReg(Chain, X86::AX, Move, InFlag);
+        InFlag = Chain.getValue(1);
+      } else {
+        AddToISelQueue(N0);
+        InFlag =
+          CurDAG->getCopyToReg(CurDAG->getEntryNode(), LoReg, N0,
+                               InFlag).getValue(1);
+        if (isSigned) {
+          // Sign extend the low part into the high part.
+          InFlag =
+            SDOperand(CurDAG->getTargetNode(SExtOpcode, MVT::Flag, InFlag), 0);
+        } else {
+          // Zero out the high part, effectively zero extending the input.
+          SDOperand ClrNode = SDOperand(CurDAG->getTargetNode(ClrOpcode, NVT), 0);
+          InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), HiReg, ClrNode,
+                                        InFlag).getValue(1);
+        }
+      }
+
+      SDOperand Tmp0, Tmp1, Tmp2, Tmp3, Chain;
+      bool foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3);
+      if (foldedLoad) {
+        AddToISelQueue(N1.getOperand(0));
+        AddToISelQueue(Tmp0);
+        AddToISelQueue(Tmp1);
+        AddToISelQueue(Tmp2);
+        AddToISelQueue(Tmp3);
+        SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, N1.getOperand(0), InFlag };
+        SDNode *CNode =
+          CurDAG->getTargetNode(MOpc, MVT::Other, MVT::Flag, Ops, 6);
+        Chain  = SDOperand(CNode, 0);
+        InFlag = SDOperand(CNode, 1);
+      } else {
+        AddToISelQueue(N1);
+        Chain = CurDAG->getEntryNode();
+        InFlag =
+          SDOperand(CurDAG->getTargetNode(Opc, MVT::Flag, N1, InFlag), 0);
+      }
+
+      SDOperand Result =
+        CurDAG->getCopyFromReg(Chain, isDiv ? LoReg : HiReg, NVT, InFlag);
+      ReplaceUses(N.getValue(0), Result);
+      if (foldedLoad)
+        ReplaceUses(N1.getValue(1), Result.getValue(1));
+
+#ifndef NDEBUG
+      DOUT << std::string(Indent-2, ' ') << "=> ";
+      DEBUG(Result.Val->dump(CurDAG));
+      DOUT << "\n";
+      Indent -= 2;
+#endif
+
+      return NULL;
+    }
+
+    case ISD::TRUNCATE: {
+      if (!Subtarget->is64Bit() && NVT == MVT::i8) {
+        unsigned Opc2;
+        MVT::ValueType VT;
+        switch (Node->getOperand(0).getValueType()) {
+        default: assert(0 && "Unknown truncate!");
+        case MVT::i16:
+          Opc = X86::MOV16to16_;
+          VT = MVT::i16;
+          Opc2 = X86::TRUNC_16_to8;
+          break;
+        case MVT::i32:
+          Opc = X86::MOV32to32_;
+          VT = MVT::i32;
+          Opc2 = X86::TRUNC_32_to8;
+          break;
+        }
+
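+        // e.g. an i32 -> i8 truncate is emitted as MOV32to32_ (a copy into a
+        // register class whose low 8 bits are addressable in 32-bit mode)
+        // followed by TRUNC_32_to8.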
+        AddToISelQueue(Node->getOperand(0));
+        SDOperand Tmp =
+          SDOperand(CurDAG->getTargetNode(Opc, VT, Node->getOperand(0)), 0);
+        SDNode *ResNode = CurDAG->getTargetNode(Opc2, NVT, Tmp);
+      
+#ifndef NDEBUG
+        DOUT << std::string(Indent-2, ' ') << "=> ";
+        DEBUG(ResNode->dump(CurDAG));
+        DOUT << "\n";
+        Indent -= 2;
+#endif
+        return ResNode;
+      }
+
+      break;
+    }
+  }
+
+  SDNode *ResNode = SelectCode(N);
+
+#ifndef NDEBUG
+  DOUT << std::string(Indent-2, ' ') << "=> ";
+  if (ResNode == NULL || ResNode == N.Val)
+    DEBUG(N.Val->dump(CurDAG));
+  else
+    DEBUG(ResNode->dump(CurDAG));
+  DOUT << "\n";
+  Indent -= 2;
+#endif
+
+  return ResNode;
+}
+
+bool X86DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDOperand &Op, char ConstraintCode,
+                             std::vector<SDOperand> &OutOps, SelectionDAG &DAG){
+  SDOperand Op0, Op1, Op2, Op3;
+  switch (ConstraintCode) {
+  case 'o':   // offsetable        ??
+  case 'v':   // not offsetable    ??
+  default: return true;
+  case 'm':   // memory
+    if (!SelectAddr(Op, Op, Op0, Op1, Op2, Op3))
+      return true;
+    break;
+  }
+  
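+  // Push the standard X86 memory operand tuple: base, scale, index and
+  // displacement.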
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  OutOps.push_back(Op2);
+  OutOps.push_back(Op3);
+  AddToISelQueue(Op0);
+  AddToISelQueue(Op1);
+  AddToISelQueue(Op2);
+  AddToISelQueue(Op3);
+  return false;
+}
+
+/// createX86ISelDag - This pass converts a legalized DAG into a 
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, bool Fast) {
+  return new X86DAGToDAGISel(TM, Fast);
+}
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
new file mode 100644
index 0000000..37dea79
--- /dev/null
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -0,0 +1,5094 @@
+//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+X86TargetLowering::X86TargetLowering(TargetMachine &TM)
+  : TargetLowering(TM) {
+  Subtarget = &TM.getSubtarget<X86Subtarget>();
+  X86ScalarSSE = Subtarget->hasSSE2();
+  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+
+  RegInfo = TM.getRegisterInfo();
+
+  // Set up the TargetLowering object.
+
+  // X86 is weird; it always uses i8 for shift amounts and setcc results.
+  setShiftAmountType(MVT::i8);
+  setSetCCResultType(MVT::i8);
+  setSetCCResultContents(ZeroOrOneSetCCResult);
+  setSchedulingPreference(SchedulingForRegPressure);
+  setShiftAmountFlavor(Mask);   // shl X, 32 == shl X, 0
+  setStackPointerRegisterToSaveRestore(X86StackPtr);
+
+  if (Subtarget->isTargetDarwin()) {
+    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
+    setUseUnderscoreSetJmp(false);
+    setUseUnderscoreLongJmp(false);
+  } else if (Subtarget->isTargetMingw()) {
+    // MS runtime is weird: it exports _setjmp, but longjmp!
+    setUseUnderscoreSetJmp(true);
+    setUseUnderscoreLongJmp(false);
+  } else {
+    setUseUnderscoreSetJmp(true);
+    setUseUnderscoreLongJmp(true);
+  }
+  
+  // Set up the register classes.
+  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
+  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
+  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
+  if (Subtarget->is64Bit())
+    addRegisterClass(MVT::i64, X86::GR64RegisterClass);
+
+  setLoadXAction(ISD::SEXTLOAD, MVT::i1, Expand);
+
+  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+  // operation.
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
+  } else {
+    if (X86ScalarSSE)
+      // If SSE i64 SINT_TO_FP is not available, expand i32 UINT_TO_FP.
+      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Expand);
+    else
+      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
+  }
+
+  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+  // this operation.
+  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
+  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
+  // SSE has no i16 to fp conversion, only i32
+  if (X86ScalarSSE)
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
+  else {
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
+  }
+
+  if (!Subtarget->is64Bit()) {
+    // Custom lower SINT_TO_FP and FP_TO_SINT from/to i64 in 32-bit mode.
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
+  }
+
+  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
+  // this operation.
+  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
+  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
+
+  if (X86ScalarSSE) {
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
+  } else {
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
+  }
+
+  // Handle FP_TO_UINT by promoting the destination to a larger signed
+  // conversion.
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
+    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
+  } else {
+    if (X86ScalarSSE && !Subtarget->hasSSE3())
+      // Expand FP_TO_UINT into a select.
+      // FIXME: We would like to use a Custom expander here eventually to do
+      // the optimal thing for SSE vs. the default expansion in the legalizer.
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
+    else
+      // With SSE3 we can use fisttpll to convert to a signed i64.
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
+  }
+
+  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
+  if (!X86ScalarSSE) {
+    setOperationAction(ISD::BIT_CONVERT      , MVT::f32  , Expand);
+    setOperationAction(ISD::BIT_CONVERT      , MVT::i32  , Expand);
+  }
+
+  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
+  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
+  setOperationAction(ISD::MEMMOVE          , MVT::Other, Expand);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
+  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
+  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
+
+  setOperationAction(ISD::CTPOP            , MVT::i8   , Expand);
+  setOperationAction(ISD::CTTZ             , MVT::i8   , Expand);
+  setOperationAction(ISD::CTLZ             , MVT::i8   , Expand);
+  setOperationAction(ISD::CTPOP            , MVT::i16  , Expand);
+  setOperationAction(ISD::CTTZ             , MVT::i16  , Expand);
+  setOperationAction(ISD::CTLZ             , MVT::i16  , Expand);
+  setOperationAction(ISD::CTPOP            , MVT::i32  , Expand);
+  setOperationAction(ISD::CTTZ             , MVT::i32  , Expand);
+  setOperationAction(ISD::CTLZ             , MVT::i32  , Expand);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::CTPOP          , MVT::i64  , Expand);
+    setOperationAction(ISD::CTTZ           , MVT::i64  , Expand);
+    setOperationAction(ISD::CTLZ           , MVT::i64  , Expand);
+  }
+
+  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
+  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
+
+  // These should be promoted to a larger select which is supported.
+  setOperationAction(ISD::SELECT           , MVT::i1   , Promote);
+  setOperationAction(ISD::SELECT           , MVT::i8   , Promote);
+  // X86 wants to expand cmov itself.
+  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
+    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
+  }
+  // X86 ret instruction may pop stack.
+  setOperationAction(ISD::RET             , MVT::Other, Custom);
+  if (!Subtarget->is64Bit())
+    setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
+
+  // Darwin ABI issue.
+  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
+  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
+  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
+  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
+    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
+    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
+    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
+  }
+  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
+  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
+  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
+  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
+  // X86 wants to expand memset / memcpy itself.
+  setOperationAction(ISD::MEMSET          , MVT::Other, Custom);
+  setOperationAction(ISD::MEMCPY          , MVT::Other, Custom);
+
+  // We don't have line number support yet.
+  setOperationAction(ISD::LOCATION, MVT::Other, Expand);
+  setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+  // FIXME - use subtarget debug flags
+  if (!Subtarget->isTargetDarwin() &&
+      !Subtarget->isTargetELF() &&
+      !Subtarget->isTargetCygMing())
+    setOperationAction(ISD::LABEL, MVT::Other, Expand);
+
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
+  if (Subtarget->is64Bit()) {
+    // FIXME: Verify
+    setExceptionPointerRegister(X86::RAX);
+    setExceptionSelectorRegister(X86::RDX);
+  } else {
+    setExceptionPointerRegister(X86::EAX);
+    setExceptionSelectorRegister(X86::EDX);
+  }
+  
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+  setOperationAction(ISD::VAARG             , MVT::Other, Expand);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
+  else
+    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
+
+  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+  if (Subtarget->isTargetCygMing())
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+  else
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
+  if (X86ScalarSSE) {
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
+    addRegisterClass(MVT::f64, X86::FR64RegisterClass);
+
+    // Use ANDPD to simulate FABS.
+    setOperationAction(ISD::FABS , MVT::f64, Custom);
+    setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+    // Use XORP to simulate FNEG.
+    setOperationAction(ISD::FNEG , MVT::f64, Custom);
+    setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+    // Use ANDPD and ORPD to simulate FCOPYSIGN.
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+    // We don't support sin/cos/fmod
+    setOperationAction(ISD::FSIN , MVT::f64, Expand);
+    setOperationAction(ISD::FCOS , MVT::f64, Expand);
+    setOperationAction(ISD::FREM , MVT::f64, Expand);
+    setOperationAction(ISD::FSIN , MVT::f32, Expand);
+    setOperationAction(ISD::FCOS , MVT::f32, Expand);
+    setOperationAction(ISD::FREM , MVT::f32, Expand);
+
+    // Expand FP immediates into loads from the stack, except for the special
+    // cases we handle.
+    setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+    setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+    addLegalFPImmediate(+0.0); // xorps / xorpd
+  } else {
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
+
+    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
+    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+    setOperationAction(ISD::FP_ROUND,  MVT::f32, Expand);
+
+    if (!UnsafeFPMath) {
+      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
+      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
+    }
+
+    setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+    setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+    addLegalFPImmediate(+0.0); // FLD0
+    addLegalFPImmediate(+1.0); // FLD1
+    addLegalFPImmediate(-0.0); // FLD0/FCHS
+    addLegalFPImmediate(-1.0); // FLD1/FCHS
+  }
+
+  // First set operation action for all vector types to expand. Then we
+  // will selectively turn on ones that can be effectively codegen'd.
+  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+    setOperationAction(ISD::ADD , (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::SUB , (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FADD, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FNEG, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FSUB, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::MUL , (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FMUL, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::SDIV, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::UDIV, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FDIV, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::SREM, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::UREM, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::LOAD, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FABS, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FSIN, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FCOS, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FREM, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FPOWI, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FSQRT, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FCOPYSIGN, (MVT::ValueType)VT, Expand);
+  }
+
+  if (Subtarget->hasMMX()) {
+    addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass);
+    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
+    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
+    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);
+
+    // FIXME: add MMX packed arithmetic
+
+    setOperationAction(ISD::ADD,                MVT::v8i8,  Legal);
+    setOperationAction(ISD::ADD,                MVT::v4i16, Legal);
+    setOperationAction(ISD::ADD,                MVT::v2i32, Legal);
+    setOperationAction(ISD::ADD,                MVT::v1i64, Legal);
+
+    setOperationAction(ISD::SUB,                MVT::v8i8,  Legal);
+    setOperationAction(ISD::SUB,                MVT::v4i16, Legal);
+    setOperationAction(ISD::SUB,                MVT::v2i32, Legal);
+
+    setOperationAction(ISD::MULHS,              MVT::v4i16, Legal);
+    setOperationAction(ISD::MUL,                MVT::v4i16, Legal);
+
+    setOperationAction(ISD::AND,                MVT::v8i8,  Promote);
+    AddPromotedToType (ISD::AND,                MVT::v8i8,  MVT::v1i64);
+    setOperationAction(ISD::AND,                MVT::v4i16, Promote);
+    AddPromotedToType (ISD::AND,                MVT::v4i16, MVT::v1i64);
+    setOperationAction(ISD::AND,                MVT::v2i32, Promote);
+    AddPromotedToType (ISD::AND,                MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::AND,                MVT::v1i64, Legal);
+
+    setOperationAction(ISD::OR,                 MVT::v8i8,  Promote);
+    AddPromotedToType (ISD::OR,                 MVT::v8i8,  MVT::v1i64);
+    setOperationAction(ISD::OR,                 MVT::v4i16, Promote);
+    AddPromotedToType (ISD::OR,                 MVT::v4i16, MVT::v1i64);
+    setOperationAction(ISD::OR,                 MVT::v2i32, Promote);
+    AddPromotedToType (ISD::OR,                 MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::OR,                 MVT::v1i64, Legal);
+
+    setOperationAction(ISD::XOR,                MVT::v8i8,  Promote);
+    AddPromotedToType (ISD::XOR,                MVT::v8i8,  MVT::v1i64);
+    setOperationAction(ISD::XOR,                MVT::v4i16, Promote);
+    AddPromotedToType (ISD::XOR,                MVT::v4i16, MVT::v1i64);
+    setOperationAction(ISD::XOR,                MVT::v2i32, Promote);
+    AddPromotedToType (ISD::XOR,                MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::XOR,                MVT::v1i64, Legal);
+
+    setOperationAction(ISD::LOAD,               MVT::v8i8,  Promote);
+    AddPromotedToType (ISD::LOAD,               MVT::v8i8,  MVT::v1i64);
+    setOperationAction(ISD::LOAD,               MVT::v4i16, Promote);
+    AddPromotedToType (ISD::LOAD,               MVT::v4i16, MVT::v1i64);
+    setOperationAction(ISD::LOAD,               MVT::v2i32, Promote);
+    AddPromotedToType (ISD::LOAD,               MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::LOAD,               MVT::v1i64, Legal);
+
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i8,  Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i64, Custom);
+
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i8,  Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i16, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v1i64, Custom);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Custom);
+  }
+
+  if (Subtarget->hasSSE1()) {
+    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
+
+    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
+    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
+    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
+  }
+
+  if (Subtarget->hasSSE2()) {
+    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
+
+    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
+    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
+    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
+    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
+    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
+    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
+    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
+    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
+    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
+    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
+    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
+    // Implement v4f32 insert_vector_elt in terms of SSE2 v8i16 ones.
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
+
+    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+    for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
+      setOperationAction(ISD::BUILD_VECTOR,        (MVT::ValueType)VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,      (MVT::ValueType)VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT,  (MVT::ValueType)VT, Custom);
+    }
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+
+    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
+    for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
+      setOperationAction(ISD::AND,    (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::AND,    (MVT::ValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::OR,     (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::OR,     (MVT::ValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::XOR,    (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::XOR,    (MVT::ValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::LOAD,   (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::LOAD,   (MVT::ValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::SELECT, (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::SELECT, (MVT::ValueType)VT, MVT::v2i64);
+    }
+
+    // Custom lower v2i64 and v2f64 selects.
+    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
+    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
+  }
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+  setTargetDAGCombine(ISD::SELECT);
+
+  computeRegisterProperties();
+
+  // FIXME: These should be based on subtarget info. Also, the values should
+  // be smaller when optimizing for size.
+  maxStoresPerMemset = 16; // For %llvm.memset -> sequence of stores
+  maxStoresPerMemcpy = 16; // For %llvm.memcpy -> sequence of stores
+  maxStoresPerMemmove = 16; // For %llvm.memmove -> sequence of stores
+  allowUnalignedMemoryAccesses = true; // x86 supports it!
+}
+
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "X86GenCallingConv.inc"
+    
+/// LowerRET - Lower an ISD::RET node.
+SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
+  assert((Op.getNumOperands() & 1) == 1 && "ISD::RET should have odd # args");
+  
+  SmallVector<CCValAssign, 16> RVLocs;
+  unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
+  bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+  CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
+  CCInfo.AnalyzeReturn(Op.Val, RetCC_X86);
+  
+  
+  // If this is the first return lowered for this function, add the regs to the
+  // liveout set for the function.
+  if (DAG.getMachineFunction().liveout_empty()) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i)
+      if (RVLocs[i].isRegLoc())
+        DAG.getMachineFunction().addLiveOut(RVLocs[i].getLocReg());
+  }
+  
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand Flag;
+  
+  // Copy the result values into the output registers.
+  if (RVLocs.size() != 1 || !RVLocs[0].isRegLoc() ||
+      RVLocs[0].getLocReg() != X86::ST0) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i) {
+      CCValAssign &VA = RVLocs[i];
+      assert(VA.isRegLoc() && "Can only return in registers!");
+      Chain = DAG.getCopyToReg(Chain, VA.getLocReg(), Op.getOperand(i*2+1),
+                               Flag);
+      Flag = Chain.getValue(1);
+    }
+  } else {
+    // We need to handle a destination of ST0 specially, because it isn't really
+    // a register.
+    SDOperand Value = Op.getOperand(1);
+    
+    // If this is an FP return with ScalarSSE, we need to move the value from
+    // an XMM register onto the fp-stack.
+    if (X86ScalarSSE) {
+      SDOperand MemLoc;
+      
+      // If this is a load into a scalar-SSE value, don't store the loaded
+      // value back to the stack only to reload it: just reuse the memory
+      // operand of the scalar-SSE load.
+      if (ISD::isNON_EXTLoad(Value.Val) &&
+          (Chain == Value.getValue(1) || Chain == Value.getOperand(0))) {
+        Chain  = Value.getOperand(0);
+        MemLoc = Value.getOperand(1);
+      } else {
+        // Spill the value to memory and reload it into top of stack.
+        unsigned Size = MVT::getSizeInBits(RVLocs[0].getValVT())/8;
+        MachineFunction &MF = DAG.getMachineFunction();
+        int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
+        MemLoc = DAG.getFrameIndex(SSFI, getPointerTy());
+        Chain = DAG.getStore(Op.getOperand(0), Value, MemLoc, NULL, 0);
+      }
+      SDVTList Tys = DAG.getVTList(RVLocs[0].getValVT(), MVT::Other);
+      SDOperand Ops[] = {Chain, MemLoc, DAG.getValueType(RVLocs[0].getValVT())};
+      Value = DAG.getNode(X86ISD::FLD, Tys, Ops, 3);
+      Chain = Value.getValue(1);
+    }
+    
+    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+    SDOperand Ops[] = { Chain, Value };
+    Chain = DAG.getNode(X86ISD::FP_SET_RESULT, Tys, Ops, 2);
+    Flag = Chain.getValue(1);
+  }
+  
+  SDOperand BytesToPop = DAG.getConstant(getBytesToPopOnReturn(), MVT::i16);
+  if (Flag.Val)
+    return DAG.getNode(X86ISD::RET_FLAG, MVT::Other, Chain, BytesToPop, Flag);
+  else
+    return DAG.getNode(X86ISD::RET_FLAG, MVT::Other, Chain, BytesToPop);
+}
+
+
+/// LowerCallResult - Lower the result values of an ISD::CALL into the
+/// appropriate copies out of appropriate physical registers.  This assumes that
+/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// being lowered.  This returns an SDNode with the same number of values as
+/// the ISD::CALL.
+SDNode *X86TargetLowering::
+LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall, 
+                unsigned CallingConv, SelectionDAG &DAG) {
+  
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  bool isVarArg = cast<ConstantSDNode>(TheCall->getOperand(2))->getValue() != 0;
+  CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
+  CCInfo.AnalyzeCallResult(TheCall, RetCC_X86);
+
+  
+  SmallVector<SDOperand, 8> ResultVals;
+  
+  // Copy all of the result registers out of their specified physreg.
+  if (RVLocs.size() != 1 || RVLocs[0].getLocReg() != X86::ST0) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i) {
+      Chain = DAG.getCopyFromReg(Chain, RVLocs[i].getLocReg(),
+                                 RVLocs[i].getValVT(), InFlag).getValue(1);
+      InFlag = Chain.getValue(2);
+      ResultVals.push_back(Chain.getValue(0));
+    }
+  } else {
+    // Copies from the FP stack are special, as ST0 isn't a valid register
+    // before the fp stackifier runs.
+    
+    // Copy ST0 into an RFP register with FP_GET_RESULT.
+    SDVTList Tys = DAG.getVTList(RVLocs[0].getValVT(), MVT::Other, MVT::Flag);
+    SDOperand GROps[] = { Chain, InFlag };
+    SDOperand RetVal = DAG.getNode(X86ISD::FP_GET_RESULT, Tys, GROps, 2);
+    Chain  = RetVal.getValue(1);
+    InFlag = RetVal.getValue(2);
+    
+    // If we are using ScalarSSE, store ST(0) to the stack and reload it into
+    // an XMM register.
+    if (X86ScalarSSE) {
+      // FIXME: Currently the FST is flagged to the FP_GET_RESULT. This
+      // shouldn't be necessary except that RFP cannot be live across
+      // multiple blocks. When stackifier is fixed, they can be uncoupled.
+      MachineFunction &MF = DAG.getMachineFunction();
+      int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
+      SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+      SDOperand Ops[] = {
+        Chain, RetVal, StackSlot, DAG.getValueType(RVLocs[0].getValVT()), InFlag
+      };
+      Chain = DAG.getNode(X86ISD::FST, MVT::Other, Ops, 5);
+      RetVal = DAG.getLoad(RVLocs[0].getValVT(), Chain, StackSlot, NULL, 0);
+      Chain = RetVal.getValue(1);
+    }
+    ResultVals.push_back(RetVal);
+  }
+  
+  // Merge everything together with a MERGE_VALUES node.
+  ResultVals.push_back(Chain);
+  return DAG.getNode(ISD::MERGE_VALUES, TheCall->getVTList(),
+                     &ResultVals[0], ResultVals.size()).Val;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                C & StdCall Calling Convention implementation
+//===----------------------------------------------------------------------===//
+//  The StdCall calling convention is the standard convention for many Windows
+//  API routines. It differs from the C calling convention only slightly: the
+//  callee, not the caller, cleans up the stack. Symbols are also decorated in
+//  a particular way. It doesn't support any vector arguments.
+
+/// AddLiveIn - This helper function adds the specified physical register to the
+/// MachineFunction as a live in value.  It also creates a corresponding virtual
+/// register for it.
+static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
+                          const TargetRegisterClass *RC) {
+  assert(RC->contains(PReg) && "Not the correct regclass!");
+  unsigned VReg = MF.getSSARegMap()->createVirtualRegister(RC);
+  MF.addLiveIn(PReg, VReg);
+  return VReg;
+}
+
+SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
+                                               bool isStdCall) {
+  unsigned NumArgs = Op.Val->getNumValues() - 1;
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  SDOperand Root = Op.getOperand(0);
+  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+                 getTargetMachine(), ArgLocs);
+  CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_C);
+   
+  SmallVector<SDOperand, 8> ArgValues;
+  unsigned LastVal = ~0U;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+    // places.
+    assert(VA.getValNo() != LastVal &&
+           "Don't support value assigned to multiple locs yet");
+    LastVal = VA.getValNo();
+    
+    if (VA.isRegLoc()) {
+      MVT::ValueType RegVT = VA.getLocVT();
+      TargetRegisterClass *RC;
+      if (RegVT == MVT::i32)
+        RC = X86::GR32RegisterClass;
+      else {
+        assert(MVT::isVector(RegVT));
+        RC = X86::VR128RegisterClass;
+      }
+      
+      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
+      SDOperand ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);
+      
+      // If this is an 8 or 16-bit value, it is really passed promoted to 32
+      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
+      // right size.
+      if (VA.getLocInfo() == CCValAssign::SExt)
+        ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::ZExt)
+        ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      
+      if (VA.getLocInfo() != CCValAssign::Full)
+        ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);
+      
+      ArgValues.push_back(ArgValue);
+    } else {
+      assert(VA.isMemLoc());
+      
+      // Create the nodes corresponding to a load from this parameter slot.
+      int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
+                                      VA.getLocMemOffset());
+      SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
+      ArgValues.push_back(DAG.getLoad(VA.getValVT(), Root, FIN, NULL, 0));
+    }
+  }
+  
+  unsigned StackSize = CCInfo.getNextStackOffset();
+
+  ArgValues.push_back(Root);
+
+  // If the function takes a variable number of arguments, make a frame index
+  // for the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg)
+    VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
+
+  if (isStdCall && !isVarArg) {
+    BytesToPopOnReturn  = StackSize;    // Callee pops everything.
+    BytesCallerReserves = 0;
+  } else {
+    BytesToPopOnReturn  = 0; // Callee pops nothing.
+    
+    // If this is an sret function, the return should pop the hidden pointer.
+    if (NumArgs &&
+        (cast<ConstantSDNode>(Op.getOperand(3))->getValue() &
+         ISD::ParamFlags::StructReturn))
+      BytesToPopOnReturn = 4;  
+    
+    BytesCallerReserves = StackSize;
+  }
+  
+  RegSaveFrameIndex = 0xAAAAAAA;  // X86-64 only.
+  ReturnAddrIndex = 0;            // No return address slot generated yet.
+
+  MF.getInfo<X86MachineFunctionInfo>()
+    ->setBytesToPopOnReturn(BytesToPopOnReturn);
+
+  // Return the new list of results.
+  return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
+                     &ArgValues[0], ArgValues.size()).getValue(Op.ResNo);
+}
+
+SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG,
+                                            unsigned CC) {
+  SDOperand Chain     = Op.getOperand(0);
+  bool isVarArg       = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+  bool isTailCall     = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+  SDOperand Callee    = Op.getOperand(4);
+  unsigned NumOps     = (Op.getNumOperands() - 5) / 2;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+  CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_C);
+  
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
+
+  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+  SmallVector<SDOperand, 8> MemOpChains;
+
+  SDOperand StackPtr;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+    
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: assert(0 && "Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+      break;
+    }
+    
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+      if (StackPtr.Val == 0)
+        StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
+      SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
+      PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
+      MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
+    }
+  }
+
+  // If the first argument is an sret pointer, remember it.
+  bool isSRet = NumOps &&
+    (cast<ConstantSDNode>(Op.getOperand(6))->getValue() &
+     ISD::ParamFlags::StructReturn);
+  
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into registers.
+  SDOperand InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // ELF / PIC requires the GOT pointer to be in the EBX register before
+  // function calls via the PLT.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      Subtarget->isPICStyleGOT()) {
+    Chain = DAG.getCopyToReg(Chain, X86::EBX,
+                             DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  }
+  
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    // We should use extra load for direct calls to dllimported functions in
+    // non-JIT mode.
+    if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
+                                        getTargetMachine(), true))
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDOperand, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add an implicit use GOT pointer in EBX.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      Subtarget->isPICStyleGOT())
+    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
+  
+  if (InFlag.Val)
+    Ops.push_back(InFlag);
+
+  Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+                      NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  unsigned NumBytesForCalleeToPush = 0;
+
+  if (CC == CallingConv::X86_StdCall) {
+    if (isVarArg)
+      NumBytesForCalleeToPush = isSRet ? 4 : 0;
+    else
+      NumBytesForCalleeToPush = NumBytes;
+  } else {
+    // If this is a call to a struct-return function, the callee
+    // pops the hidden struct pointer, so we have to push it back.
+    // This is common for Darwin/X86, Linux & Mingw32 targets.
+    NumBytesForCalleeToPush = isSRet ? 4 : 0;
+  }
+  
+  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  Ops.clear();
+  Ops.push_back(Chain);
+  Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
+  Ops.push_back(DAG.getConstant(NumBytesForCalleeToPush, getPointerTy()));
+  Ops.push_back(InFlag);
+  Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                   FastCall Calling Convention implementation
+//===----------------------------------------------------------------------===//
+//
+// The X86 'fastcall' calling convention passes up to two integer arguments in
+// registers (an appropriate portion of ECX/EDX), passes arguments in C order,
+// requires that the callee pop its arguments off the stack (allowing proper
+// tail calls), and has the same return value conventions as the C calling
+// convention.
+//
+// This calling convention always arranges for the callee pop value to be 8n+4
+// bytes, which is needed for tail recursion elimination and stack alignment
+// reasons.
+SDOperand
+X86TargetLowering::LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  SDOperand Root = Op.getOperand(0);
+  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+                 getTargetMachine(), ArgLocs);
+  CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_FastCall);
+  
+  SmallVector<SDOperand, 8> ArgValues;
+  unsigned LastVal = ~0U;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+    // places.
+    assert(VA.getValNo() != LastVal &&
+           "Don't support value assigned to multiple locs yet");
+    LastVal = VA.getValNo();
+    
+    if (VA.isRegLoc()) {
+      MVT::ValueType RegVT = VA.getLocVT();
+      TargetRegisterClass *RC;
+      if (RegVT == MVT::i32)
+        RC = X86::GR32RegisterClass;
+      else {
+        assert(MVT::isVector(RegVT));
+        RC = X86::VR128RegisterClass;
+      }
+      
+      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
+      SDOperand ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);
+      
+      // If this is an 8 or 16-bit value, it is really passed promoted to 32
+      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
+      // right size.
+      if (VA.getLocInfo() == CCValAssign::SExt)
+        ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::ZExt)
+        ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      
+      if (VA.getLocInfo() != CCValAssign::Full)
+        ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);
+      
+      ArgValues.push_back(ArgValue);
+    } else {
+      assert(VA.isMemLoc());
+      
+      // Create the nodes corresponding to a load from this parameter slot.
+      int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
+                                      VA.getLocMemOffset());
+      SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
+      ArgValues.push_back(DAG.getLoad(VA.getValVT(), Root, FIN, NULL, 0));
+    }
+  }
+  
+  ArgValues.push_back(Root);
+
+  unsigned StackSize = CCInfo.getNextStackOffset();
+
+  if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
+    // Make sure the stack argument size is 8n+4 bytes so that the arguments
+    // stay aligned once the return address has been pushed.
+    if ((StackSize & 7) == 0)
+      StackSize += 4;
+  }
+
+  VarArgsFrameIndex = 0xAAAAAAA;   // fastcc functions can't have varargs.
+  RegSaveFrameIndex = 0xAAAAAAA;   // X86-64 only.
+  ReturnAddrIndex = 0;             // No return address slot generated yet.
+  BytesToPopOnReturn = StackSize;  // Callee pops all stack arguments.
+  BytesCallerReserves = 0;
+
+  MF.getInfo<X86MachineFunctionInfo>()
+    ->setBytesToPopOnReturn(BytesToPopOnReturn);
+
+  // Return the new list of results.
+  return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
+                     &ArgValues[0], ArgValues.size()).getValue(Op.ResNo);
+}
+
+SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
+                                               unsigned CC) {
+  SDOperand Chain     = Op.getOperand(0);
+  bool isTailCall     = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+  bool isVarArg       = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+  SDOperand Callee    = Op.getOperand(4);
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+  CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_FastCall);
+  
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
+    // Make sure the stack argument size is 8n+4 bytes so that the arguments
+    // stay aligned once the return address has been pushed.
+    if ((NumBytes & 7) == 0)
+      NumBytes += 4;
+  }
+
+  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
+  
+  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+  SmallVector<SDOperand, 8> MemOpChains;
+  
+  SDOperand StackPtr;
+  
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+    
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+      default: assert(0 && "Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::AExt:
+        Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+        break;
+    }
+    
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+      if (StackPtr.Val == 0)
+        StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
+      SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
+      PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
+      MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into registers.
+  SDOperand InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    // We should use extra load for direct calls to dllimported functions in
+    // non-JIT mode.
+    if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
+                                        getTargetMachine(), true))
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+
+  // ELF / PIC requires GOT in the EBX register before function calls via PLT
+  // GOT pointer.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      Subtarget->isPICStyleGOT()) {
+    Chain = DAG.getCopyToReg(Chain, X86::EBX,
+                             DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDOperand, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add an implicit use GOT pointer in EBX.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      Subtarget->isPICStyleGOT())
+    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
+
+  if (InFlag.Val)
+    Ops.push_back(InFlag);
+
+  // FIXME: Do not generate X86ISD::TAILCALL for now.
+  Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+                      NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Returns a flag for retval copy to use.
+  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  Ops.clear();
+  Ops.push_back(Chain);
+  Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
+  Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
+  Ops.push_back(InFlag);
+  Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                 X86-64 C Calling Convention implementation
+//===----------------------------------------------------------------------===//
+
+SDOperand
+X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  SDOperand Root = Op.getOperand(0);
+  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+
+  static const unsigned GPR64ArgRegs[] = {
+    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8,  X86::R9
+  };
+  static const unsigned XMMArgRegs[] = {
+    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+  };
+
+  
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+                 getTargetMachine(), ArgLocs);
+  CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_64_C);
+  
+  SmallVector<SDOperand, 8> ArgValues;
+  unsigned LastVal = ~0U;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+    // places.
+    assert(VA.getValNo() != LastVal &&
+           "Don't support value assigned to multiple locs yet");
+    LastVal = VA.getValNo();
+    
+    if (VA.isRegLoc()) {
+      MVT::ValueType RegVT = VA.getLocVT();
+      TargetRegisterClass *RC;
+      if (RegVT == MVT::i32)
+        RC = X86::GR32RegisterClass;
+      else if (RegVT == MVT::i64)
+        RC = X86::GR64RegisterClass;
+      else if (RegVT == MVT::f32)
+        RC = X86::FR32RegisterClass;
+      else if (RegVT == MVT::f64)
+        RC = X86::FR64RegisterClass;
+      else {
+        assert(MVT::isVector(RegVT));
+        if (MVT::getSizeInBits(RegVT) == 64) {
+          RC = X86::GR64RegisterClass;       // MMX values are passed in GPRs.
+          RegVT = MVT::i64;
+        } else
+          RC = X86::VR128RegisterClass;
+      }
+
+      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
+      SDOperand ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);
+      
+      // If this is an 8 or 16-bit value, it is really passed promoted to 32
+      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
+      // right size.
+      if (VA.getLocInfo() == CCValAssign::SExt)
+        ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::ZExt)
+        ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      
+      if (VA.getLocInfo() != CCValAssign::Full)
+        ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);
+      
+      // Handle MMX values passed in GPRs.
+      if (RegVT != VA.getLocVT() && RC == X86::GR64RegisterClass &&
+          MVT::getSizeInBits(RegVT) == 64)
+        ArgValue = DAG.getNode(ISD::BIT_CONVERT, VA.getLocVT(), ArgValue);
+      
+      ArgValues.push_back(ArgValue);
+    } else {
+      assert(VA.isMemLoc());
+    
+      // Create the nodes corresponding to a load from this parameter slot.
+      int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
+                                      VA.getLocMemOffset());
+      SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
+      ArgValues.push_back(DAG.getLoad(VA.getValVT(), Root, FIN, NULL, 0));
+    }
+  }
+  
+  unsigned StackSize = CCInfo.getNextStackOffset();
+  
+  // If the function takes a variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 6);
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+    
+    // For X86-64, if there are vararg parameters that are passed via
+    // registers, then we must store them to their spots on the stack so they
+    // may be loaded by dereferencing the result of va_next.
+    VarArgsGPOffset = NumIntRegs * 8;
+    VarArgsFPOffset = 6 * 8 + NumXMMRegs * 16;
+    VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
+    RegSaveFrameIndex = MFI->CreateStackObject(6 * 8 + 8 * 16, 16);
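+    // The register save area holds 6 x 8-byte GPR slots followed by
+    // 8 x 16-byte XMM slots (176 bytes total); VarArgsGPOffset and
+    // VarArgsFPOffset locate the first unused slot of each kind within it.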
+
+    // Store the integer parameter registers.
+    SmallVector<SDOperand, 8> MemOps;
+    SDOperand RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
+    SDOperand FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
+                              DAG.getConstant(VarArgsGPOffset, getPointerTy()));
+    for (; NumIntRegs != 6; ++NumIntRegs) {
+      unsigned VReg = AddLiveIn(MF, GPR64ArgRegs[NumIntRegs],
+                                X86::GR64RegisterClass);
+      SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::i64);
+      SDOperand Store = DAG.getStore(Val.getValue(1), Val, FIN, NULL, 0);
+      MemOps.push_back(Store);
+      FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
+                        DAG.getConstant(8, getPointerTy()));
+    }
+
+    // Now store the XMM (fp + vector) parameter registers.
+    FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
+                      DAG.getConstant(VarArgsFPOffset, getPointerTy()));
+    for (; NumXMMRegs != 8; ++NumXMMRegs) {
+      unsigned VReg = AddLiveIn(MF, XMMArgRegs[NumXMMRegs],
+                                X86::VR128RegisterClass);
+      SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::v4f32);
+      SDOperand Store = DAG.getStore(Val.getValue(1), Val, FIN, NULL, 0);
+      MemOps.push_back(Store);
+      FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
+                        DAG.getConstant(16, getPointerTy()));
+    }
+    if (!MemOps.empty())
+        Root = DAG.getNode(ISD::TokenFactor, MVT::Other,
+                           &MemOps[0], MemOps.size());
+  }
+
+  ArgValues.push_back(Root);
+
+  ReturnAddrIndex = 0;     // No return address slot generated yet.
+  BytesToPopOnReturn = 0;  // Callee pops nothing.
+  BytesCallerReserves = StackSize;
+
+  // Return the new list of results.
+  return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
+                     &ArgValues[0], ArgValues.size()).getValue(Op.ResNo);
+}
+
+SDOperand
+X86TargetLowering::LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,
+                                        unsigned CC) {
+  SDOperand Chain     = Op.getOperand(0);
+  bool isVarArg       = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+  bool isTailCall     = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+  SDOperand Callee    = Op.getOperand(4);
+  
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+  CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_64_C);
+    
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
+
+  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+  SmallVector<SDOperand, 8> MemOpChains;
+
+  SDOperand StackPtr;
+  
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+    
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: assert(0 && "Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+      break;
+    }
+    
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+      if (StackPtr.Val == 0)
+        StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
+      SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
+      PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
+      MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
+    }
+  }
+  
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into registers.
+  SDOperand InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isVarArg) {
+    // From AMD64 ABI document:
+    // For calls that may call functions that use varargs or stdargs
+    // (prototype-less calls or calls to functions containing ellipsis (...) in
+    // the declaration) %al is used as a hidden argument to specify the number
+    // of SSE registers used. The contents of %al do not need to match exactly
+    // the number of registers, but must be an upper bound on the number of
+    // SSE registers used and must be in the range 0 - 8 inclusive.
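+    // For example, a call that passes two double arguments in XMM0 and XMM1
+    // sets %al to 2 here; any value from 2 through 8 would also satisfy the
+    // ABI requirement.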
+    
+    // Count the number of XMM registers allocated.
+    static const unsigned XMMArgRegs[] = {
+      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+    };
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+    
+    Chain = DAG.getCopyToReg(Chain, X86::AL,
+                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    // We should use extra load for direct calls to dllimported functions in
+    // non-JIT mode.
+    if (getTargetMachine().getCodeModel() != CodeModel::Large
+        && !Subtarget->GVRequiresExtraLoad(G->getGlobal(),
+                                           getTargetMachine(), true))
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+    if (getTargetMachine().getCodeModel() != CodeModel::Large)
+      Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDOperand, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.Val)
+    Ops.push_back(InFlag);
+
+  // FIXME: Do not generate X86ISD::TAILCALL for now.
+  Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+                      NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Returns a flag for retval copy to use.
+  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  Ops.clear();
+  Ops.push_back(Chain);
+  Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
+  Ops.push_back(DAG.getConstant(0, getPointerTy()));
+  Ops.push_back(InFlag);
+  Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+  
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                           Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+
+SDOperand X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
+  if (ReturnAddrIndex == 0) {
+    // Set up a frame object for the return address.
+    MachineFunction &MF = DAG.getMachineFunction();
+    if (Subtarget->is64Bit())
+      ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(8, -8);
+    else
+      ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(4, -4);
+  }
+
+  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+}
+
+
+
+/// translateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86
+/// specific condition code. It returns false if it cannot do a direct
+/// translation. X86CC is the translated CondCode.  LHS/RHS are modified as
+/// needed.
+static bool translateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
+                           unsigned &X86CC, SDOperand &LHS, SDOperand &RHS,
+                           SelectionDAG &DAG) {
+  X86CC = X86::COND_INVALID;
+  if (!isFP) {
+    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
+        // X > -1  ->  compare X against 0 and jump if sign is not set.
+        RHS = DAG.getConstant(0, RHS.getValueType());
+        X86CC = X86::COND_NS;
+        return true;
+      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+        // X < 0  ->  compare X against 0 and jump if sign is set.
+        X86CC = X86::COND_S;
+        return true;
+      }
+    }
+
+    switch (SetCCOpcode) {
+    default: break;
+    case ISD::SETEQ:  X86CC = X86::COND_E;  break;
+    case ISD::SETGT:  X86CC = X86::COND_G;  break;
+    case ISD::SETGE:  X86CC = X86::COND_GE; break;
+    case ISD::SETLT:  X86CC = X86::COND_L;  break;
+    case ISD::SETLE:  X86CC = X86::COND_LE; break;
+    case ISD::SETNE:  X86CC = X86::COND_NE; break;
+    case ISD::SETULT: X86CC = X86::COND_B;  break;
+    case ISD::SETUGT: X86CC = X86::COND_A;  break;
+    case ISD::SETULE: X86CC = X86::COND_BE; break;
+    case ISD::SETUGE: X86CC = X86::COND_AE; break;
+    }
+  } else {
+    // On a floating point condition, the flags are set as follows:
+    // ZF  PF  CF   op
+    //  0 | 0 | 0 | X > Y
+    //  0 | 0 | 1 | X < Y
+    //  1 | 0 | 0 | X == Y
+    //  1 | 1 | 1 | unordered
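+    //
+    // Ordered "less than" style comparisons are therefore handled by swapping
+    // the operands and using the unsigned "above" conditions: e.g. SETOLT sets
+    // Flip and falls through to the SETGT handler, so COND_A on the swapped
+    // compare is true exactly when the operands are ordered and the original
+    // LHS is less than the original RHS.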
+    bool Flip = false;
+    switch (SetCCOpcode) {
+    default: break;
+    case ISD::SETUEQ:
+    case ISD::SETEQ: X86CC = X86::COND_E;  break;
+    case ISD::SETOLT: Flip = true; // Fallthrough
+    case ISD::SETOGT:
+    case ISD::SETGT: X86CC = X86::COND_A;  break;
+    case ISD::SETOLE: Flip = true; // Fallthrough
+    case ISD::SETOGE:
+    case ISD::SETGE: X86CC = X86::COND_AE; break;
+    case ISD::SETUGT: Flip = true; // Fallthrough
+    case ISD::SETULT:
+    case ISD::SETLT: X86CC = X86::COND_B;  break;
+    case ISD::SETUGE: Flip = true; // Fallthrough
+    case ISD::SETULE:
+    case ISD::SETLE: X86CC = X86::COND_BE; break;
+    case ISD::SETONE:
+    case ISD::SETNE: X86CC = X86::COND_NE; break;
+    case ISD::SETUO: X86CC = X86::COND_P;  break;
+    case ISD::SETO:  X86CC = X86::COND_NP; break;
+    }
+    if (Flip)
+      std::swap(LHS, RHS);
+  }
+
+  return X86CC != X86::COND_INVALID;
+}
+
+/// hasFPCMov - Is there a floating point cmov for the specific X86 condition
+/// code? The current x86 ISA includes the following FP cmov instructions:
+/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
+static bool hasFPCMov(unsigned X86CC) {
+  switch (X86CC) {
+  default:
+    return false;
+  case X86::COND_B:
+  case X86::COND_BE:
+  case X86::COND_E:
+  case X86::COND_P:
+  case X86::COND_A:
+  case X86::COND_AE:
+  case X86::COND_NE:
+  case X86::COND_NP:
+    return true;
+  }
+}
+
+/// isUndefOrInRange - Op is either an undef node or a ConstantSDNode.  Return
+/// true if Op is undef or if its value falls within the specified range [Low, Hi).
+static bool isUndefOrInRange(SDOperand Op, unsigned Low, unsigned Hi) {
+  if (Op.getOpcode() == ISD::UNDEF)
+    return true;
+
+  unsigned Val = cast<ConstantSDNode>(Op)->getValue();
+  return (Val >= Low && Val < Hi);
+}
+
+/// isUndefOrEqual - Op is either an undef node or a ConstantSDNode.  Return
+/// true if Op is undef or if its value is equal to the specified value.
+static bool isUndefOrEqual(SDOperand Op, unsigned Val) {
+  if (Op.getOpcode() == ISD::UNDEF)
+    return true;
+  return cast<ConstantSDNode>(Op)->getValue() == Val;
+}
+
+/// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to PSHUFD.
+bool X86::isPSHUFDMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  if (N->getNumOperands() != 4)
+    return false;
+
+  // Check that the mask doesn't reference the second vector.
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    if (cast<ConstantSDNode>(Arg)->getValue() >= 4)
+      return false;
+  }
+
+  return true;
+}
+
+/// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to PSHUFHW.
+bool X86::isPSHUFHWMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  if (N->getNumOperands() != 8)
+    return false;
+
+  // Lower quadword copied in order.
+  for (unsigned i = 0; i != 4; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    if (cast<ConstantSDNode>(Arg)->getValue() != i)
+      return false;
+  }
+
+  // Upper quadword shuffled.
+  for (unsigned i = 4; i != 8; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val < 4 || Val > 7)
+      return false;
+  }
+
+  return true;
+}
+
+/// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to PSHUFLW.
+bool X86::isPSHUFLWMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  if (N->getNumOperands() != 8)
+    return false;
+
+  // Upper quadword copied in order.
+  for (unsigned i = 4; i != 8; ++i)
+    if (!isUndefOrEqual(N->getOperand(i), i))
+      return false;
+
+  // Lower quadword shuffled.
+  for (unsigned i = 0; i != 4; ++i)
+    if (!isUndefOrInRange(N->getOperand(i), 0, 4))
+      return false;
+
+  return true;
+}
+
+/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to SHUFP*.
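+/// For example, with four elements the mask <1, 3, 4, 6> is accepted: the low
+/// half of the result selects from the first vector and the high half selects
+/// from the second.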
+static bool isSHUFPMask(const SDOperand *Elems, unsigned NumElems) {
+  if (NumElems != 2 && NumElems != 4) return false;
+
+  unsigned Half = NumElems / 2;
+  for (unsigned i = 0; i < Half; ++i)
+    if (!isUndefOrInRange(Elems[i], 0, NumElems))
+      return false;
+  for (unsigned i = Half; i < NumElems; ++i)
+    if (!isUndefOrInRange(Elems[i], NumElems, NumElems*2))
+      return false;
+
+  return true;
+}
+
+bool X86::isSHUFPMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+  return ::isSHUFPMask(N->op_begin(), N->getNumOperands());
+}
+
+/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
+/// the reverse of what x86 shuffles want. x86 shuffles requires the lower
+/// half elements to come from vector 1 (which would equal the dest.) and
+/// the upper half to come from vector 2.
+static bool isCommutedSHUFP(const SDOperand *Ops, unsigned NumOps) {
+  if (NumOps != 2 && NumOps != 4) return false;
+
+  unsigned Half = NumOps / 2;
+  for (unsigned i = 0; i < Half; ++i)
+    if (!isUndefOrInRange(Ops[i], NumOps, NumOps*2))
+      return false;
+  for (unsigned i = Half; i < NumOps; ++i)
+    if (!isUndefOrInRange(Ops[i], 0, NumOps))
+      return false;
+  return true;
+}
+
+static bool isCommutedSHUFP(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+  return isCommutedSHUFP(N->op_begin(), N->getNumOperands());
+}
+
+/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+bool X86::isMOVHLPSMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  if (N->getNumOperands() != 4)
+    return false;
+
+  // Expect element 0 == 6, element 1 == 7, element 2 == 2, element 3 == 3
+  return isUndefOrEqual(N->getOperand(0), 6) &&
+         isUndefOrEqual(N->getOperand(1), 7) &&
+         isUndefOrEqual(N->getOperand(2), 2) &&
+         isUndefOrEqual(N->getOperand(3), 3);
+}
+
+/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
+/// <2, 3, 2, 3>
+bool X86::isMOVHLPS_v_undef_Mask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  if (N->getNumOperands() != 4)
+    return false;
+
+  // Expect element 0 == 2, element 1 == 3, element 2 == 2, element 3 == 3
+  return isUndefOrEqual(N->getOperand(0), 2) &&
+         isUndefOrEqual(N->getOperand(1), 3) &&
+         isUndefOrEqual(N->getOperand(2), 2) &&
+         isUndefOrEqual(N->getOperand(3), 3);
+}
+
+/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
+bool X86::isMOVLPMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  unsigned NumElems = N->getNumOperands();
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+
+  for (unsigned i = 0; i < NumElems/2; ++i)
+    if (!isUndefOrEqual(N->getOperand(i), i + NumElems))
+      return false;
+
+  for (unsigned i = NumElems/2; i < NumElems; ++i)
+    if (!isUndefOrEqual(N->getOperand(i), i))
+      return false;
+
+  return true;
+}
+
+/// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVHP{S|D}
+/// and MOVLHPS.
+bool X86::isMOVHPMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  unsigned NumElems = N->getNumOperands();
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+
+  for (unsigned i = 0; i < NumElems/2; ++i)
+    if (!isUndefOrEqual(N->getOperand(i), i))
+      return false;
+
+  for (unsigned i = 0; i < NumElems/2; ++i) {
+    SDOperand Arg = N->getOperand(i + NumElems/2);
+    if (!isUndefOrEqual(Arg, i + NumElems))
+      return false;
+  }
+
+  return true;
+}
+
+/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKL.
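+/// For example, with four elements the canonical unpckl mask is <0, 4, 1, 5>,
+/// interleaving the low halves of the two source vectors.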
+bool static isUNPCKLMask(const SDOperand *Elts, unsigned NumElts,
+                         bool V2IsSplat = false) {
+  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+    return false;
+
+  for (unsigned i = 0, j = 0; i != NumElts; i += 2, ++j) {
+    SDOperand BitI  = Elts[i];
+    SDOperand BitI1 = Elts[i+1];
+    if (!isUndefOrEqual(BitI, j))
+      return false;
+    if (V2IsSplat) {
+      if (isUndefOrEqual(BitI1, NumElts))
+        return false;
+    } else {
+      if (!isUndefOrEqual(BitI1, j + NumElts))
+        return false;
+    }
+  }
+
+  return true;
+}
+
+bool X86::isUNPCKLMask(SDNode *N, bool V2IsSplat) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+  return ::isUNPCKLMask(N->op_begin(), N->getNumOperands(), V2IsSplat);
+}
+
+/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKH.
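+/// For example, with four elements the canonical unpckh mask is <2, 6, 3, 7>,
+/// interleaving the high halves of the two source vectors.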
+bool static isUNPCKHMask(const SDOperand *Elts, unsigned NumElts,
+                         bool V2IsSplat = false) {
+  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+    return false;
+
+  for (unsigned i = 0, j = 0; i != NumElts; i += 2, ++j) {
+    SDOperand BitI  = Elts[i];
+    SDOperand BitI1 = Elts[i+1];
+    if (!isUndefOrEqual(BitI, j + NumElts/2))
+      return false;
+    if (V2IsSplat) {
+      if (isUndefOrEqual(BitI1, NumElts))
+        return false;
+    } else {
+      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
+        return false;
+    }
+  }
+
+  return true;
+}
+
+bool X86::isUNPCKHMask(SDNode *N, bool V2IsSplat) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+  return ::isUNPCKHMask(N->op_begin(), N->getNumOperands(), V2IsSplat);
+}
+
+/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
+/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
+/// <0, 0, 1, 1>
+bool X86::isUNPCKL_v_undef_Mask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  unsigned NumElems = N->getNumOperands();
+  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+    return false;
+
+  for (unsigned i = 0, j = 0; i != NumElems; i += 2, ++j) {
+    SDOperand BitI  = N->getOperand(i);
+    SDOperand BitI1 = N->getOperand(i+1);
+
+    if (!isUndefOrEqual(BitI, j))
+      return false;
+    if (!isUndefOrEqual(BitI1, j))
+      return false;
+  }
+
+  return true;
+}
+
+/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+/// <2, 2, 3, 3>
+bool X86::isUNPCKH_v_undef_Mask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  unsigned NumElems = N->getNumOperands();
+  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+    return false;
+
+  for (unsigned i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
+    SDOperand BitI  = N->getOperand(i);
+    SDOperand BitI1 = N->getOperand(i + 1);
+
+    if (!isUndefOrEqual(BitI, j))
+      return false;
+    if (!isUndefOrEqual(BitI1, j))
+      return false;
+  }
+
+  return true;
+}
+
+/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSS,
+/// MOVSD, and MOVD, i.e. setting the lowest element.
+static bool isMOVLMask(const SDOperand *Elts, unsigned NumElts) {
+  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+    return false;
+
+  if (!isUndefOrEqual(Elts[0], NumElts))
+    return false;
+
+  for (unsigned i = 1; i < NumElts; ++i) {
+    if (!isUndefOrEqual(Elts[i], i))
+      return false;
+  }
+
+  return true;
+}
+
+bool X86::isMOVLMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+  return ::isMOVLMask(N->op_begin(), N->getNumOperands());
+}
+
+/// isCommutedMOVL - Returns true if the shuffle mask is the commuted form of
+/// what x86 movss wants: element 0 of the mask refers to the lowest element
+/// of vector 1 and the remaining elements refer to vector 2 in order, so that
+/// swapping the two source vectors yields a MOVL mask.
+static bool isCommutedMOVL(const SDOperand *Ops, unsigned NumOps,
+                           bool V2IsSplat = false,
+                           bool V2IsUndef = false) {
+  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
+    return false;
+
+  if (!isUndefOrEqual(Ops[0], 0))
+    return false;
+
+  for (unsigned i = 1; i < NumOps; ++i) {
+    SDOperand Arg = Ops[i];
+    if (!(isUndefOrEqual(Arg, i+NumOps) ||
+          (V2IsUndef && isUndefOrInRange(Arg, NumOps, NumOps*2)) ||
+          (V2IsSplat && isUndefOrEqual(Arg, NumOps))))
+      return false;
+  }
+
+  return true;
+}
+
+static bool isCommutedMOVL(SDNode *N, bool V2IsSplat = false,
+                           bool V2IsUndef = false) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+  return isCommutedMOVL(N->op_begin(), N->getNumOperands(),
+                        V2IsSplat, V2IsUndef);
+}
+
+/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+bool X86::isMOVSHDUPMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  if (N->getNumOperands() != 4)
+    return false;
+
+  // Expect 1, 1, 3, 3
+  for (unsigned i = 0; i < 2; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val != 1) return false;
+  }
+
+  bool HasHi = false;
+  for (unsigned i = 2; i < 4; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val != 3) return false;
+    HasHi = true;
+  }
+
+  // Don't use movshdup if it can be done with a shufps.
+  return HasHi;
+}
+
+/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+bool X86::isMOVSLDUPMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  if (N->getNumOperands() != 4)
+    return false;
+
+  // Expect 0, 0, 2, 2
+  for (unsigned i = 0; i < 2; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val != 0) return false;
+  }
+
+  bool HasHi = false;
+  for (unsigned i = 2; i < 4; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val != 2) return false;
+    HasHi = true;
+  }
+
+  // Don't use movsldup if it can be done with a shufps.
+  return HasHi;
+}
+
+/// isIdentityMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies an identity operation on the LHS or RHS.
+static bool isIdentityMask(SDNode *N, bool RHS = false) {
+  unsigned NumElems = N->getNumOperands();
+  for (unsigned i = 0; i < NumElems; ++i)
+    if (!isUndefOrEqual(N->getOperand(i), i + (RHS ? NumElems : 0)))
+      return false;
+  return true;
+}
+
+/// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies
+/// a splat of a single element.
+static bool isSplatMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  // This is a splat operation if each element of the permute is the same, and
+  // if the value doesn't reference the second vector.
+  unsigned NumElems = N->getNumOperands();
+  SDOperand ElementBase;
+  unsigned i = 0;
+  for (; i != NumElems; ++i) {
+    SDOperand Elt = N->getOperand(i);
+    if (isa<ConstantSDNode>(Elt)) {
+      ElementBase = Elt;
+      break;
+    }
+  }
+
+  if (!ElementBase.Val)
+    return false;
+
+  for (; i != NumElems; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    if (Arg != ElementBase) return false;
+  }
+
+  // Make sure it is a splat of the first vector operand.
+  return cast<ConstantSDNode>(ElementBase)->getValue() < NumElems;
+}
+
+/// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies
+/// a splat of a single element and it's a 2 or 4 element mask.
+bool X86::isSplatMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  // We can only splat 64-bit and 32-bit quantities with a single instruction.
+  if (N->getNumOperands() != 4 && N->getNumOperands() != 2)
+    return false;
+  return ::isSplatMask(N);
+}
+
+/// isSplatLoMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a splat of element zero.
+bool X86::isSplatLoMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
+    if (!isUndefOrEqual(N->getOperand(i), 0))
+      return false;
+  return true;
+}
+
+/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
+/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
+/// instructions.
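+/// For example, the 4-element mask <3, 1, 2, 0> yields the immediate 0x27
+/// (0b00100111): bits [2i+1:2i] of the immediate select the source element
+/// for result element i.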
+unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
+  unsigned NumOperands = N->getNumOperands();
+  unsigned Shift = (NumOperands == 4) ? 2 : 1;
+  unsigned Mask = 0;
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    unsigned Val = 0;
+    SDOperand Arg = N->getOperand(NumOperands-i-1);
+    if (Arg.getOpcode() != ISD::UNDEF)
+      Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val >= NumOperands) Val -= NumOperands;
+    Mask |= Val;
+    if (i != NumOperands - 1)
+      Mask <<= Shift;
+  }
+
+  return Mask;
+}
+
+/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
+/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
+/// instructions.
+unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
+  unsigned Mask = 0;
+  // 8 nodes, but we only care about the last 4.
+  for (unsigned i = 7; i >= 4; --i) {
+    unsigned Val = 0;
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() != ISD::UNDEF)
+      Val = cast<ConstantSDNode>(Arg)->getValue();
+    Mask |= (Val - 4);
+    if (i != 4)
+      Mask <<= 2;
+  }
+
+  return Mask;
+}
+
+/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
+/// instructions.
+unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
+  unsigned Mask = 0;
+  // 8 nodes, but we only care about the first 4.
+  for (int i = 3; i >= 0; --i) {
+    unsigned Val = 0;
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() != ISD::UNDEF)
+      Val = cast<ConstantSDNode>(Arg)->getValue();
+    Mask |= Val;
+    if (i != 0)
+      Mask <<= 2;
+  }
+
+  return Mask;
+}
+
+/// isPSHUFHW_PSHUFLWMask - true if the specified VECTOR_SHUFFLE operand
+/// specifies an 8 element shuffle that can be broken into a pair of
+/// PSHUFHW and PSHUFLW.
+static bool isPSHUFHW_PSHUFLWMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  if (N->getNumOperands() != 8)
+    return false;
+
+  // Lower quadword shuffled.
+  for (unsigned i = 0; i != 4; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val >= 4)
+      return false;
+  }
+
+  // Upper quadword shuffled.
+  for (unsigned i = 4; i != 8; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val < 4 || Val > 7)
+      return false;
+  }
+
+  return true;
+}
+
+/// CommuteVectorShuffle - Swap vector_shuffle operands as well as the
+/// values in their permute mask.
+static SDOperand CommuteVectorShuffle(SDOperand Op, SDOperand &V1,
+                                      SDOperand &V2, SDOperand &Mask,
+                                      SelectionDAG &DAG) {
+  MVT::ValueType VT = Op.getValueType();
+  MVT::ValueType MaskVT = Mask.getValueType();
+  MVT::ValueType EltVT = MVT::getVectorElementType(MaskVT);
+  unsigned NumElems = Mask.getNumOperands();
+  SmallVector<SDOperand, 8> MaskVec;
+
+  for (unsigned i = 0; i != NumElems; ++i) {
+    SDOperand Arg = Mask.getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) {
+      MaskVec.push_back(DAG.getNode(ISD::UNDEF, EltVT));
+      continue;
+    }
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val < NumElems)
+      MaskVec.push_back(DAG.getConstant(Val + NumElems, EltVT));
+    else
+      MaskVec.push_back(DAG.getConstant(Val - NumElems, EltVT));
+  }
+
+  std::swap(V1, V2);
+  Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
+  return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
+}
+
+/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
+/// match movhlps. The lower half elements should come from the upper half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order).
+static bool ShouldXformToMOVHLPS(SDNode *Mask) {
+  unsigned NumElems = Mask->getNumOperands();
+  if (NumElems != 4)
+    return false;
+  for (unsigned i = 0, e = 2; i != e; ++i)
+    if (!isUndefOrEqual(Mask->getOperand(i), i+2))
+      return false;
+  for (unsigned i = 2; i != 4; ++i)
+    if (!isUndefOrEqual(Mask->getOperand(i), i+4))
+      return false;
+  return true;
+}
+
+/// isScalarLoadToVector - Returns true if the node is a scalar load that
+/// is promoted to a vector.
+static inline bool isScalarLoadToVector(SDNode *N) {
+  if (N->getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    N = N->getOperand(0).Val;
+    return ISD::isNON_EXTLoad(N);
+  }
+  return false;
+}
+
+/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
+/// match movlp{s|d}. The lower half elements should come from the lower half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order). And since V1 will become the source of the
+/// MOVLP, it must be either a vector load or a scalar load to vector.
+static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, SDNode *Mask) {
+  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
+    return false;
+  // If V2 is a vector load, don't do this transformation; we will try to fold
+  // the load into a shufps op instead.
+  if (ISD::isNON_EXTLoad(V2))
+    return false;
+
+  unsigned NumElems = Mask->getNumOperands();
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+    if (!isUndefOrEqual(Mask->getOperand(i), i))
+      return false;
+  for (unsigned i = NumElems/2; i != NumElems; ++i)
+    if (!isUndefOrEqual(Mask->getOperand(i), i+NumElems))
+      return false;
+  return true;
+}
+
+/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
+/// all the same.
+static bool isSplatVector(SDNode *N) {
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  SDOperand SplatValue = N->getOperand(0);
+  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
+    if (N->getOperand(i) != SplatValue)
+      return false;
+  return true;
+}
+
+/// isUndefShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
+/// to an undef.
+static bool isUndefShuffle(SDNode *N) {
+  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
+    return false;
+
+  SDOperand V1 = N->getOperand(0);
+  SDOperand V2 = N->getOperand(1);
+  SDOperand Mask = N->getOperand(2);
+  unsigned NumElems = Mask.getNumOperands();
+  for (unsigned i = 0; i != NumElems; ++i) {
+    SDOperand Arg = Mask.getOperand(i);
+    if (Arg.getOpcode() != ISD::UNDEF) {
+      unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+      if (Val < NumElems && V1.getOpcode() != ISD::UNDEF)
+        return false;
+      else if (Val >= NumElems && V2.getOpcode() != ISD::UNDEF)
+        return false;
+    }
+  }
+  return true;
+}
+
+/// isZeroNode - Returns true if Elt is a constant zero or a floating point
+/// constant +0.0.
+static inline bool isZeroNode(SDOperand Elt) {
+  return ((isa<ConstantSDNode>(Elt) &&
+           cast<ConstantSDNode>(Elt)->getValue() == 0) ||
+          (isa<ConstantFPSDNode>(Elt) &&
+           cast<ConstantFPSDNode>(Elt)->isExactlyValue(0.0)));
+}
+
+/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
+/// to a zero vector.
+static bool isZeroShuffle(SDNode *N) {
+  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
+    return false;
+
+  SDOperand V1 = N->getOperand(0);
+  SDOperand V2 = N->getOperand(1);
+  SDOperand Mask = N->getOperand(2);
+  unsigned NumElems = Mask.getNumOperands();
+  for (unsigned i = 0; i != NumElems; ++i) {
+    SDOperand Arg = Mask.getOperand(i);
+    if (Arg.getOpcode() != ISD::UNDEF) {
+      unsigned Idx = cast<ConstantSDNode>(Arg)->getValue();
+      if (Idx < NumElems) {
+        unsigned Opc = V1.Val->getOpcode();
+        if (Opc == ISD::UNDEF)
+          continue;
+        if (Opc != ISD::BUILD_VECTOR ||
+            !isZeroNode(V1.Val->getOperand(Idx)))
+          return false;
+      } else if (Idx >= NumElems) {
+        unsigned Opc = V2.Val->getOpcode();
+        if (Opc == ISD::UNDEF)
+          continue;
+        if (Opc != ISD::BUILD_VECTOR ||
+            !isZeroNode(V2.Val->getOperand(Idx - NumElems)))
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+///
+static SDOperand getZeroVector(MVT::ValueType VT, SelectionDAG &DAG) {
+  assert(MVT::isVector(VT) && "Expected a vector type");
+  unsigned NumElems = MVT::getVectorNumElements(VT);
+  MVT::ValueType EVT = MVT::getVectorElementType(VT);
+  bool isFP = MVT::isFloatingPoint(EVT);
+  SDOperand Zero = isFP ? DAG.getConstantFP(0.0, EVT) : DAG.getConstant(0, EVT);
+  SmallVector<SDOperand, 8> ZeroVec(NumElems, Zero);
+  return DAG.getNode(ISD::BUILD_VECTOR, VT, &ZeroVec[0], ZeroVec.size());
+}
+
+/// NormalizeMask - V2 is a splat; modify the mask (if needed) so all elements
+/// that point to V2 point to its first element.
+static SDOperand NormalizeMask(SDOperand Mask, SelectionDAG &DAG) {
+  assert(Mask.getOpcode() == ISD::BUILD_VECTOR);
+
+  bool Changed = false;
+  SmallVector<SDOperand, 8> MaskVec;
+  unsigned NumElems = Mask.getNumOperands();
+  for (unsigned i = 0; i != NumElems; ++i) {
+    SDOperand Arg = Mask.getOperand(i);
+    if (Arg.getOpcode() != ISD::UNDEF) {
+      unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+      if (Val > NumElems) {
+        Arg = DAG.getConstant(NumElems, Arg.getValueType());
+        Changed = true;
+      }
+    }
+    MaskVec.push_back(Arg);
+  }
+
+  if (Changed)
+    Mask = DAG.getNode(ISD::BUILD_VECTOR, Mask.getValueType(),
+                       &MaskVec[0], MaskVec.size());
+  return Mask;
+}
+
+/// getMOVLMask - Returns a vector_shuffle mask for a movs{s|d} or movd
+/// operation of specified width.
+static SDOperand getMOVLMask(unsigned NumElems, SelectionDAG &DAG) {
+  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+  MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+
+  SmallVector<SDOperand, 8> MaskVec;
+  MaskVec.push_back(DAG.getConstant(NumElems, BaseVT));
+  for (unsigned i = 1; i != NumElems; ++i)
+    MaskVec.push_back(DAG.getConstant(i, BaseVT));
+  return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
+}
+
+/// getUnpacklMask - Returns a vector_shuffle mask for an unpackl operation
+/// of specified width.
+static SDOperand getUnpacklMask(unsigned NumElems, SelectionDAG &DAG) {
+  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+  MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+  SmallVector<SDOperand, 8> MaskVec;
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
+    MaskVec.push_back(DAG.getConstant(i,            BaseVT));
+    MaskVec.push_back(DAG.getConstant(i + NumElems, BaseVT));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
+}
+
+/// getUnpackhMask - Returns a vector_shuffle mask for an unpackh operation
+/// of specified width.
+static SDOperand getUnpackhMask(unsigned NumElems, SelectionDAG &DAG) {
+  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+  MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+  unsigned Half = NumElems/2;
+  SmallVector<SDOperand, 8> MaskVec;
+  for (unsigned i = 0; i != Half; ++i) {
+    MaskVec.push_back(DAG.getConstant(i + Half,            BaseVT));
+    MaskVec.push_back(DAG.getConstant(i + NumElems + Half, BaseVT));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
+}
+
+/// PromoteSplat - Promote a splat of v8i16 or v16i8 to v4i32.
+///
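+/// The vector is repeatedly shuffled with itself using an unpackl mask to
+/// widen the splat value, bitcast to v4i32, splatted from element zero with an
+/// all-zero shuffle mask, and finally bitcast back to the original type.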
+static SDOperand PromoteSplat(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand V1 = Op.getOperand(0);
+  SDOperand Mask = Op.getOperand(2);
+  MVT::ValueType VT = Op.getValueType();
+  unsigned NumElems = Mask.getNumOperands();
+  Mask = getUnpacklMask(NumElems, DAG);
+  while (NumElems != 4) {
+    V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, Mask);
+    NumElems >>= 1;
+  }
+  V1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, V1);
+
+  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
+  Mask = getZeroVector(MaskVT, DAG);
+  SDOperand Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v4i32, V1,
+                                  DAG.getNode(ISD::UNDEF, MVT::v4i32), Mask);
+  return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle);
+}
+
+/// getShuffleVectorZeroOrUndef - Return a vector_shuffle that places element 0
+/// of the specified vector at index Idx and fills the remaining elements from
+/// either a zero vector or an undef vector.
+static SDOperand getShuffleVectorZeroOrUndef(SDOperand V2, MVT::ValueType VT,
+                                             unsigned NumElems, unsigned Idx,
+                                             bool isZero, SelectionDAG &DAG) {
+  SDOperand V1 = isZero ? getZeroVector(VT, DAG) : DAG.getNode(ISD::UNDEF, VT);
+  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+  MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
+  SDOperand Zero = DAG.getConstant(0, EVT);
+  SmallVector<SDOperand, 8> MaskVec(NumElems, Zero);
+  MaskVec[Idx] = DAG.getConstant(NumElems, EVT);
+  SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                               &MaskVec[0], MaskVec.size());
+  return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
+}
+
+/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
+///
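+/// Non-zero bytes are zero-extended to i16; adjacent byte pairs are combined
+/// (the odd byte in the high half, the even byte in the low half), inserted
+/// into a v8i16, and the result is bitcast back to v16i8 at the end.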
+static SDOperand LowerBuildVectorv16i8(SDOperand Op, unsigned NonZeros,
+                                       unsigned NumNonZero, unsigned NumZero,
+                                       SelectionDAG &DAG, TargetLowering &TLI) {
+  if (NumNonZero > 8)
+    return SDOperand();
+
+  SDOperand V(0, 0);
+  bool First = true;
+  for (unsigned i = 0; i < 16; ++i) {
+    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
+    if (ThisIsNonZero && First) {
+      if (NumZero)
+        V = getZeroVector(MVT::v8i16, DAG);
+      else
+        V = DAG.getNode(ISD::UNDEF, MVT::v8i16);
+      First = false;
+    }
+
+    if ((i & 1) != 0) {
+      SDOperand ThisElt(0, 0), LastElt(0, 0);
+      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
+      if (LastIsNonZero) {
+        LastElt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Op.getOperand(i-1));
+      }
+      if (ThisIsNonZero) {
+        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Op.getOperand(i));
+        ThisElt = DAG.getNode(ISD::SHL, MVT::i16,
+                              ThisElt, DAG.getConstant(8, MVT::i8));
+        if (LastIsNonZero)
+          ThisElt = DAG.getNode(ISD::OR, MVT::i16, ThisElt, LastElt);
+      } else
+        ThisElt = LastElt;
+
+      if (ThisElt.Val)
+        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V, ThisElt,
+                        DAG.getConstant(i/2, TLI.getPointerTy()));
+    }
+  }
+
+  return DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, V);
+}
+
+/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
+///
+static SDOperand LowerBuildVectorv8i16(SDOperand Op, unsigned NonZeros,
+                                       unsigned NumNonZero, unsigned NumZero,
+                                       SelectionDAG &DAG, TargetLowering &TLI) {
+  if (NumNonZero > 4)
+    return SDOperand();
+
+  SDOperand V(0, 0);
+  bool First = true;
+  for (unsigned i = 0; i < 8; ++i) {
+    bool isNonZero = (NonZeros & (1 << i)) != 0;
+    if (isNonZero) {
+      if (First) {
+        if (NumZero)
+          V = getZeroVector(MVT::v8i16, DAG);
+        else
+          V = DAG.getNode(ISD::UNDEF, MVT::v8i16);
+        First = false;
+      }
+      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V, Op.getOperand(i),
+                      DAG.getConstant(i, TLI.getPointerTy()));
+    }
+  }
+
+  return V;
+}
+
+SDOperand
+X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
+  // All zeros are handled with pxor.
+  if (ISD::isBuildVectorAllZeros(Op.Val))
+    return Op;
+
+  // All ones are handled with pcmpeqd.
+  if (ISD::isBuildVectorAllOnes(Op.Val))
+    return Op;
+
+  MVT::ValueType VT = Op.getValueType();
+  MVT::ValueType EVT = MVT::getVectorElementType(VT);
+  unsigned EVTBits = MVT::getSizeInBits(EVT);
+
+  unsigned NumElems = Op.getNumOperands();
+  unsigned NumZero  = 0;
+  unsigned NumNonZero = 0;
+  unsigned NonZeros = 0;
+  std::set<SDOperand> Values;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDOperand Elt = Op.getOperand(i);
+    if (Elt.getOpcode() != ISD::UNDEF) {
+      Values.insert(Elt);
+      if (isZeroNode(Elt))
+        NumZero++;
+      else {
+        NonZeros |= (1 << i);
+        NumNonZero++;
+      }
+    }
+  }
+
+  if (NumNonZero == 0) {
+    if (NumZero == 0)
+      // All undef vector. Return an UNDEF.
+      return DAG.getNode(ISD::UNDEF, VT);
+    else
+      // A mix of zero and undef. Return a zero vector.
+      return getZeroVector(VT, DAG);
+  }
+
+  // Splat is obviously ok. Let legalizer expand it to a shuffle.
+  if (Values.size() == 1)
+    return SDOperand();
+
+  // Special case for single non-zero element.
+  if (NumNonZero == 1) {
+    unsigned Idx = CountTrailingZeros_32(NonZeros);
+    SDOperand Item = Op.getOperand(Idx);
+    Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Item);
+    if (Idx == 0)
+      // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
+      return getShuffleVectorZeroOrUndef(Item, VT, NumElems, Idx,
+                                         NumZero > 0, DAG);
+
+    if (EVTBits == 32) {
+      // Turn it into a shuffle of zero and zero-extended scalar to vector.
+      Item = getShuffleVectorZeroOrUndef(Item, VT, NumElems, 0, NumZero > 0,
+                                         DAG);
+      MVT::ValueType MaskVT  = MVT::getIntVectorWithNumElements(NumElems);
+      MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
+      SmallVector<SDOperand, 8> MaskVec;
+      for (unsigned i = 0; i < NumElems; i++)
+        MaskVec.push_back(DAG.getConstant((i == Idx) ? 0 : 1, MaskEVT));
+      SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                   &MaskVec[0], MaskVec.size());
+      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, Item,
+                         DAG.getNode(ISD::UNDEF, VT), Mask);
+    }
+  }
+
+  // Let legalizer expand 2-wide build_vectors.
+  if (EVTBits == 64)
+    return SDOperand();
+
+  // If element VT is < 32 bits, convert it to inserts into a zero vector.
+  if (EVTBits == 8 && NumElems == 16) {
+    SDOperand V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
+                                        *this);
+    if (V.Val) return V;
+  }
+
+  if (EVTBits == 16 && NumElems == 8) {
+    SDOperand V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
+                                        *this);
+    if (V.Val) return V;
+  }
+
+  // If element VT is == 32 bits, turn it into a number of shuffles.
+  SmallVector<SDOperand, 8> V;
+  V.resize(NumElems);
+  if (NumElems == 4 && NumZero > 0) {
+    for (unsigned i = 0; i < 4; ++i) {
+      bool isZero = !(NonZeros & (1 << i));
+      if (isZero)
+        V[i] = getZeroVector(VT, DAG);
+      else
+        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i));
+    }
+
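+    // Merge each pair V[2i], V[2i+1] into V[i]: reuse the zero vector when
+    // both are zero, use a MOVL-style shuffle to drop a single non-zero
+    // element into the low lane of the zero vector, or unpack both together
+    // when neither is zero.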
+    for (unsigned i = 0; i < 2; ++i) {
+      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
+        default: break;
+        case 0:
+          V[i] = V[i*2];  // Must be a zero vector.
+          break;
+        case 1:
+          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2+1], V[i*2],
+                             getMOVLMask(NumElems, DAG));
+          break;
+        case 2:
+          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1],
+                             getMOVLMask(NumElems, DAG));
+          break;
+        case 3:
+          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1],
+                             getUnpacklMask(NumElems, DAG));
+          break;
+      }
+    }
+
+    // Take advantage of the fact that a GR32 to VR128 scalar_to_vector
+    // (i.e. movd) clears the upper bits.
+    // FIXME: we can do the same for the v4f32 case when we know both parts of
+    // the lower half come from scalar_to_vector (loadf32). We should do
+    // that in the post-legalizer DAG combiner with target-specific hooks.
+    if (MVT::isInteger(EVT) && (NonZeros & (0x3 << 2)) == 0)
+      return V[0];
+    MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+    MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
+    SmallVector<SDOperand, 8> MaskVec;
+    bool Reverse = (NonZeros & 0x3) == 2;
+    for (unsigned i = 0; i < 2; ++i)
+      if (Reverse)
+        MaskVec.push_back(DAG.getConstant(1-i, EVT));
+      else
+        MaskVec.push_back(DAG.getConstant(i, EVT));
+    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
+    for (unsigned i = 0; i < 2; ++i)
+      if (Reverse)
+        MaskVec.push_back(DAG.getConstant(1-i+NumElems, EVT));
+      else
+        MaskVec.push_back(DAG.getConstant(i+NumElems, EVT));
+    SDOperand ShufMask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                     &MaskVec[0], MaskVec.size());
+    return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[0], V[1], ShufMask);
+  }
+
+  if (Values.size() > 2) {
+    // Expand into a number of unpckl*.
+    // e.g. for v4f32
+    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
+    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
+    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
+    SDOperand UnpckMask = getUnpacklMask(NumElems, DAG);
+    for (unsigned i = 0; i < NumElems; ++i)
+      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i));
+    NumElems >>= 1;
+    while (NumElems != 0) {
+      for (unsigned i = 0; i < NumElems; ++i)
+        V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i], V[i + NumElems],
+                           UnpckMask);
+      NumElems >>= 1;
+    }
+    return V[0];
+  }
+
+  return SDOperand();
+}
+
+SDOperand
+X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand V1 = Op.getOperand(0);
+  SDOperand V2 = Op.getOperand(1);
+  SDOperand PermMask = Op.getOperand(2);
+  MVT::ValueType VT = Op.getValueType();
+  unsigned NumElems = PermMask.getNumOperands();
+  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+  bool V1IsSplat = false;
+  bool V2IsSplat = false;
+
+  if (isUndefShuffle(Op.Val))
+    return DAG.getNode(ISD::UNDEF, VT);
+
+  if (isZeroShuffle(Op.Val))
+    return getZeroVector(VT, DAG);
+
+  if (isIdentityMask(PermMask.Val))
+    return V1;
+  else if (isIdentityMask(PermMask.Val, true))
+    return V2;
+
+  if (isSplatMask(PermMask.Val)) {
+    if (NumElems <= 4) return Op;
+    // Promote it to a v4i32 splat.
+    return PromoteSplat(Op, DAG);
+  }
+
+  if (X86::isMOVLMask(PermMask.Val))
+    return (V1IsUndef) ? V2 : Op;
+
+  if (X86::isMOVSHDUPMask(PermMask.Val) ||
+      X86::isMOVSLDUPMask(PermMask.Val) ||
+      X86::isMOVHLPSMask(PermMask.Val) ||
+      X86::isMOVHPMask(PermMask.Val) ||
+      X86::isMOVLPMask(PermMask.Val))
+    return Op;
+
+  if (ShouldXformToMOVHLPS(PermMask.Val) ||
+      ShouldXformToMOVLP(V1.Val, V2.Val, PermMask.Val))
+    return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
+
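+  // If V1 is a splat (or undef) but V2 is not, commute the operands so the
+  // splat ends up on the V2 side, which the MOVL/unpck matching below expects.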
+  bool Commuted = false;
+  V1IsSplat = isSplatVector(V1.Val);
+  V2IsSplat = isSplatVector(V2.Val);
+  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
+    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
+    std::swap(V1IsSplat, V2IsSplat);
+    std::swap(V1IsUndef, V2IsUndef);
+    Commuted = true;
+  }
+
+  if (isCommutedMOVL(PermMask.Val, V2IsSplat, V2IsUndef)) {
+    if (V2IsUndef) return V1;
+    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
+    if (V2IsSplat) {
+      // V2 is a splat, so the mask may be malformed; that is, it may point
+      // to any V2 element. The instruction selector won't like this. Get
+      // a corrected mask and commute to form a proper MOVS{S|D}.
+      SDOperand NewMask = getMOVLMask(NumElems, DAG);
+      if (NewMask.Val != PermMask.Val)
+        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
+    }
+    return Op;
+  }
+
+  if (X86::isUNPCKL_v_undef_Mask(PermMask.Val) ||
+      X86::isUNPCKH_v_undef_Mask(PermMask.Val) ||
+      X86::isUNPCKLMask(PermMask.Val) ||
+      X86::isUNPCKHMask(PermMask.Val))
+    return Op;
+
+  if (V2IsSplat) {
+    // Normalize the mask so that all entries pointing to V2 point to its
+    // first element, then try to match unpck{h|l} again. If it matches,
+    // return a new vector_shuffle with the corrected mask.
+    SDOperand NewMask = NormalizeMask(PermMask, DAG);
+    if (NewMask.Val != PermMask.Val) {
+      if (X86::isUNPCKLMask(PermMask.Val, true)) {
+        SDOperand NewMask = getUnpacklMask(NumElems, DAG);
+        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
+      } else if (X86::isUNPCKHMask(PermMask.Val, true)) {
+        SDOperand NewMask = getUnpackhMask(NumElems, DAG);
+        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
+      }
+    }
+  }
+
+  // Normalize the node to match x86 shuffle ops if needed
+  if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(PermMask.Val))
+      Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
+
+  if (Commuted) {
+    // Commute it back and try unpck* again.
+    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
+    if (X86::isUNPCKL_v_undef_Mask(PermMask.Val) ||
+        X86::isUNPCKH_v_undef_Mask(PermMask.Val) ||
+        X86::isUNPCKLMask(PermMask.Val) ||
+        X86::isUNPCKHMask(PermMask.Val))
+      return Op;
+  }
+
+  // If VT is integer, try PSHUF* first, then SHUFP*.
+  if (MVT::isInteger(VT)) {
+    if (X86::isPSHUFDMask(PermMask.Val) ||
+        X86::isPSHUFHWMask(PermMask.Val) ||
+        X86::isPSHUFLWMask(PermMask.Val)) {
+      if (V2.getOpcode() != ISD::UNDEF)
+        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1,
+                           DAG.getNode(ISD::UNDEF, V1.getValueType()),PermMask);
+      return Op;
+    }
+
+    if (X86::isSHUFPMask(PermMask.Val) &&
+        MVT::getSizeInBits(VT) != 64)    // Don't do this for MMX.
+      return Op;
+
+    // Handle a v8i16 shuffle by splitting it into a shuffle-low / shuffle-high
+    // node pair.
+    if (VT == MVT::v8i16 && isPSHUFHW_PSHUFLWMask(PermMask.Val)) {
+      MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+      MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+      SmallVector<SDOperand, 8> MaskVec;
+      for (unsigned i = 0; i != 4; ++i)
+        MaskVec.push_back(PermMask.getOperand(i));
+      for (unsigned i = 4; i != 8; ++i)
+        MaskVec.push_back(DAG.getConstant(i, BaseVT));
+      SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                   &MaskVec[0], MaskVec.size());
+      V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
+      MaskVec.clear();
+      for (unsigned i = 0; i != 4; ++i)
+        MaskVec.push_back(DAG.getConstant(i, BaseVT));
+      for (unsigned i = 4; i != 8; ++i)
+        MaskVec.push_back(PermMask.getOperand(i));
+      Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0],MaskVec.size());
+      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
+    }
+  } else {
+    // Floating point cases in the other order.
+    if (X86::isSHUFPMask(PermMask.Val))
+      return Op;
+    if (X86::isPSHUFDMask(PermMask.Val) ||
+        X86::isPSHUFHWMask(PermMask.Val) ||
+        X86::isPSHUFLWMask(PermMask.Val)) {
+      if (V2.getOpcode() != ISD::UNDEF)
+        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1,
+                           DAG.getNode(ISD::UNDEF, V1.getValueType()),PermMask);
+      return Op;
+    }
+  }
+
+  if (NumElems == 4 && 
+      // Don't do this for MMX.
+      MVT::getSizeInBits(VT) != 64) {
+    MVT::ValueType MaskVT = PermMask.getValueType();
+    MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
+    SmallVector<std::pair<int, int>, 8> Locs;
+    Locs.reserve(NumElems);
+    SmallVector<SDOperand, 8> Mask1(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
+    SmallVector<SDOperand, 8> Mask2(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
+    unsigned NumHi = 0;
+    unsigned NumLo = 0;
+    // If no more than two elements come from either vector, this can be
+    // implemented with two shuffles. The first shuffle gathers the elements.
+    // The second shuffle, which takes the first shuffle as both of its
+    // vector operands, puts the elements into the right order.
+    for (unsigned i = 0; i != NumElems; ++i) {
+      SDOperand Elt = PermMask.getOperand(i);
+      if (Elt.getOpcode() == ISD::UNDEF) {
+        Locs[i] = std::make_pair(-1, -1);
+      } else {
+        unsigned Val = cast<ConstantSDNode>(Elt)->getValue();
+        if (Val < NumElems) {
+          Locs[i] = std::make_pair(0, NumLo);
+          Mask1[NumLo] = Elt;
+          NumLo++;
+        } else {
+          Locs[i] = std::make_pair(1, NumHi);
+          if (2+NumHi < NumElems)
+            Mask1[2+NumHi] = Elt;
+          NumHi++;
+        }
+      }
+    }
+    if (NumLo <= 2 && NumHi <= 2) {
+      V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
+                       DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                   &Mask1[0], Mask1.size()));
+      for (unsigned i = 0; i != NumElems; ++i) {
+        if (Locs[i].first == -1)
+          continue;
+        else {
+          unsigned Idx = (i < NumElems/2) ? 0 : NumElems;
+          Idx += Locs[i].first * (NumElems/2) + Locs[i].second;
+          Mask2[i] = DAG.getConstant(Idx, MaskEVT);
+        }
+      }
+
+      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1,
+                         DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                     &Mask2[0], Mask2.size()));
+    }
+
+    // Break it into (shuffle shuffle_hi, shuffle_lo).
+    Locs.clear();
+    SmallVector<SDOperand,8> LoMask(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
+    SmallVector<SDOperand,8> HiMask(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
+    SmallVector<SDOperand,8> *MaskPtr = &LoMask;
+    unsigned MaskIdx = 0;
+    unsigned LoIdx = 0;
+    unsigned HiIdx = NumElems/2;
+    for (unsigned i = 0; i != NumElems; ++i) {
+      if (i == NumElems/2) {
+        MaskPtr = &HiMask;
+        MaskIdx = 1;
+        LoIdx = 0;
+        HiIdx = NumElems/2;
+      }
+      SDOperand Elt = PermMask.getOperand(i);
+      if (Elt.getOpcode() == ISD::UNDEF) {
+        Locs[i] = std::make_pair(-1, -1);
+      } else if (cast<ConstantSDNode>(Elt)->getValue() < NumElems) {
+        Locs[i] = std::make_pair(MaskIdx, LoIdx);
+        (*MaskPtr)[LoIdx] = Elt;
+        LoIdx++;
+      } else {
+        Locs[i] = std::make_pair(MaskIdx, HiIdx);
+        (*MaskPtr)[HiIdx] = Elt;
+        HiIdx++;
+      }
+    }
+
+    SDOperand LoShuffle =
+      DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
+                  DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                              &LoMask[0], LoMask.size()));
+    SDOperand HiShuffle =
+      DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
+                  DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                              &HiMask[0], HiMask.size()));
+    SmallVector<SDOperand, 8> MaskOps;
+    for (unsigned i = 0; i != NumElems; ++i) {
+      if (Locs[i].first == -1) {
+        MaskOps.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
+      } else {
+        unsigned Idx = Locs[i].first * NumElems + Locs[i].second;
+        MaskOps.push_back(DAG.getConstant(Idx, MaskEVT));
+      }
+    }
+    return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, LoShuffle, HiShuffle,
+                       DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                   &MaskOps[0], MaskOps.size()));
+  }
+
+  return SDOperand();
+}
+
+SDOperand
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
+  if (!isa<ConstantSDNode>(Op.getOperand(1)))
+    return SDOperand();
+
+  MVT::ValueType VT = Op.getValueType();
+  // TODO: handle v16i8.
+  if (MVT::getSizeInBits(VT) == 16) {
+    // Transform it so it matches pextrw, which produces a 32-bit result.
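+    // (VT+1) relies on the MVT::ValueType enum ordering, where the next
+    // integer type after i16 is i32.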
+    MVT::ValueType EVT = (MVT::ValueType)(VT+1);
+    SDOperand Extract = DAG.getNode(X86ISD::PEXTRW, EVT,
+                                    Op.getOperand(0), Op.getOperand(1));
+    SDOperand Assert  = DAG.getNode(ISD::AssertZext, EVT, Extract,
+                                    DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, VT, Assert);
+  } else if (MVT::getSizeInBits(VT) == 32) {
+    SDOperand Vec = Op.getOperand(0);
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+    if (Idx == 0)
+      return Op;
+    // SHUFPS the element to the lowest double word, then movss.
+    MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
+    SmallVector<SDOperand, 8> IdxVec;
+    IdxVec.push_back(DAG.getConstant(Idx, MVT::getVectorElementType(MaskVT)));
+    IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+    IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+    IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+    SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                 &IdxVec[0], IdxVec.size());
+    Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
+                      Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
+                       DAG.getConstant(0, getPointerTy()));
+  } else if (MVT::getSizeInBits(VT) == 64) {
+    SDOperand Vec = Op.getOperand(0);
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+    if (Idx == 0)
+      return Op;
+
+    // UNPCKHPD the element to the lowest double word, then movsd.
+    // Note that if the lower 64 bits of the UNPCKHPD result are then stored
+    // to an f64mem, the whole operation is folded into a single MOVHPDmr.
+    MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
+    SmallVector<SDOperand, 8> IdxVec;
+    IdxVec.push_back(DAG.getConstant(1, MVT::getVectorElementType(MaskVT)));
+    IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+    SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                 &IdxVec[0], IdxVec.size());
+    Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
+                      Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
+                       DAG.getConstant(0, getPointerTy()));
+  }
+
+  return SDOperand();
+}
+
+SDOperand
+X86TargetLowering::LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
+  // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
+  // as its second argument.
+  MVT::ValueType VT = Op.getValueType();
+  MVT::ValueType BaseVT = MVT::getVectorElementType(VT);
+  SDOperand N0 = Op.getOperand(0);
+  SDOperand N1 = Op.getOperand(1);
+  SDOperand N2 = Op.getOperand(2);
+  if (MVT::getSizeInBits(BaseVT) == 16) {
+    if (N1.getValueType() != MVT::i32)
+      N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1);
+    if (N2.getValueType() != MVT::i32)
+      N2 = DAG.getConstant(cast<ConstantSDNode>(N2)->getValue(),getPointerTy());
+    return DAG.getNode(X86ISD::PINSRW, VT, N0, N1, N2);
+  } else if (MVT::getSizeInBits(BaseVT) == 32) {
+    unsigned Idx = cast<ConstantSDNode>(N2)->getValue();
+    if (Idx == 0) {
+      // Use a movss.
+      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, N1);
+      MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
+      MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+      SmallVector<SDOperand, 8> MaskVec;
+      MaskVec.push_back(DAG.getConstant(4, BaseVT));
+      for (unsigned i = 1; i <= 3; ++i)
+        MaskVec.push_back(DAG.getConstant(i, BaseVT));
+      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, N0, N1,
+                         DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                     &MaskVec[0], MaskVec.size()));
+    } else {
+      // Use two pinsrw instructions to insert a 32-bit value.
+      Idx <<= 1;
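+      // Each 32-bit element covers two 16-bit elements, so double the index
+      // for the v8i16 view of the vector.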
+      if (MVT::isFloatingPoint(N1.getValueType())) {
+        if (ISD::isNON_EXTLoad(N1.Val)) {
+          // Just load directly from f32mem to GR32.
+          LoadSDNode *LD = cast<LoadSDNode>(N1);
+          N1 = DAG.getLoad(MVT::i32, LD->getChain(), LD->getBasePtr(),
+                           LD->getSrcValue(), LD->getSrcValueOffset());
+        } else {
+          N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v4f32, N1);
+          N1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, N1);
+          N1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32, N1,
+                           DAG.getConstant(0, getPointerTy()));
+        }
+      }
+      N0 = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, N0);
+      N0 = DAG.getNode(X86ISD::PINSRW, MVT::v8i16, N0, N1,
+                       DAG.getConstant(Idx, getPointerTy()));
+      N1 = DAG.getNode(ISD::SRL, MVT::i32, N1, DAG.getConstant(16, MVT::i8));
+      N0 = DAG.getNode(X86ISD::PINSRW, MVT::v8i16, N0, N1,
+                       DAG.getConstant(Idx+1, getPointerTy()));
+      return DAG.getNode(ISD::BIT_CONVERT, VT, N0);
+    }
+  }
+
+  return SDOperand();
+}
+
+SDOperand
+X86TargetLowering::LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand AnyExt = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, Op.getOperand(0));
+  return DAG.getNode(X86ISD::S2VEC, Op.getValueType(), AnyExt);
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// one of the above-mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form an addressing mode. These wrapped nodes will be selected
+// into MOV32ri.
+SDOperand
+X86TargetLowering::LowerConstantPool(SDOperand Op, SelectionDAG &DAG) {
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  SDOperand Result = DAG.getTargetConstantPool(CP->getConstVal(),
+                                               getPointerTy(),
+                                               CP->getAlignment());
+  Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
+  // With PIC, the address is actually $g + Offset.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      !Subtarget->isPICStyleRIPRel()) {
+    Result = DAG.getNode(ISD::ADD, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+                         Result);
+  }
+
+  return Result;
+}
+
+SDOperand
+X86TargetLowering::LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG) {
+  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  SDOperand Result = DAG.getTargetGlobalAddress(GV, getPointerTy());
+  Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
+  // With PIC, the address is actually $g + Offset.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      !Subtarget->isPICStyleRIPRel()) {
+    Result = DAG.getNode(ISD::ADD, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+                         Result);
+  }
+  
+  // For Darwin & Mingw32, external and weak symbols are indirect, so we want to
+  // load the value at address GV, not the value of GV itself. This means that
+  // the GlobalAddress must be in the base or index register of the address, not
+  // the GV offset field. The platform check is inside the GVRequiresExtraLoad()
+  // call. The same applies to external symbols during PIC codegen.
+  if (Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false))
+    Result = DAG.getLoad(getPointerTy(), DAG.getEntryNode(), Result, NULL, 0);
+
+  return Result;
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model
+static SDOperand
+LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                              const MVT::ValueType PtrVT) {
+  SDOperand InFlag;
+  SDOperand Chain = DAG.getCopyToReg(DAG.getEntryNode(), X86::EBX,
+                                     DAG.getNode(X86ISD::GlobalBaseReg,
+                                                 PtrVT), InFlag);
+  InFlag = Chain.getValue(1);
+
+  // emit leal symbol@TLSGD(,%ebx,1), %eax
+  SDVTList NodeTys = DAG.getVTList(PtrVT, MVT::Other, MVT::Flag);
+  SDOperand TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
+                                             GA->getValueType(0),
+                                             GA->getOffset());
+  SDOperand Ops[] = { Chain,  TGA, InFlag };
+  SDOperand Result = DAG.getNode(X86ISD::TLSADDR, NodeTys, Ops, 3);
+  InFlag = Result.getValue(2);
+  Chain = Result.getValue(1);
+
+  // call ___tls_get_addr. This function receives its argument in
+  // the register EAX.
+  Chain = DAG.getCopyToReg(Chain, X86::EAX, Result, InFlag);
+  InFlag = Chain.getValue(1);
+
+  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SDOperand Ops1[] = { Chain,
+                      DAG.getTargetExternalSymbol("___tls_get_addr",
+                                                  PtrVT),
+                      DAG.getRegister(X86::EAX, PtrVT),
+                      DAG.getRegister(X86::EBX, PtrVT),
+                      InFlag };
+  Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops1, 5);
+  InFlag = Chain.getValue(1);
+
+  return DAG.getCopyFromReg(Chain, X86::EAX, PtrVT, InFlag);
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
+// "local exec" model.
+static SDOperand
+LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                         const MVT::ValueType PtrVT) {
+  // Get the Thread Pointer
+  SDOperand ThreadPointer = DAG.getNode(X86ISD::THREAD_POINTER, PtrVT);
+  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
+  // exec)
+  SDOperand TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
+                                             GA->getValueType(0),
+                                             GA->getOffset());
+  SDOperand Offset = DAG.getNode(X86ISD::Wrapper, PtrVT, TGA);
+
+  if (GA->getGlobal()->isDeclaration()) // initial exec TLS model
+    Offset = DAG.getLoad(PtrVT, DAG.getEntryNode(), Offset, NULL, 0);
+
+  // The address of the thread-local variable is the sum of the thread
+  // pointer and the offset of the variable.
+  return DAG.getNode(ISD::ADD, PtrVT, ThreadPointer, Offset);
+}
+
+SDOperand
+X86TargetLowering::LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG) {
+  // TODO: implement the "local dynamic" model
+  // TODO: implement the "initial exec"model for pic executables
+  assert(!Subtarget->is64Bit() && Subtarget->isTargetELF() &&
+         "TLS not implemented for non-ELF and 64-bit targets");
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  // If the relocation model is PIC, use the "general dynamic" TLS model;
+  // otherwise use the "local exec" TLS model.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_)
+    return LowerToTLSGeneralDynamicModel(GA, DAG, getPointerTy());
+  else
+    return LowerToTLSExecModel(GA, DAG, getPointerTy());
+}
+
+SDOperand
+X86TargetLowering::LowerExternalSymbol(SDOperand Op, SelectionDAG &DAG) {
+  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+  SDOperand Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
+  Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
+  // With PIC, the address is actually $g + Offset.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      !Subtarget->isPICStyleRIPRel()) {
+    Result = DAG.getNode(ISD::ADD, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+                         Result);
+  }
+
+  return Result;
+}
+
+SDOperand X86TargetLowering::LowerJumpTable(SDOperand Op, SelectionDAG &DAG) {
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  SDOperand Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy());
+  Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
+  // With PIC, the address is actually $g + Offset.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      !Subtarget->isPICStyleRIPRel()) {
+    Result = DAG.getNode(ISD::ADD, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+                         Result);
+  }
+
+  return Result;
+}
+
+SDOperand X86TargetLowering::LowerShift(SDOperand Op, SelectionDAG &DAG) {
+    assert(Op.getNumOperands() == 3 && Op.getValueType() == MVT::i32 &&
+           "Not an i64 shift!");
+    bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
+    SDOperand ShOpLo = Op.getOperand(0);
+    SDOperand ShOpHi = Op.getOperand(1);
+    SDOperand ShAmt  = Op.getOperand(2);
+    SDOperand Tmp1 = isSRA ?
+      DAG.getNode(ISD::SRA, MVT::i32, ShOpHi, DAG.getConstant(31, MVT::i8)) :
+      DAG.getConstant(0, MVT::i32);
+
+    SDOperand Tmp2, Tmp3;
+    if (Op.getOpcode() == ISD::SHL_PARTS) {
+      Tmp2 = DAG.getNode(X86ISD::SHLD, MVT::i32, ShOpHi, ShOpLo, ShAmt);
+      Tmp3 = DAG.getNode(ISD::SHL, MVT::i32, ShOpLo, ShAmt);
+    } else {
+      Tmp2 = DAG.getNode(X86ISD::SHRD, MVT::i32, ShOpLo, ShOpHi, ShAmt);
+      Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, MVT::i32, ShOpHi, ShAmt);
+    }
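+    // If bit 5 of the shift amount is set (shift >= 32), the CMOVs below pick
+    // the single-shift result for one half and the fill value (zero, or sign
+    // bits for an arithmetic shift) for the other; otherwise the SHLD/SHRD
+    // double-shift results are used.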
+
+    const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag);
+    SDOperand AndNode = DAG.getNode(ISD::AND, MVT::i8, ShAmt,
+                                    DAG.getConstant(32, MVT::i8));
+    SDOperand COps[]={DAG.getEntryNode(), AndNode, DAG.getConstant(0, MVT::i8)};
+    SDOperand InFlag = DAG.getNode(X86ISD::CMP, VTs, 2, COps, 3).getValue(1);
+
+    SDOperand Hi, Lo;
+    SDOperand CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+
+    VTs = DAG.getNodeValueTypes(MVT::i32, MVT::Flag);
+    SmallVector<SDOperand, 4> Ops;
+    if (Op.getOpcode() == ISD::SHL_PARTS) {
+      Ops.push_back(Tmp2);
+      Ops.push_back(Tmp3);
+      Ops.push_back(CC);
+      Ops.push_back(InFlag);
+      Hi = DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
+      InFlag = Hi.getValue(1);
+
+      Ops.clear();
+      Ops.push_back(Tmp3);
+      Ops.push_back(Tmp1);
+      Ops.push_back(CC);
+      Ops.push_back(InFlag);
+      Lo = DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
+    } else {
+      Ops.push_back(Tmp2);
+      Ops.push_back(Tmp3);
+      Ops.push_back(CC);
+      Ops.push_back(InFlag);
+      Lo = DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
+      InFlag = Lo.getValue(1);
+
+      Ops.clear();
+      Ops.push_back(Tmp3);
+      Ops.push_back(Tmp1);
+      Ops.push_back(CC);
+      Ops.push_back(InFlag);
+      Hi = DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
+    }
+
+    VTs = DAG.getNodeValueTypes(MVT::i32, MVT::i32);
+    Ops.clear();
+    Ops.push_back(Lo);
+    Ops.push_back(Hi);
+    return DAG.getNode(ISD::MERGE_VALUES, VTs, 2, &Ops[0], Ops.size());
+}
+
+SDOperand X86TargetLowering::LowerSINT_TO_FP(SDOperand Op, SelectionDAG &DAG) {
+  assert(Op.getOperand(0).getValueType() <= MVT::i64 &&
+         Op.getOperand(0).getValueType() >= MVT::i16 &&
+         "Unknown SINT_TO_FP to lower!");
+
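+  // FILD only reads its integer operand from memory, so spill the source
+  // value to a stack slot first.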
+  SDOperand Result;
+  MVT::ValueType SrcVT = Op.getOperand(0).getValueType();
+  unsigned Size = MVT::getSizeInBits(SrcVT)/8;
+  MachineFunction &MF = DAG.getMachineFunction();
+  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
+  SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  SDOperand Chain = DAG.getStore(DAG.getEntryNode(), Op.getOperand(0),
+                                 StackSlot, NULL, 0);
+
+  // Build the FILD
+  SDVTList Tys;
+  if (X86ScalarSSE)
+    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
+  else
+    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
+  SmallVector<SDOperand, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(StackSlot);
+  Ops.push_back(DAG.getValueType(SrcVT));
+  Result = DAG.getNode(X86ScalarSSE ? X86ISD::FILD_FLAG :X86ISD::FILD,
+                       Tys, &Ops[0], Ops.size());
+
+  if (X86ScalarSSE) {
+    Chain = Result.getValue(1);
+    SDOperand InFlag = Result.getValue(2);
+
+    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
+    // shouldn't be necessary except that RFP cannot be live across
+    // multiple blocks. When the stackifier is fixed, they can be uncoupled.
+    MachineFunction &MF = DAG.getMachineFunction();
+    int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
+    SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+    Tys = DAG.getVTList(MVT::Other);
+    SmallVector<SDOperand, 8> Ops;
+    Ops.push_back(Chain);
+    Ops.push_back(Result);
+    Ops.push_back(StackSlot);
+    Ops.push_back(DAG.getValueType(Op.getValueType()));
+    Ops.push_back(InFlag);
+    Chain = DAG.getNode(X86ISD::FST, Tys, &Ops[0], Ops.size());
+    Result = DAG.getLoad(Op.getValueType(), Chain, StackSlot, NULL, 0);
+  }
+
+  return Result;
+}
+
+SDOperand X86TargetLowering::LowerFP_TO_SINT(SDOperand Op, SelectionDAG &DAG) {
+  assert(Op.getValueType() <= MVT::i64 && Op.getValueType() >= MVT::i16 &&
+         "Unknown FP_TO_SINT to lower!");
+  // We lower FP->sint into a FIST* store to a temporary stack slot, followed
+  // by a load of the result.
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned MemSize = MVT::getSizeInBits(Op.getValueType())/8;
+  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
+  SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+  unsigned Opc;
+  switch (Op.getValueType()) {
+    default: assert(0 && "Invalid FP_TO_SINT to lower!");
+    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+  }
+
+  SDOperand Chain = DAG.getEntryNode();
+  SDOperand Value = Op.getOperand(0);
+  if (X86ScalarSSE) {
+    assert(Op.getValueType() == MVT::i64 && "Invalid FP_TO_SINT to lower!");
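+    // FIST* operates on the x87 stack, so spill the SSE value to memory and
+    // reload it with an x87 FLD before issuing the FIST.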
+    Chain = DAG.getStore(Chain, Value, StackSlot, NULL, 0);
+    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
+    SDOperand Ops[] = {
+      Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
+    };
+    Value = DAG.getNode(X86ISD::FLD, Tys, Ops, 3);
+    Chain = Value.getValue(1);
+    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
+    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  }
+
+  // Build the FP_TO_INT*_IN_MEM
+  SDOperand Ops[] = { Chain, Value, StackSlot };
+  SDOperand FIST = DAG.getNode(Opc, MVT::Other, Ops, 3);
+
+  // Load the result.
+  return DAG.getLoad(Op.getValueType(), FIST, StackSlot, NULL, 0);
+}
+
+SDOperand X86TargetLowering::LowerFABS(SDOperand Op, SelectionDAG &DAG) {
+  MVT::ValueType VT = Op.getValueType();
+  MVT::ValueType EltVT = VT;
+  if (MVT::isVector(VT))
+    EltVT = MVT::getVectorElementType(VT);
+  const Type *OpNTy =  MVT::getTypeForValueType(EltVT);
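+  // fabs is lowered to an AND with a constant mask that clears only the
+  // sign bit.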
+  std::vector<Constant*> CV;
+  if (EltVT == MVT::f64) {
+    Constant *C = ConstantFP::get(OpNTy, BitsToDouble(~(1ULL << 63)));
+    CV.push_back(C);
+    CV.push_back(C);
+  } else {
+    Constant *C = ConstantFP::get(OpNTy, BitsToFloat(~(1U << 31)));
+    CV.push_back(C);
+    CV.push_back(C);
+    CV.push_back(C);
+    CV.push_back(C);
+  }
+  Constant *CS = ConstantStruct::get(CV);
+  SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
+  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+  SmallVector<SDOperand, 3> Ops;
+  Ops.push_back(DAG.getEntryNode());
+  Ops.push_back(CPIdx);
+  Ops.push_back(DAG.getSrcValue(NULL));
+  SDOperand Mask = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size());
+  return DAG.getNode(X86ISD::FAND, VT, Op.getOperand(0), Mask);
+}
+
+SDOperand X86TargetLowering::LowerFNEG(SDOperand Op, SelectionDAG &DAG) {
+  MVT::ValueType VT = Op.getValueType();
+  MVT::ValueType EltVT = VT;
+  if (MVT::isVector(VT))
+    EltVT = MVT::getVectorElementType(VT);
+  const Type *OpNTy =  MVT::getTypeForValueType(EltVT);
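+  // fneg is lowered to an XOR with a constant mask that has only the sign
+  // bit set, flipping the sign.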
+  std::vector<Constant*> CV;
+  if (EltVT == MVT::f64) {
+    Constant *C = ConstantFP::get(OpNTy, BitsToDouble(1ULL << 63));
+    CV.push_back(C);
+    CV.push_back(C);
+  } else {
+    Constant *C = ConstantFP::get(OpNTy, BitsToFloat(1U << 31));
+    CV.push_back(C);
+    CV.push_back(C);
+    CV.push_back(C);
+    CV.push_back(C);
+  }
+  Constant *CS = ConstantStruct::get(CV);
+  SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
+  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+  SmallVector<SDOperand, 3> Ops;
+  Ops.push_back(DAG.getEntryNode());
+  Ops.push_back(CPIdx);
+  Ops.push_back(DAG.getSrcValue(NULL));
+  SDOperand Mask = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size());
+  return DAG.getNode(X86ISD::FXOR, VT, Op.getOperand(0), Mask);
+}
+
+SDOperand X86TargetLowering::LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand Op0 = Op.getOperand(0);
+  SDOperand Op1 = Op.getOperand(1);
+  MVT::ValueType VT = Op.getValueType();
+  MVT::ValueType SrcVT = Op1.getValueType();
+  const Type *SrcTy =  MVT::getTypeForValueType(SrcVT);
+
+  // If second operand is smaller, extend it first.
+  if (MVT::getSizeInBits(SrcVT) < MVT::getSizeInBits(VT)) {
+    Op1 = DAG.getNode(ISD::FP_EXTEND, VT, Op1);
+    SrcVT = VT;
+  }
+
+  // First get the sign bit of second operand.
+  std::vector<Constant*> CV;
+  if (SrcVT == MVT::f64) {
+    CV.push_back(ConstantFP::get(SrcTy, BitsToDouble(1ULL << 63)));
+    CV.push_back(ConstantFP::get(SrcTy, 0.0));
+  } else {
+    CV.push_back(ConstantFP::get(SrcTy, BitsToFloat(1U << 31)));
+    CV.push_back(ConstantFP::get(SrcTy, 0.0));
+    CV.push_back(ConstantFP::get(SrcTy, 0.0));
+    CV.push_back(ConstantFP::get(SrcTy, 0.0));
+  }
+  Constant *CS = ConstantStruct::get(CV);
+  SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
+  SDVTList Tys = DAG.getVTList(SrcVT, MVT::Other);
+  SmallVector<SDOperand, 3> Ops;
+  Ops.push_back(DAG.getEntryNode());
+  Ops.push_back(CPIdx);
+  Ops.push_back(DAG.getSrcValue(NULL));
+  SDOperand Mask1 = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size());
+  SDOperand SignBit = DAG.getNode(X86ISD::FAND, SrcVT, Op1, Mask1);
+
+  // Shift sign bit right or left if the two operands have different types.
+  if (MVT::getSizeInBits(SrcVT) > MVT::getSizeInBits(VT)) {
+    // Op0 is MVT::f32, Op1 is MVT::f64.
+    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v2f64, SignBit);
+    SignBit = DAG.getNode(X86ISD::FSRL, MVT::v2f64, SignBit,
+                          DAG.getConstant(32, MVT::i32));
+    SignBit = DAG.getNode(ISD::BIT_CONVERT, MVT::v4f32, SignBit);
+    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::f32, SignBit,
+                          DAG.getConstant(0, getPointerTy()));
+  }
+
+  // Clear first operand sign bit.
+  CV.clear();
+  if (VT == MVT::f64) {
+    CV.push_back(ConstantFP::get(SrcTy, BitsToDouble(~(1ULL << 63))));
+    CV.push_back(ConstantFP::get(SrcTy, 0.0));
+  } else {
+    CV.push_back(ConstantFP::get(SrcTy, BitsToFloat(~(1U << 31))));
+    CV.push_back(ConstantFP::get(SrcTy, 0.0));
+    CV.push_back(ConstantFP::get(SrcTy, 0.0));
+    CV.push_back(ConstantFP::get(SrcTy, 0.0));
+  }
+  CS = ConstantStruct::get(CV);
+  CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
+  Tys = DAG.getVTList(VT, MVT::Other);
+  Ops.clear();
+  Ops.push_back(DAG.getEntryNode());
+  Ops.push_back(CPIdx);
+  Ops.push_back(DAG.getSrcValue(NULL));
+  SDOperand Mask2 = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size());
+  SDOperand Val = DAG.getNode(X86ISD::FAND, VT, Op0, Mask2);
+
+  // Or the value with the sign bit.
+  return DAG.getNode(X86ISD::FOR, VT, Val, SignBit);
+}
+
+SDOperand X86TargetLowering::LowerSETCC(SDOperand Op, SelectionDAG &DAG,
+                                        SDOperand Chain) {
+  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
+  SDOperand Cond;
+  SDOperand Op0 = Op.getOperand(0);
+  SDOperand Op1 = Op.getOperand(1);
+  SDOperand CC = Op.getOperand(2);
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  const MVT::ValueType *VTs1 = DAG.getNodeValueTypes(MVT::Other, MVT::Flag);
+  const MVT::ValueType *VTs2 = DAG.getNodeValueTypes(MVT::i8, MVT::Flag);
+  bool isFP = MVT::isFloatingPoint(Op.getOperand(1).getValueType());
+  unsigned X86CC;
+
+  if (translateX86CC(cast<CondCodeSDNode>(CC)->get(), isFP, X86CC,
+                     Op0, Op1, DAG)) {
+    SDOperand Ops1[] = { Chain, Op0, Op1 };
+    Cond = DAG.getNode(X86ISD::CMP, VTs1, 2, Ops1, 3).getValue(1);
+    SDOperand Ops2[] = { DAG.getConstant(X86CC, MVT::i8), Cond };
+    return DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops2, 2);
+  }
+
+  assert(isFP && "Illegal integer SetCC!");
+
+  SDOperand COps[] = { Chain, Op0, Op1 };
+  Cond = DAG.getNode(X86ISD::CMP, VTs1, 2, COps, 3).getValue(1);
+
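+  // SETOEQ and SETUNE have no single x86 condition code after a
+  // floating-point compare, so each is built from two SETCC results
+  // combined below.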
+  switch (SetCCOpcode) {
+  default: assert(false && "Illegal floating point SetCC!");
+  case ISD::SETOEQ: {  // !PF & ZF
+    SDOperand Ops1[] = { DAG.getConstant(X86::COND_NP, MVT::i8), Cond };
+    SDOperand Tmp1 = DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops1, 2);
+    SDOperand Ops2[] = { DAG.getConstant(X86::COND_E, MVT::i8),
+                         Tmp1.getValue(1) };
+    SDOperand Tmp2 = DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops2, 2);
+    return DAG.getNode(ISD::AND, MVT::i8, Tmp1, Tmp2);
+  }
+  case ISD::SETUNE: {  // PF | !ZF
+    SDOperand Ops1[] = { DAG.getConstant(X86::COND_P, MVT::i8), Cond };
+    SDOperand Tmp1 = DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops1, 2);
+    SDOperand Ops2[] = { DAG.getConstant(X86::COND_NE, MVT::i8),
+                         Tmp1.getValue(1) };
+    SDOperand Tmp2 = DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops2, 2);
+    return DAG.getNode(ISD::OR, MVT::i8, Tmp1, Tmp2);
+  }
+  }
+}
+
+SDOperand X86TargetLowering::LowerSELECT(SDOperand Op, SelectionDAG &DAG) {
+  bool addTest = true;
+  SDOperand Chain = DAG.getEntryNode();
+  SDOperand Cond  = Op.getOperand(0);
+  SDOperand CC;
+  const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag);
+
+  if (Cond.getOpcode() == ISD::SETCC)
+    Cond = LowerSETCC(Cond, DAG, Chain);
+
+  if (Cond.getOpcode() == X86ISD::SETCC) {
+    CC = Cond.getOperand(0);
+
+    // If the condition flag is set by an X86ISD::CMP, then make a copy of it
+    // (since the flag operand cannot be shared). Use it as the condition-setting
+    // operand in place of the X86ISD::SETCC.
+    // If the X86ISD::SETCC has more than one use, then perhaps it's better
+    // to use a test instead of duplicating the X86ISD::CMP (for register
+    // pressure reasons)?
+    SDOperand Cmp = Cond.getOperand(1);
+    unsigned Opc = Cmp.getOpcode();
+    bool IllegalFPCMov = !X86ScalarSSE &&
+      MVT::isFloatingPoint(Op.getValueType()) &&
+      !hasFPCMov(cast<ConstantSDNode>(CC)->getSignExtended());
+    if ((Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) &&
+        !IllegalFPCMov) {
+      SDOperand Ops[] = { Chain, Cmp.getOperand(1), Cmp.getOperand(2) };
+      Cond = DAG.getNode(Opc, VTs, 2, Ops, 3);
+      addTest = false;
+    }
+  }
+
+  if (addTest) {
+    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+    SDOperand Ops[] = { Chain, Cond, DAG.getConstant(0, MVT::i8) };
+    Cond = DAG.getNode(X86ISD::CMP, VTs, 2, Ops, 3);
+  }
+
+  VTs = DAG.getNodeValueTypes(Op.getValueType(), MVT::Flag);
+  SmallVector<SDOperand, 4> Ops;
+  // X86ISD::CMOV yields its second value operand when the condition is true
+  // and its first value operand otherwise, so push the false value first and
+  // the true value second.
+  Ops.push_back(Op.getOperand(2));
+  Ops.push_back(Op.getOperand(1));
+  Ops.push_back(CC);
+  Ops.push_back(Cond.getValue(1));
+  return DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
+}
+
+SDOperand X86TargetLowering::LowerBRCOND(SDOperand Op, SelectionDAG &DAG) {
+  bool addTest = true;
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand Cond  = Op.getOperand(1);
+  SDOperand Dest  = Op.getOperand(2);
+  SDOperand CC;
+  const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag);
+
+  if (Cond.getOpcode() == ISD::SETCC)
+    Cond = LowerSETCC(Cond, DAG, Chain);
+
+  if (Cond.getOpcode() == X86ISD::SETCC) {
+    CC = Cond.getOperand(0);
+
+    // If the condition flag is set by an X86ISD::CMP, then make a copy of it
+    // (since the flag operand cannot be shared). Use it as the condition-setting
+    // operand in place of the X86ISD::SETCC.
+    // If the X86ISD::SETCC has more than one use, then perhaps it's better
+    // to use a test instead of duplicating the X86ISD::CMP (for register
+    // pressure reasons)?
+    SDOperand Cmp = Cond.getOperand(1);
+    unsigned Opc = Cmp.getOpcode();
+    if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) {
+      SDOperand Ops[] = { Chain, Cmp.getOperand(1), Cmp.getOperand(2) };
+      Cond = DAG.getNode(Opc, VTs, 2, Ops, 3);
+      addTest = false;
+    }
+  }
+
+  if (addTest) {
+    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+    SDOperand Ops[] = { Chain, Cond, DAG.getConstant(0, MVT::i8) };
+    Cond = DAG.getNode(X86ISD::CMP, VTs, 2, Ops, 3);
+  }
+  return DAG.getNode(X86ISD::BRCOND, Op.getValueType(),
+                     Cond, Op.getOperand(2), CC, Cond.getValue(1));
+}
+
+SDOperand X86TargetLowering::LowerCALL(SDOperand Op, SelectionDAG &DAG) {
+  unsigned CallingConv = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+
+  if (Subtarget->is64Bit())
+    return LowerX86_64CCCCallTo(Op, DAG, CallingConv);
+  else
+    switch (CallingConv) {
+    default:
+      assert(0 && "Unsupported calling convention");
+    case CallingConv::Fast:
+      // TODO: Implement fastcc
+      // Falls through
+    case CallingConv::C:
+    case CallingConv::X86_StdCall:
+      return LowerCCCCallTo(Op, DAG, CallingConv);
+    case CallingConv::X86_FastCall:
+      return LowerFastCCCallTo(Op, DAG, CallingConv);
+    }
+}
+
+
+// Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
+// A call to _alloca is needed to probe the stack when allocating more than 4K
+// bytes in one go. Touching the stack at 4K increments is necessary to ensure
+// that the guard pages used by the OS virtual memory manager are allocated in
+// the correct sequence.
+SDOperand
+X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDOperand Op,
+                                           SelectionDAG &DAG) {
+  assert(Subtarget->isTargetCygMing() &&
+         "This should be used only on Cygwin/Mingw targets");
+  
+  // Get the inputs.
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand Size  = Op.getOperand(1);
+  // FIXME: Ensure alignment here
+
+  SDOperand Flag;
+  
+  MVT::ValueType IntPtr = getPointerTy();
+  MVT::ValueType SPTy = (Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
+
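+  // _alloca takes the allocation size in EAX and adjusts the stack pointer
+  // itself; the updated stack pointer is read back below as the result.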
+  Chain = DAG.getCopyToReg(Chain, X86::EAX, Size, Flag);
+  Flag = Chain.getValue(1);
+
+  SDVTList  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SDOperand Ops[] = { Chain,
+                      DAG.getTargetExternalSymbol("_alloca", IntPtr),
+                      DAG.getRegister(X86::EAX, IntPtr),
+                      Flag };
+  Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops, 4);
+  Flag = Chain.getValue(1);
+
+  Chain = DAG.getCopyFromReg(Chain, X86StackPtr, SPTy).getValue(1);
+  
+  std::vector<MVT::ValueType> Tys;
+  Tys.push_back(SPTy);
+  Tys.push_back(MVT::Other);
+  SDOperand Ops1[2] = { Chain.getValue(0), Chain };
+  return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops1, 2);
+}
+
+SDOperand
+X86TargetLowering::LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function* Fn = MF.getFunction();
+  if (Fn->hasExternalLinkage() &&
+      Subtarget->isTargetCygMing() &&
+      Fn->getName() == "main")
+    MF.getInfo<X86MachineFunctionInfo>()->setForceFramePointer(true);
+
+  unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+  if (Subtarget->is64Bit())
+    return LowerX86_64CCCArguments(Op, DAG);
+  else
+    switch(CC) {
+    default:
+      assert(0 && "Unsupported calling convention");
+    case CallingConv::Fast:
+      // TODO: implement fastcc.
+      
+      // Falls through
+    case CallingConv::C:
+      return LowerCCCArguments(Op, DAG);
+    case CallingConv::X86_StdCall:
+      MF.getInfo<X86MachineFunctionInfo>()->setDecorationStyle(StdCall);
+      return LowerCCCArguments(Op, DAG, true);
+    case CallingConv::X86_FastCall:
+      MF.getInfo<X86MachineFunctionInfo>()->setDecorationStyle(FastCall);
+      return LowerFastCCArguments(Op, DAG);
+    }
+}
+
+SDOperand X86TargetLowering::LowerMEMSET(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand InFlag(0, 0);
+  SDOperand Chain = Op.getOperand(0);
+  unsigned Align =
+    (unsigned)cast<ConstantSDNode>(Op.getOperand(4))->getValue();
+  if (Align == 0) Align = 1;
+
+  ConstantSDNode *I = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+  // If not DWORD aligned, or if the constant size is below the threshold,
+  // call memset; it knows how to align to the right boundary first.
+  if ((Align & 3) != 0 ||
+      (I && I->getValue() < Subtarget->getMinRepStrSizeThreshold())) {
+    MVT::ValueType IntPtr = getPointerTy();
+    const Type *IntPtrTy = getTargetData()->getIntPtrType();
+    TargetLowering::ArgListTy Args; 
+    TargetLowering::ArgListEntry Entry;
+    Entry.Node = Op.getOperand(1);
+    Entry.Ty = IntPtrTy;
+    Args.push_back(Entry);
+    // Extend the unsigned i8 argument to be an int value for the call.
+    Entry.Node = DAG.getNode(ISD::ZERO_EXTEND, MVT::i32, Op.getOperand(2));
+    Entry.Ty = IntPtrTy;
+    Args.push_back(Entry);
+    Entry.Node = Op.getOperand(3);
+    Args.push_back(Entry);
+    std::pair<SDOperand,SDOperand> CallResult =
+      LowerCallTo(Chain, Type::VoidTy, false, false, CallingConv::C, false,
+                  DAG.getExternalSymbol("memset", IntPtr), Args, DAG);
+    return CallResult.second;
+  }
+
+  MVT::ValueType AVT;
+  SDOperand Count;
+  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+  unsigned BytesLeft = 0;
+  bool TwoRepStos = false;
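+  // When the element size is wider than a byte: a constant count leaves the
+  // tail bytes (BytesLeft) to explicit stores below, while a variable count
+  // uses a second rep;stosb (TwoRepStos) to finish the remainder.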
+  if (ValC) {
+    unsigned ValReg;
+    uint64_t Val = ValC->getValue() & 255;
+
+    // If the value is a constant, then we can potentially use wider stores.
+    switch (Align & 3) {
+      case 2:   // WORD aligned
+        AVT = MVT::i16;
+        ValReg = X86::AX;
+        Val = (Val << 8) | Val;
+        break;
+      case 0:  // DWORD aligned
+        AVT = MVT::i32;
+        ValReg = X86::EAX;
+        Val = (Val << 8)  | Val;
+        Val = (Val << 16) | Val;
+        if (Subtarget->is64Bit() && ((Align & 0xF) == 0)) {  // QWORD aligned
+          AVT = MVT::i64;
+          ValReg = X86::RAX;
+          Val = (Val << 32) | Val;
+        }
+        break;
+      default:  // Byte aligned
+        AVT = MVT::i8;
+        ValReg = X86::AL;
+        Count = Op.getOperand(3);
+        break;
+    }
+
+    if (AVT > MVT::i8) {
+      if (I) {
+        unsigned UBytes = MVT::getSizeInBits(AVT) / 8;
+        Count = DAG.getConstant(I->getValue() / UBytes, getPointerTy());
+        BytesLeft = I->getValue() % UBytes;
+      } else {
+        assert(AVT >= MVT::i32 &&
+               "Do not use rep;stos if not at least DWORD aligned");
+        Count = DAG.getNode(ISD::SRL, Op.getOperand(3).getValueType(),
+                            Op.getOperand(3), DAG.getConstant(2, MVT::i8));
+        TwoRepStos = true;
+      }
+    }
+
+    Chain  = DAG.getCopyToReg(Chain, ValReg, DAG.getConstant(Val, AVT),
+                              InFlag);
+    InFlag = Chain.getValue(1);
+  } else {
+    AVT = MVT::i8;
+    Count  = Op.getOperand(3);
+    Chain  = DAG.getCopyToReg(Chain, X86::AL, Op.getOperand(2), InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RCX : X86::ECX,
+                            Count, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RDI : X86::EDI,
+                            Op.getOperand(1), InFlag);
+  InFlag = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDOperand, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(DAG.getValueType(AVT));
+  Ops.push_back(InFlag);
+  Chain  = DAG.getNode(X86ISD::REP_STOS, Tys, &Ops[0], Ops.size());
+
+  if (TwoRepStos) {
+    InFlag = Chain.getValue(1);
+    Count = Op.getOperand(3);
+    MVT::ValueType CVT = Count.getValueType();
+    SDOperand Left = DAG.getNode(ISD::AND, CVT, Count,
+                               DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
+    Chain  = DAG.getCopyToReg(Chain, (CVT == MVT::i64) ? X86::RCX : X86::ECX,
+                              Left, InFlag);
+    InFlag = Chain.getValue(1);
+    Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+    Ops.clear();
+    Ops.push_back(Chain);
+    Ops.push_back(DAG.getValueType(MVT::i8));
+    Ops.push_back(InFlag);
+    Chain  = DAG.getNode(X86ISD::REP_STOS, Tys, &Ops[0], Ops.size());
+  } else if (BytesLeft) {
+    // Issue stores for the last 1 - 7 bytes.
+    SDOperand Value;
+    unsigned Val = ValC->getValue() & 255;
+    unsigned Offset = I->getValue() - BytesLeft;
+    SDOperand DstAddr = Op.getOperand(1);
+    MVT::ValueType AddrVT = DstAddr.getValueType();
+    if (BytesLeft >= 4) {
+      Val = (Val << 8)  | Val;
+      Val = (Val << 16) | Val;
+      Value = DAG.getConstant(Val, MVT::i32);
+      Chain = DAG.getStore(Chain, Value,
+                           DAG.getNode(ISD::ADD, AddrVT, DstAddr,
+                                       DAG.getConstant(Offset, AddrVT)),
+                           NULL, 0);
+      BytesLeft -= 4;
+      Offset += 4;
+    }
+    if (BytesLeft >= 2) {
+      Value = DAG.getConstant((Val << 8) | Val, MVT::i16);
+      Chain = DAG.getStore(Chain, Value,
+                           DAG.getNode(ISD::ADD, AddrVT, DstAddr,
+                                       DAG.getConstant(Offset, AddrVT)),
+                           NULL, 0);
+      BytesLeft -= 2;
+      Offset += 2;
+    }
+    if (BytesLeft == 1) {
+      Value = DAG.getConstant(Val, MVT::i8);
+      Chain = DAG.getStore(Chain, Value,
+                           DAG.getNode(ISD::ADD, AddrVT, DstAddr,
+                                       DAG.getConstant(Offset, AddrVT)),
+                           NULL, 0);
+    }
+  }
+
+  return Chain;
+}
+
+SDOperand X86TargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand Chain = Op.getOperand(0);
+  unsigned Align =
+    (unsigned)cast<ConstantSDNode>(Op.getOperand(4))->getValue();
+  if (Align == 0) Align = 1;
+
+  ConstantSDNode *I = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+  // If not DWORD aligned, or if the constant size is below the threshold,
+  // call memcpy; it knows how to align to the right boundary first.
+  if ((Align & 3) != 0 ||
+      (I && I->getValue() < Subtarget->getMinRepStrSizeThreshold())) {
+    MVT::ValueType IntPtr = getPointerTy();
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = getTargetData()->getIntPtrType();
+    Entry.Node = Op.getOperand(1); Args.push_back(Entry);
+    Entry.Node = Op.getOperand(2); Args.push_back(Entry);
+    Entry.Node = Op.getOperand(3); Args.push_back(Entry);
+    std::pair<SDOperand,SDOperand> CallResult =
+      LowerCallTo(Chain, Type::VoidTy, false, false, CallingConv::C, false,
+                  DAG.getExternalSymbol("memcpy", IntPtr), Args, DAG);
+    return CallResult.second;
+  }
+
+  MVT::ValueType AVT;
+  SDOperand Count;
+  unsigned BytesLeft = 0;
+  bool TwoRepMovs = false;
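+  // When the element size is wider than a byte: a constant count leaves the
+  // tail bytes (BytesLeft) to explicit loads/stores below, while a variable
+  // count uses a second rep;movsb (TwoRepMovs) to finish the remainder.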
+  switch (Align & 3) {
+    case 2:   // WORD aligned
+      AVT = MVT::i16;
+      break;
+    case 0:  // DWORD aligned
+      AVT = MVT::i32;
+      if (Subtarget->is64Bit() && ((Align & 0xF) == 0))  // QWORD aligned
+        AVT = MVT::i64;
+      break;
+    default:  // Byte aligned
+      AVT = MVT::i8;
+      Count = Op.getOperand(3);
+      break;
+  }
+
+  if (AVT > MVT::i8) {
+    if (I) {
+      unsigned UBytes = MVT::getSizeInBits(AVT) / 8;
+      Count = DAG.getConstant(I->getValue() / UBytes, getPointerTy());
+      BytesLeft = I->getValue() % UBytes;
+    } else {
+      assert(AVT >= MVT::i32 &&
+             "Do not use rep;movs if not at least DWORD aligned");
+      Count = DAG.getNode(ISD::SRL, Op.getOperand(3).getValueType(),
+                          Op.getOperand(3), DAG.getConstant(2, MVT::i8));
+      TwoRepMovs = true;
+    }
+  }
+
+  SDOperand InFlag(0, 0);
+  Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RCX : X86::ECX,
+                            Count, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RDI : X86::EDI,
+                            Op.getOperand(1), InFlag);
+  InFlag = Chain.getValue(1);
+  Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RSI : X86::ESI,
+                            Op.getOperand(2), InFlag);
+  InFlag = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDOperand, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(DAG.getValueType(AVT));
+  Ops.push_back(InFlag);
+  Chain = DAG.getNode(X86ISD::REP_MOVS, Tys, &Ops[0], Ops.size());
+
+  if (TwoRepMovs) {
+    InFlag = Chain.getValue(1);
+    Count = Op.getOperand(3);
+    MVT::ValueType CVT = Count.getValueType();
+    SDOperand Left = DAG.getNode(ISD::AND, CVT, Count,
+                               DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
+    Chain  = DAG.getCopyToReg(Chain, (CVT == MVT::i64) ? X86::RCX : X86::ECX,
+                              Left, InFlag);
+    InFlag = Chain.getValue(1);
+    Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+    Ops.clear();
+    Ops.push_back(Chain);
+    Ops.push_back(DAG.getValueType(MVT::i8));
+    Ops.push_back(InFlag);
+    Chain = DAG.getNode(X86ISD::REP_MOVS, Tys, &Ops[0], Ops.size());
+  } else if (BytesLeft) {
+    // Issue loads and stores for the last 1 - 7 bytes.
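+    // e.g. with 7 bytes left this emits one i32, one i16 and one i8 copy.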
+    unsigned Offset = I->getValue() - BytesLeft;
+    SDOperand DstAddr = Op.getOperand(1);
+    MVT::ValueType DstVT = DstAddr.getValueType();
+    SDOperand SrcAddr = Op.getOperand(2);
+    MVT::ValueType SrcVT = SrcAddr.getValueType();
+    SDOperand Value;
+    if (BytesLeft >= 4) {
+      Value = DAG.getLoad(MVT::i32, Chain,
+                          DAG.getNode(ISD::ADD, SrcVT, SrcAddr,
+                                      DAG.getConstant(Offset, SrcVT)),
+                          NULL, 0);
+      Chain = Value.getValue(1);
+      Chain = DAG.getStore(Chain, Value,
+                           DAG.getNode(ISD::ADD, DstVT, DstAddr,
+                                       DAG.getConstant(Offset, DstVT)),
+                           NULL, 0);
+      BytesLeft -= 4;
+      Offset += 4;
+    }
+    if (BytesLeft >= 2) {
+      Value = DAG.getLoad(MVT::i16, Chain,
+                          DAG.getNode(ISD::ADD, SrcVT, SrcAddr,
+                                      DAG.getConstant(Offset, SrcVT)),
+                          NULL, 0);
+      Chain = Value.getValue(1);
+      Chain = DAG.getStore(Chain, Value,
+                           DAG.getNode(ISD::ADD, DstVT, DstAddr,
+                                       DAG.getConstant(Offset, DstVT)),
+                           NULL, 0);
+      BytesLeft -= 2;
+      Offset += 2;
+    }
+
+    if (BytesLeft == 1) {
+      Value = DAG.getLoad(MVT::i8, Chain,
+                          DAG.getNode(ISD::ADD, SrcVT, SrcAddr,
+                                      DAG.getConstant(Offset, SrcVT)),
+                          NULL, 0);
+      Chain = Value.getValue(1);
+      Chain = DAG.getStore(Chain, Value,
+                           DAG.getNode(ISD::ADD, DstVT, DstAddr,
+                                       DAG.getConstant(Offset, DstVT)),
+                           NULL, 0);
+    }
+  }
+
+  return Chain;
+}
+
+SDOperand
+X86TargetLowering::LowerREADCYCLCECOUNTER(SDOperand Op, SelectionDAG &DAG) {
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SDOperand TheOp = Op.getOperand(0);
+  SDOperand rd = DAG.getNode(X86ISD::RDTSC_DAG, Tys, &TheOp, 1);
+  if (Subtarget->is64Bit()) {
+    SDOperand Copy1 = DAG.getCopyFromReg(rd, X86::RAX, MVT::i64, rd.getValue(1));
+    SDOperand Copy2 = DAG.getCopyFromReg(Copy1.getValue(1), X86::RDX,
+                                         MVT::i64, Copy1.getValue(2));
+    SDOperand Tmp = DAG.getNode(ISD::SHL, MVT::i64, Copy2,
+                                DAG.getConstant(32, MVT::i8));
+    SDOperand Ops[] = {
+      DAG.getNode(ISD::OR, MVT::i64, Copy1, Tmp), Copy2.getValue(1)
+    };
+    
+    Tys = DAG.getVTList(MVT::i64, MVT::Other);
+    return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops, 2);
+  }
+  
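+  // On 32-bit targets the counter stays split: the node returns the low half
+  // from EAX and the high half from EDX as two i32 values.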
+  SDOperand Copy1 = DAG.getCopyFromReg(rd, X86::EAX, MVT::i32, rd.getValue(1));
+  SDOperand Copy2 = DAG.getCopyFromReg(Copy1.getValue(1), X86::EDX,
+                                       MVT::i32, Copy1.getValue(2));
+  SDOperand Ops[] = { Copy1, Copy2, Copy2.getValue(1) };
+  Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+  return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops, 3);
+}
+
+SDOperand X86TargetLowering::LowerVASTART(SDOperand Op, SelectionDAG &DAG) {
+  SrcValueSDNode *SV = cast<SrcValueSDNode>(Op.getOperand(2));
+
+  if (!Subtarget->is64Bit()) {
+    // vastart just stores the address of the VarArgsFrameIndex slot into the
+    // memory location argument.
+    SDOperand FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+    return DAG.getStore(Op.getOperand(0), FR, Op.getOperand(1), SV->getValue(),
+                        SV->getOffset());
+  }
+
+  // __va_list_tag:
+  //   gp_offset         (0 - 6 * 8)
+  //   fp_offset         (48 - 48 + 8 * 16)
+  //   overflow_arg_area (points to parameters passed in memory).
+  //   reg_save_area
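+  // In C terms this is the x86-64 ABI structure
+  //   struct __va_list_tag {
+  //     unsigned gp_offset;        // byte offset 0
+  //     unsigned fp_offset;        // byte offset 4
+  //     void *overflow_arg_area;   // byte offset 8
+  //     void *reg_save_area;       // byte offset 16
+  //   };
+  // and the stores below fill in the four fields in order.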
+  SmallVector<SDOperand, 8> MemOps;
+  SDOperand FIN = Op.getOperand(1);
+  // Store gp_offset
+  SDOperand Store = DAG.getStore(Op.getOperand(0),
+                                 DAG.getConstant(VarArgsGPOffset, MVT::i32),
+                                 FIN, SV->getValue(), SV->getOffset());
+  MemOps.push_back(Store);
+
+  // Store fp_offset
+  FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
+                    DAG.getConstant(4, getPointerTy()));
+  Store = DAG.getStore(Op.getOperand(0),
+                       DAG.getConstant(VarArgsFPOffset, MVT::i32),
+                       FIN, SV->getValue(), SV->getOffset());
+  MemOps.push_back(Store);
+
+  // Store ptr to overflow_arg_area
+  FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
+                    DAG.getConstant(4, getPointerTy()));
+  SDOperand OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+  Store = DAG.getStore(Op.getOperand(0), OVFIN, FIN, SV->getValue(),
+                       SV->getOffset());
+  MemOps.push_back(Store);
+
+  // Store ptr to reg_save_area.
+  FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
+                    DAG.getConstant(8, getPointerTy()));
+  SDOperand RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
+  Store = DAG.getStore(Op.getOperand(0), RSFIN, FIN, SV->getValue(),
+                       SV->getOffset());
+  MemOps.push_back(Store);
+  return DAG.getNode(ISD::TokenFactor, MVT::Other, &MemOps[0], MemOps.size());
+}
+
+SDOperand X86TargetLowering::LowerVACOPY(SDOperand Op, SelectionDAG &DAG) {
+  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand DstPtr = Op.getOperand(1);
+  SDOperand SrcPtr = Op.getOperand(2);
+  SrcValueSDNode *DstSV = cast<SrcValueSDNode>(Op.getOperand(3));
+  SrcValueSDNode *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4));
+
+  SrcPtr = DAG.getLoad(getPointerTy(), Chain, SrcPtr,
+                       SrcSV->getValue(), SrcSV->getOffset());
+  Chain = SrcPtr.getValue(1);
+  for (unsigned i = 0; i < 3; ++i) {
+    SDOperand Val = DAG.getLoad(MVT::i64, Chain, SrcPtr,
+                                SrcSV->getValue(), SrcSV->getOffset());
+    Chain = Val.getValue(1);
+    Chain = DAG.getStore(Chain, Val, DstPtr,
+                         DstSV->getValue(), DstSV->getOffset());
+    if (i == 2)
+      break;
+    SrcPtr = DAG.getNode(ISD::ADD, getPointerTy(), SrcPtr, 
+                         DAG.getConstant(8, getPointerTy()));
+    DstPtr = DAG.getNode(ISD::ADD, getPointerTy(), DstPtr, 
+                         DAG.getConstant(8, getPointerTy()));
+  }
+  return Chain;
+}
+
+SDOperand
+X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDOperand Op, SelectionDAG &DAG) {
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getValue();
+  switch (IntNo) {
+  default: return SDOperand();    // Don't custom lower most intrinsics.
+    // Comparison intrinsics.
+  case Intrinsic::x86_sse_comieq_ss:
+  case Intrinsic::x86_sse_comilt_ss:
+  case Intrinsic::x86_sse_comile_ss:
+  case Intrinsic::x86_sse_comigt_ss:
+  case Intrinsic::x86_sse_comige_ss:
+  case Intrinsic::x86_sse_comineq_ss:
+  case Intrinsic::x86_sse_ucomieq_ss:
+  case Intrinsic::x86_sse_ucomilt_ss:
+  case Intrinsic::x86_sse_ucomile_ss:
+  case Intrinsic::x86_sse_ucomigt_ss:
+  case Intrinsic::x86_sse_ucomige_ss:
+  case Intrinsic::x86_sse_ucomineq_ss:
+  case Intrinsic::x86_sse2_comieq_sd:
+  case Intrinsic::x86_sse2_comilt_sd:
+  case Intrinsic::x86_sse2_comile_sd:
+  case Intrinsic::x86_sse2_comigt_sd:
+  case Intrinsic::x86_sse2_comige_sd:
+  case Intrinsic::x86_sse2_comineq_sd:
+  case Intrinsic::x86_sse2_ucomieq_sd:
+  case Intrinsic::x86_sse2_ucomilt_sd:
+  case Intrinsic::x86_sse2_ucomile_sd:
+  case Intrinsic::x86_sse2_ucomigt_sd:
+  case Intrinsic::x86_sse2_ucomige_sd:
+  case Intrinsic::x86_sse2_ucomineq_sd: {
+    unsigned Opc = 0;
+    ISD::CondCode CC = ISD::SETCC_INVALID;
+    switch (IntNo) {
+    default: break;
+    case Intrinsic::x86_sse_comieq_ss:
+    case Intrinsic::x86_sse2_comieq_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETEQ;
+      break;
+    case Intrinsic::x86_sse_comilt_ss:
+    case Intrinsic::x86_sse2_comilt_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETLT;
+      break;
+    case Intrinsic::x86_sse_comile_ss:
+    case Intrinsic::x86_sse2_comile_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETLE;
+      break;
+    case Intrinsic::x86_sse_comigt_ss:
+    case Intrinsic::x86_sse2_comigt_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETGT;
+      break;
+    case Intrinsic::x86_sse_comige_ss:
+    case Intrinsic::x86_sse2_comige_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETGE;
+      break;
+    case Intrinsic::x86_sse_comineq_ss:
+    case Intrinsic::x86_sse2_comineq_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETNE;
+      break;
+    case Intrinsic::x86_sse_ucomieq_ss:
+    case Intrinsic::x86_sse2_ucomieq_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETEQ;
+      break;
+    case Intrinsic::x86_sse_ucomilt_ss:
+    case Intrinsic::x86_sse2_ucomilt_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETLT;
+      break;
+    case Intrinsic::x86_sse_ucomile_ss:
+    case Intrinsic::x86_sse2_ucomile_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETLE;
+      break;
+    case Intrinsic::x86_sse_ucomigt_ss:
+    case Intrinsic::x86_sse2_ucomigt_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETGT;
+      break;
+    case Intrinsic::x86_sse_ucomige_ss:
+    case Intrinsic::x86_sse2_ucomige_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETGE;
+      break;
+    case Intrinsic::x86_sse_ucomineq_ss:
+    case Intrinsic::x86_sse2_ucomineq_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETNE;
+      break;
+    }
+
+    unsigned X86CC;
+    SDOperand LHS = Op.getOperand(1);
+    SDOperand RHS = Op.getOperand(2);
+    translateX86CC(CC, true, X86CC, LHS, RHS, DAG);
+
+    const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag);
+    SDOperand Ops1[] = { DAG.getEntryNode(), LHS, RHS };
+    SDOperand Cond = DAG.getNode(Opc, VTs, 2, Ops1, 3);
+    VTs = DAG.getNodeValueTypes(MVT::i8, MVT::Flag);
+    SDOperand Ops2[] = { DAG.getConstant(X86CC, MVT::i8), Cond };
+    SDOperand SetCC = DAG.getNode(X86ISD::SETCC, VTs, 2, Ops2, 2);
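+    // The comi/ucomi intrinsics return i32, so widen the i8 SETCC result.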
+    return DAG.getNode(ISD::ANY_EXTEND, MVT::i32, SetCC);
+  }
+  }
+}
+
+SDOperand X86TargetLowering::LowerRETURNADDR(SDOperand Op, SelectionDAG &DAG) {
+  // Depths > 0 not supported yet!
+  if (cast<ConstantSDNode>(Op.getOperand(0))->getValue() > 0)
+    return SDOperand();
+  
+  // Just load the return address
+  SDOperand RetAddrFI = getReturnAddressFrameIndex(DAG);
+  return DAG.getLoad(getPointerTy(), DAG.getEntryNode(), RetAddrFI, NULL, 0);
+}
+
+SDOperand X86TargetLowering::LowerFRAMEADDR(SDOperand Op, SelectionDAG &DAG) {
+  // Depths > 0 not supported yet!
+  if (cast<ConstantSDNode>(Op.getOperand(0))->getValue() > 0)
+    return SDOperand();
+    
+  SDOperand RetAddrFI = getReturnAddressFrameIndex(DAG);
+  return DAG.getNode(ISD::SUB, getPointerTy(), RetAddrFI, 
+                     DAG.getConstant(4, getPointerTy()));
+}
+
+SDOperand X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDOperand Op,
+                                                       SelectionDAG &DAG) {
+  // Not yet supported on x86-64.
+  if (Subtarget->is64Bit())
+    return SDOperand();
+  
+  return DAG.getConstant(8, getPointerTy());
+}
+
+SDOperand X86TargetLowering::LowerEH_RETURN(SDOperand Op, SelectionDAG &DAG) {
+  assert(!Subtarget->is64Bit() &&
+         "Lowering of eh_return builtin is not supported yet on x86-64");
+    
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDOperand Chain     = Op.getOperand(0);
+  SDOperand Offset    = Op.getOperand(1);
+  SDOperand Handler   = Op.getOperand(2);
+
+  SDOperand Frame = DAG.getRegister(RegInfo->getFrameRegister(MF),
+                                    getPointerTy());
+
+  SDOperand StoreAddr = DAG.getNode(ISD::SUB, getPointerTy(), Frame,
+                                    DAG.getConstant(-4UL, getPointerTy()));
+  StoreAddr = DAG.getNode(ISD::ADD, getPointerTy(), StoreAddr, Offset);
+  Chain = DAG.getStore(Chain, Handler, StoreAddr, NULL, 0);
+  Chain = DAG.getCopyToReg(Chain, X86::ECX, StoreAddr);
+  MF.addLiveOut(X86::ECX);
+
+  return DAG.getNode(X86ISD::EH_RETURN, MVT::Other,
+                     Chain, DAG.getRegister(X86::ECX, getPointerTy()));
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDOperand X86TargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
+  switch (Op.getOpcode()) {
+  default: assert(0 && "Should not custom lower this!");
+  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
+  case ISD::SHL_PARTS:
+  case ISD::SRA_PARTS:
+  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
+  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
+  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
+  case ISD::FABS:               return LowerFABS(Op, DAG);
+  case ISD::FNEG:               return LowerFNEG(Op, DAG);
+  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
+  case ISD::SETCC:              return LowerSETCC(Op, DAG, DAG.getEntryNode());
+  case ISD::SELECT:             return LowerSELECT(Op, DAG);
+  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
+  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
+  case ISD::CALL:               return LowerCALL(Op, DAG);
+  case ISD::RET:                return LowerRET(Op, DAG);
+  case ISD::FORMAL_ARGUMENTS:   return LowerFORMAL_ARGUMENTS(Op, DAG);
+  case ISD::MEMSET:             return LowerMEMSET(Op, DAG);
+  case ISD::MEMCPY:             return LowerMEMCPY(Op, DAG);
+  case ISD::READCYCLECOUNTER:   return LowerREADCYCLCECOUNTER(Op, DAG);
+  case ISD::VASTART:            return LowerVASTART(Op, DAG);
+  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+  case ISD::FRAME_TO_ARGS_OFFSET:
+                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
+  }
+  return SDOperand();
+}
+
+const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return NULL;
+  case X86ISD::SHLD:               return "X86ISD::SHLD";
+  case X86ISD::SHRD:               return "X86ISD::SHRD";
+  case X86ISD::FAND:               return "X86ISD::FAND";
+  case X86ISD::FOR:                return "X86ISD::FOR";
+  case X86ISD::FXOR:               return "X86ISD::FXOR";
+  case X86ISD::FSRL:               return "X86ISD::FSRL";
+  case X86ISD::FILD:               return "X86ISD::FILD";
+  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
+  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
+  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
+  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
+  case X86ISD::FLD:                return "X86ISD::FLD";
+  case X86ISD::FST:                return "X86ISD::FST";
+  case X86ISD::FP_GET_RESULT:      return "X86ISD::FP_GET_RESULT";
+  case X86ISD::FP_SET_RESULT:      return "X86ISD::FP_SET_RESULT";
+  case X86ISD::CALL:               return "X86ISD::CALL";
+  case X86ISD::TAILCALL:           return "X86ISD::TAILCALL";
+  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
+  case X86ISD::CMP:                return "X86ISD::CMP";
+  case X86ISD::COMI:               return "X86ISD::COMI";
+  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
+  case X86ISD::SETCC:              return "X86ISD::SETCC";
+  case X86ISD::CMOV:               return "X86ISD::CMOV";
+  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
+  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
+  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
+  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
+  case X86ISD::LOAD_PACK:          return "X86ISD::LOAD_PACK";
+  case X86ISD::LOAD_UA:            return "X86ISD::LOAD_UA";
+  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
+  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
+  case X86ISD::S2VEC:              return "X86ISD::S2VEC";
+  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
+  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
+  case X86ISD::FMAX:               return "X86ISD::FMAX";
+  case X86ISD::FMIN:               return "X86ISD::FMIN";
+  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
+  case X86ISD::FRCP:               return "X86ISD::FRCP";
+  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
+  case X86ISD::THREAD_POINTER:     return "X86ISD::THREAD_POINTER";
+  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
+  }
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 
+                                              const Type *Ty) const {
+  // X86 supports extremely general addressing modes.
+  
+  // X86 allows a sign-extended 32-bit immediate field as a displacement.
+  if (AM.BaseOffs <= -(1LL << 32) || AM.BaseOffs >= (1LL << 32)-1)
+    return false;
+  
+  if (AM.BaseGV) {
+    // X86-64 only supports addr of globals in small code model.
+    if (Subtarget->is64Bit() &&
+        getTargetMachine().getCodeModel() != CodeModel::Small)
+      return false;
+    
+    // We can only fold this if referencing the global does not require an
+    // extra load.
+    if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false))
+      return false;
+  }
+  
+  switch (AM.Scale) {
+  case 0:
+  case 1:
+  case 2:
+  case 4:
+  case 8:
+    // These scales always work.
+    break;
+  case 3:
+  case 5:
+  case 9:
+    // These scales are formed with basereg+scalereg.  Only accept if there is
+    // no basereg yet.
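+    // e.g. a scale of 5 is matched as index + index*4, so the extra copy of
+    // the index occupies the base-register slot.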
+    if (AM.HasBaseReg)
+      return false;
+    break;
+  default:  // Other stuff never works.
+    return false;
+  }
+  
+  return true;
+}
+
+
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+X86TargetLowering::isShuffleMaskLegal(SDOperand Mask, MVT::ValueType VT) const {
+  // Only do shuffles on 128-bit vector types for now.
+  if (MVT::getSizeInBits(VT) == 64) return false;
+  return (Mask.Val->getNumOperands() <= 4 ||
+          isIdentityMask(Mask.Val) ||
+          isIdentityMask(Mask.Val, true) ||
+          isSplatMask(Mask.Val)  ||
+          isPSHUFHW_PSHUFLWMask(Mask.Val) ||
+          X86::isUNPCKLMask(Mask.Val) ||
+          X86::isUNPCKHMask(Mask.Val) ||
+          X86::isUNPCKL_v_undef_Mask(Mask.Val) ||
+          X86::isUNPCKH_v_undef_Mask(Mask.Val));
+}
+
+bool X86TargetLowering::isVectorClearMaskLegal(std::vector<SDOperand> &BVOps,
+                                               MVT::ValueType EVT,
+                                               SelectionDAG &DAG) const {
+  unsigned NumElts = BVOps.size();
+  // Only do shuffles on 128-bit vector types for now.
+  if (MVT::getSizeInBits(EVT) * NumElts == 64) return false;
+  if (NumElts == 2) return true;
+  if (NumElts == 4) {
+    return (isMOVLMask(&BVOps[0], 4)  ||
+            isCommutedMOVL(&BVOps[0], 4, true) ||
+            isSHUFPMask(&BVOps[0], 4) || 
+            isCommutedSHUFP(&BVOps[0], 4));
+  }
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+X86TargetLowering::InsertAtEndOfBasicBlock(MachineInstr *MI,
+                                           MachineBasicBlock *BB) {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  switch (MI->getOpcode()) {
+  default: assert(false && "Unexpected instr type to insert");
+  case X86::CMOV_FR32:
+  case X86::CMOV_FR64:
+  case X86::CMOV_V4F32:
+  case X86::CMOV_V2F64:
+  case X86::CMOV_V2I64: {
+    // To "insert" a SELECT_CC instruction, we actually have to insert the
+    // diamond control-flow pattern.  The incoming instruction knows the
+    // destination vreg to set, the condition code register to branch on, the
+    // true/false values to select between, and a branch opcode to use.
+    const BasicBlock *LLVM_BB = BB->getBasicBlock();
+    ilist<MachineBasicBlock>::iterator It = BB;
+    ++It;
+
+    //  thisMBB:
+    //  ...
+    //   TrueVal = ...
+    //   cmpTY ccX, r1, r2
+    //   bCC copy1MBB
+    //   fallthrough --> copy0MBB
+    MachineBasicBlock *thisMBB = BB;
+    MachineBasicBlock *copy0MBB = new MachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *sinkMBB = new MachineBasicBlock(LLVM_BB);
+    unsigned Opc =
+      X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+    BuildMI(BB, TII->get(Opc)).addMBB(sinkMBB);
+    MachineFunction *F = BB->getParent();
+    F->getBasicBlockList().insert(It, copy0MBB);
+    F->getBasicBlockList().insert(It, sinkMBB);
+    // Update machine-CFG edges by first adding all successors of the current
+    // block to the new block which will contain the Phi node for the select.
+    for(MachineBasicBlock::succ_iterator i = BB->succ_begin(),
+        e = BB->succ_end(); i != e; ++i)
+      sinkMBB->addSuccessor(*i);
+    // Next, remove all successors of the current block, and add the true
+    // and fallthrough blocks as its successors.
+    while(!BB->succ_empty())
+      BB->removeSuccessor(BB->succ_begin());
+    BB->addSuccessor(copy0MBB);
+    BB->addSuccessor(sinkMBB);
+
+    //  copy0MBB:
+    //   %FalseValue = ...
+    //   # fallthrough to sinkMBB
+    BB = copy0MBB;
+
+    // Update machine-CFG edges
+    BB->addSuccessor(sinkMBB);
+
+    //  sinkMBB:
+    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+    //  ...
+    BB = sinkMBB;
+    BuildMI(BB, TII->get(X86::PHI), MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+    delete MI;   // The pseudo instruction is gone now.
+    return BB;
+  }
+
+  case X86::FP32_TO_INT16_IN_MEM:
+  case X86::FP32_TO_INT32_IN_MEM:
+  case X86::FP32_TO_INT64_IN_MEM:
+  case X86::FP64_TO_INT16_IN_MEM:
+  case X86::FP64_TO_INT32_IN_MEM:
+  case X86::FP64_TO_INT64_IN_MEM: {
+    // Change the floating point control register to use "round towards zero"
+    // mode when truncating to an integer value.
+    MachineFunction *F = BB->getParent();
+    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2);
+    addFrameReference(BuildMI(BB, TII->get(X86::FNSTCW16m)), CWFrameIdx);
+
+    // Load the old value of the control word...
+    unsigned OldCW =
+      F->getSSARegMap()->createVirtualRegister(X86::GR16RegisterClass);
+    addFrameReference(BuildMI(BB, TII->get(X86::MOV16rm), OldCW), CWFrameIdx);
+
+    // Overwrite the in-memory control word with one that rounds toward zero...
+    addFrameReference(BuildMI(BB, TII->get(X86::MOV16mi)), CWFrameIdx)
+      .addImm(0xC7F);
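+    // 0xC7F is the x87 default control word (0x37F) with the rounding-control
+    // field (bits 10-11) set to 11b, i.e. round toward zero.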
+
+    // Reload the modified control word now...
+    addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+    // Restore the memory image of control word to original value
+    addFrameReference(BuildMI(BB, TII->get(X86::MOV16mr)), CWFrameIdx)
+      .addReg(OldCW);
+
+    // Get the X86 opcode to use.
+    unsigned Opc;
+    switch (MI->getOpcode()) {
+    default: assert(0 && "illegal opcode!");
+    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
+    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
+    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
+    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
+    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
+    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
+    }
+
+    X86AddressMode AM;
+    MachineOperand &Op = MI->getOperand(0);
+    if (Op.isRegister()) {
+      AM.BaseType = X86AddressMode::RegBase;
+      AM.Base.Reg = Op.getReg();
+    } else {
+      AM.BaseType = X86AddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = Op.getFrameIndex();
+    }
+    Op = MI->getOperand(1);
+    if (Op.isImmediate())
+      AM.Scale = Op.getImm();
+    Op = MI->getOperand(2);
+    if (Op.isImmediate())
+      AM.IndexReg = Op.getImm();
+    Op = MI->getOperand(3);
+    if (Op.isGlobalAddress()) {
+      AM.GV = Op.getGlobal();
+    } else {
+      AM.Disp = Op.getImm();
+    }
+    addFullAddress(BuildMI(BB, TII->get(Opc)), AM)
+                      .addReg(MI->getOperand(4).getReg());
+
+    // Reload the original control word now.
+    addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+    delete MI;   // The pseudo instruction is gone now.
+    return BB;
+  }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+void X86TargetLowering::computeMaskedBitsForTargetNode(const SDOperand Op,
+                                                       uint64_t Mask,
+                                                       uint64_t &KnownZero,
+                                                       uint64_t &KnownOne,
+                                                       const SelectionDAG &DAG,
+                                                       unsigned Depth) const {
+  unsigned Opc = Op.getOpcode();
+  assert((Opc >= ISD::BUILTIN_OP_END ||
+          Opc == ISD::INTRINSIC_WO_CHAIN ||
+          Opc == ISD::INTRINSIC_W_CHAIN ||
+          Opc == ISD::INTRINSIC_VOID) &&
+         "Should use MaskedValueIsZero if you don't know whether Op"
+         " is a target node!");
+
+  KnownZero = KnownOne = 0;   // Don't know anything.
+  switch (Opc) {
+  default: break;
+  case X86ISD::SETCC:
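+    // SETCC produces 0 or 1, so every bit above the low bit is known zero.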
+    KnownZero |= (MVT::getIntVTBitMask(Op.getValueType()) ^ 1ULL);
+    break;
+  }
+}
+
+/// getShuffleScalarElt - Returns the scalar element that will make up the ith
+/// element of the result of the vector shuffle.
+static SDOperand getShuffleScalarElt(SDNode *N, unsigned i, SelectionDAG &DAG) {
+  MVT::ValueType VT = N->getValueType(0);
+  SDOperand PermMask = N->getOperand(2);
+  unsigned NumElems = PermMask.getNumOperands();
+  SDOperand V = (i < NumElems) ? N->getOperand(0) : N->getOperand(1);
+  i %= NumElems;
+  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    return (i == 0)
+      ? V.getOperand(0) : DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
+  } else if (V.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    SDOperand Idx = PermMask.getOperand(i);
+    if (Idx.getOpcode() == ISD::UNDEF)
+      return DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
+    return getShuffleScalarElt(V.Val,cast<ConstantSDNode>(Idx)->getValue(),DAG);
+  }
+  return SDOperand();
+}
+
+/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
+/// node is a GlobalAddress + an offset.
+static bool isGAPlusOffset(SDNode *N, GlobalValue* &GA, int64_t &Offset) {
+  unsigned Opc = N->getOpcode();
+  if (Opc == X86ISD::Wrapper) {
+    if (dyn_cast<GlobalAddressSDNode>(N->getOperand(0))) {
+      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
+      return true;
+    }
+  } else if (Opc == ISD::ADD) {
+    SDOperand N1 = N->getOperand(0);
+    SDOperand N2 = N->getOperand(1);
+    if (isGAPlusOffset(N1.Val, GA, Offset)) {
+      ConstantSDNode *V = dyn_cast<ConstantSDNode>(N2);
+      if (V) {
+        Offset += V->getSignExtended();
+        return true;
+      }
+    } else if (isGAPlusOffset(N2.Val, GA, Offset)) {
+      ConstantSDNode *V = dyn_cast<ConstantSDNode>(N1);
+      if (V) {
+        Offset += V->getSignExtended();
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// isConsecutiveLoad - Returns true if N is loading from an address of Base
+/// + Dist * Size.
+static bool isConsecutiveLoad(SDNode *N, SDNode *Base, int Dist, int Size,
+                              MachineFrameInfo *MFI) {
+  if (N->getOperand(0).Val != Base->getOperand(0).Val)
+    return false;
+
+  SDOperand Loc = N->getOperand(1);
+  SDOperand BaseLoc = Base->getOperand(1);
+  if (Loc.getOpcode() == ISD::FrameIndex) {
+    if (BaseLoc.getOpcode() != ISD::FrameIndex)
+      return false;
+    int FI  = cast<FrameIndexSDNode>(Loc)->getIndex();
+    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
+    int FS  = MFI->getObjectSize(FI);
+    int BFS = MFI->getObjectSize(BFI);
+    if (FS != BFS || FS != Size) return false;
+    return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Size);
+  } else {
+    GlobalValue *GV1 = NULL;
+    GlobalValue *GV2 = NULL;
+    int64_t Offset1 = 0;
+    int64_t Offset2 = 0;
+    bool isGA1 = isGAPlusOffset(Loc.Val, GV1, Offset1);
+    bool isGA2 = isGAPlusOffset(BaseLoc.Val, GV2, Offset2);
+    if (isGA1 && isGA2 && GV1 == GV2)
+      return Offset1 == (Offset2 + Dist*Size);
+  }
+
+  return false;
+}
+
+static bool isBaseAlignment16(SDNode *Base, MachineFrameInfo *MFI,
+                              const X86Subtarget *Subtarget) {
+  GlobalValue *GV;
+  int64_t Offset;
+  if (isGAPlusOffset(Base, GV, Offset))
+    return (GV->getAlignment() >= 16 && (Offset % 16) == 0);
+  else {
+    assert(Base->getOpcode() == ISD::FrameIndex && "Unexpected base node!");
+    int BFI = cast<FrameIndexSDNode>(Base)->getIndex();
+    if (BFI < 0)
+      // Fixed objects do not specify an alignment, but their offsets are known.
+      return ((Subtarget->getStackAlignment() % 16) == 0 &&
+              (MFI->getObjectOffset(BFI) % 16) == 0);
+    else
+      return MFI->getObjectAlignment(BFI) >= 16;
+  }
+  return false;
+}
+
+
+/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
+/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
+/// if the load addresses are consecutive, non-overlapping, and in the right
+/// order.
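+/// e.g. a v4f32 built from four consecutive f32 loads becomes one aligned
+/// 16-byte load when the base is 16-byte aligned, or an unaligned
+/// X86ISD::LOAD_UA (movups) load otherwise.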
+static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
+                                       const X86Subtarget *Subtarget) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MVT::ValueType VT = N->getValueType(0);
+  MVT::ValueType EVT = MVT::getVectorElementType(VT);
+  SDOperand PermMask = N->getOperand(2);
+  int NumElems = (int)PermMask.getNumOperands();
+  SDNode *Base = NULL;
+  for (int i = 0; i < NumElems; ++i) {
+    SDOperand Idx = PermMask.getOperand(i);
+    if (Idx.getOpcode() == ISD::UNDEF) {
+      if (!Base) return SDOperand();
+    } else {
+      SDOperand Arg =
+        getShuffleScalarElt(N, cast<ConstantSDNode>(Idx)->getValue(), DAG);
+      if (!Arg.Val || !ISD::isNON_EXTLoad(Arg.Val))
+        return SDOperand();
+      if (!Base)
+        Base = Arg.Val;
+      else if (!isConsecutiveLoad(Arg.Val, Base,
+                                  i, MVT::getSizeInBits(EVT)/8,MFI))
+        return SDOperand();
+    }
+  }
+
+  bool isAlign16 = isBaseAlignment16(Base->getOperand(1).Val, MFI, Subtarget);
+  if (isAlign16) {
+    LoadSDNode *LD = cast<LoadSDNode>(Base);
+    return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
+                       LD->getSrcValueOffset());
+  } else {
+    // Just use movups, it's shorter.
+    SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
+    SmallVector<SDOperand, 3> Ops;
+    Ops.push_back(Base->getOperand(0));
+    Ops.push_back(Base->getOperand(1));
+    Ops.push_back(Base->getOperand(2));
+    return DAG.getNode(ISD::BIT_CONVERT, VT,
+                       DAG.getNode(X86ISD::LOAD_UA, Tys, &Ops[0], Ops.size()));
+  }
+}
+
+/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
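+/// e.g. (select (setcc X, Y, olt), X, Y) with scalar SSE types becomes
+/// X86ISD::FMIN, and the mirrored patterns become X86ISD::FMAX.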
+static SDOperand PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDOperand Cond = N->getOperand(0);
+
+  // If we have SSE[12] support, try to form min/max nodes.
+  if (Subtarget->hasSSE2() &&
+      (N->getValueType(0) == MVT::f32 || N->getValueType(0) == MVT::f64)) {
+    if (Cond.getOpcode() == ISD::SETCC) {
+      // Get the LHS/RHS of the select.
+      SDOperand LHS = N->getOperand(1);
+      SDOperand RHS = N->getOperand(2);
+      ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+      unsigned Opcode = 0;
+      if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
+        switch (CC) {
+        default: break;
+        case ISD::SETOLE: // (X <= Y) ? X : Y -> min
+        case ISD::SETULE:
+        case ISD::SETLE:
+          if (!UnsafeFPMath) break;
+          // FALL THROUGH.
+        case ISD::SETOLT:  // (X olt/lt Y) ? X : Y -> min
+        case ISD::SETLT:
+          Opcode = X86ISD::FMIN;
+          break;
+
+        case ISD::SETOGT: // (X > Y) ? X : Y -> max
+        case ISD::SETUGT:
+        case ISD::SETGT:
+          if (!UnsafeFPMath) break;
+          // FALL THROUGH.
+        case ISD::SETUGE:  // (X uge/ge Y) ? X : Y -> max
+        case ISD::SETGE:
+          Opcode = X86ISD::FMAX;
+          break;
+        }
+      } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
+        switch (CC) {
+        default: break;
+        case ISD::SETOGT: // (X > Y) ? Y : X -> min
+        case ISD::SETUGT:
+        case ISD::SETGT:
+          if (!UnsafeFPMath) break;
+          // FALL THROUGH.
+        case ISD::SETUGE:  // (X uge/ge Y) ? Y : X -> min
+        case ISD::SETGE:
+          Opcode = X86ISD::FMIN;
+          break;
+
+        case ISD::SETOLE:   // (X <= Y) ? Y : X -> max
+        case ISD::SETULE:
+        case ISD::SETLE:
+          if (!UnsafeFPMath) break;
+          // FALL THROUGH.
+        case ISD::SETOLT:   // (X olt/lt Y) ? Y : X -> max
+        case ISD::SETLT:
+          Opcode = X86ISD::FMAX;
+          break;
+        }
+      }
+
+      if (Opcode)
+        return DAG.getNode(Opcode, N->getValueType(0), LHS, RHS);
+    }
+
+  }
+
+  return SDOperand();
+}
+
+
+SDOperand X86TargetLowering::PerformDAGCombine(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::VECTOR_SHUFFLE:
+    return PerformShuffleCombine(N, DAG, Subtarget);
+  case ISD::SELECT:
+    return PerformSELECTCombine(N, DAG, Subtarget);
+  }
+
+  return SDOperand();
+}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+X86TargetLowering::ConstraintType
+X86TargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'A':
+    case 'r':
+    case 'R':
+    case 'l':
+    case 'q':
+    case 'Q':
+    case 'x':
+    case 'Y':
+      return C_RegisterClass;
+    default:
+      break;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// isOperandValidForConstraint - Return the specified operand (possibly
+/// modified) if the specified SDOperand is valid for the specified target
+/// constraint letter, otherwise return null.
+SDOperand X86TargetLowering::
+isOperandValidForConstraint(SDOperand Op, char Constraint, SelectionDAG &DAG) {
+  switch (Constraint) {
+  default: break;
+  case 'I':
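+    // 'I' accepts a constant in the range [0, 31], e.g. 32-bit shift counts.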
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getValue() <= 31)
+        return DAG.getTargetConstant(C->getValue(), Op.getValueType());
+    }
+    return SDOperand(0,0);
+  case 'N':
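+    // 'N' accepts a constant in the range [0, 255], e.g. 8-bit port numbers
+    // for in/out.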
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getValue() <= 255)
+        return DAG.getTargetConstant(C->getValue(), Op.getValueType());
+    }
+    return SDOperand(0,0);
+  case 'i': {
+    // Literal immediates are always ok.
+    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op))
+      return DAG.getTargetConstant(CST->getValue(), Op.getValueType());
+
+    // If we are in non-pic codegen mode, we allow the address of a global (with
+    // an optional displacement) to be used with 'i'.
+    GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op);
+    int64_t Offset = 0;
+    
+    // Match (GA), (GA+C) or (C+GA)
+    if (GA) {
+      Offset = GA->getOffset();
+    } else if (Op.getOpcode() == ISD::ADD) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+      GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0));
+      if (C && GA) {
+        Offset = GA->getOffset()+C->getValue();
+      } else {
+        // Also accept the commuted (C + GA) form.
+        C = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+        GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(1));
+        if (C && GA)
+          Offset = GA->getOffset()+C->getValue();
+        else
+          C = 0, GA = 0;
+      }
+    }
+    
+    if (GA) {
+      // If addressing this global requires a load (e.g. in PIC mode), we can't
+      // match.
+      if (Subtarget->GVRequiresExtraLoad(GA->getGlobal(), getTargetMachine(),
+                                         false))
+        return SDOperand(0, 0);
+
+      Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
+                                      Offset);
+      return Op;
+    }
+
+    // Otherwise, not valid for this mode.
+    return SDOperand(0, 0);
+  }
+  }
+  return TargetLowering::isOperandValidForConstraint(Op, Constraint, DAG);
+}
+
+std::vector<unsigned> X86TargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                  MVT::ValueType VT) const {
+  if (Constraint.size() == 1) {
+    // FIXME: not handling fp-stack yet!
+    switch (Constraint[0]) {      // GCC X86 Constraint Letters
+    default: break;  // Unknown constraint letter
+    case 'A':   // EAX/EDX
+      if (VT == MVT::i32 || VT == MVT::i64)
+        return make_vector<unsigned>(X86::EAX, X86::EDX, 0);
+      break;
+    case 'q':   // Q_REGS (GENERAL_REGS in 64-bit mode)
+    case 'Q':   // Q_REGS
+      if (VT == MVT::i32)
+        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
+      else if (VT == MVT::i16)
+        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
+      else if (VT == MVT::i8)
+        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
+      break;
+    }
+  }
+
+  return std::vector<unsigned>();
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                MVT::ValueType VT) const {
+  // First, see if this is a constraint that directly corresponds to an LLVM
+  // register class.
+  if (Constraint.size() == 1) {
+    // GCC Constraint Letters
+    switch (Constraint[0]) {
+    default: break;
+    case 'r':   // GENERAL_REGS
+    case 'R':   // LEGACY_REGS
+    case 'l':   // INDEX_REGS
+      if (VT == MVT::i64 && Subtarget->is64Bit())
+        return std::make_pair(0U, X86::GR64RegisterClass);
+      if (VT == MVT::i32)
+        return std::make_pair(0U, X86::GR32RegisterClass);
+      else if (VT == MVT::i16)
+        return std::make_pair(0U, X86::GR16RegisterClass);
+      else if (VT == MVT::i8)
+        return std::make_pair(0U, X86::GR8RegisterClass);
+      break;
+    case 'y':   // MMX_REGS if MMX allowed.
+      if (!Subtarget->hasMMX()) break;
+      return std::make_pair(0U, X86::VR64RegisterClass);
+    case 'Y':   // SSE_REGS if SSE2 allowed
+      if (!Subtarget->hasSSE2()) break;
+      // FALL THROUGH.
+    case 'x':   // SSE_REGS if SSE1 allowed
+      if (!Subtarget->hasSSE1()) break;
+      
+      switch (VT) {
+      default: break;
+      // Scalar SSE types.
+      case MVT::f32:
+      case MVT::i32:
+        return std::make_pair(0U, X86::FR32RegisterClass);
+      case MVT::f64:
+      case MVT::i64:
+        return std::make_pair(0U, X86::FR64RegisterClass);
+      // Vector types.
+      case MVT::v16i8:
+      case MVT::v8i16:
+      case MVT::v4i32:
+      case MVT::v2i64:
+      case MVT::v4f32:
+      case MVT::v2f64:
+        return std::make_pair(0U, X86::VR128RegisterClass);
+      }
+      break;
+    }
+  }
+  
+  // Use the default implementation in TargetLowering to convert the register
+  // constraint into a member of a register class.
+  std::pair<unsigned, const TargetRegisterClass*> Res;
+  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+  // Not found as a standard register?
+  if (Res.second == 0) {
+    // GCC calls "st(0)" just plain "st".
+    if (StringsEqualNoCase("{st}", Constraint)) {
+      Res.first = X86::ST0;
+      Res.second = X86::RSTRegisterClass;
+    }
+
+    return Res;
+  }
+
+  // Otherwise, check to see if this is a register class of the wrong value
+  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
+  // turn into {ax},{dx}.
+  if (Res.second->hasType(VT))
+    return Res;   // Correct type already, nothing to do.
+
+  // All of the single-register GCC register classes map their values onto
+  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
+  // really want an 8-bit or 32-bit register, map to the appropriate register
+  // class and return the appropriate register.
+  if (Res.second != X86::GR16RegisterClass)
+    return Res;
+
+  if (VT == MVT::i8) {
+    unsigned DestReg = 0;
+    switch (Res.first) {
+    default: break;
+    case X86::AX: DestReg = X86::AL; break;
+    case X86::DX: DestReg = X86::DL; break;
+    case X86::CX: DestReg = X86::CL; break;
+    case X86::BX: DestReg = X86::BL; break;
+    }
+    if (DestReg) {
+      Res.first = DestReg;
+      Res.second = X86::GR8RegisterClass;
+    }
+  } else if (VT == MVT::i32) {
+    unsigned DestReg = 0;
+    switch (Res.first) {
+    default: break;
+    case X86::AX: DestReg = X86::EAX; break;
+    case X86::DX: DestReg = X86::EDX; break;
+    case X86::CX: DestReg = X86::ECX; break;
+    case X86::BX: DestReg = X86::EBX; break;
+    case X86::SI: DestReg = X86::ESI; break;
+    case X86::DI: DestReg = X86::EDI; break;
+    case X86::BP: DestReg = X86::EBP; break;
+    case X86::SP: DestReg = X86::ESP; break;
+    }
+    if (DestReg) {
+      Res.first = DestReg;
+      Res.second = X86::GR32RegisterClass;
+    }
+  } else if (VT == MVT::i64) {
+    unsigned DestReg = 0;
+    switch (Res.first) {
+    default: break;
+    case X86::AX: DestReg = X86::RAX; break;
+    case X86::DX: DestReg = X86::RDX; break;
+    case X86::CX: DestReg = X86::RCX; break;
+    case X86::BX: DestReg = X86::RBX; break;
+    case X86::SI: DestReg = X86::RSI; break;
+    case X86::DI: DestReg = X86::RDI; break;
+    case X86::BP: DestReg = X86::RBP; break;
+    case X86::SP: DestReg = X86::RSP; break;
+    }
+    if (DestReg) {
+      Res.first = DestReg;
+      Res.second = X86::GR64RegisterClass;
+    }
+  }
+
+  return Res;
+}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
new file mode 100644
index 0000000..07a96d3
--- /dev/null
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -0,0 +1,437 @@
+//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ISELLOWERING_H
+#define X86ISELLOWERING_H
+
+#include "X86Subtarget.h"
+#include "X86RegisterInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+
+namespace llvm {
+  namespace X86ISD {
+    // X86 Specific DAG Nodes
+    enum NodeType {
+      // Start the numbering where the builtin ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END+X86::INSTRUCTION_LIST_END,
+
+      /// SHLD, SHRD - Double shift instructions. These correspond to
+      /// X86::SHLDxx and X86::SHRDxx instructions.
+      SHLD,
+      SHRD,
+
+      /// FAND - Bitwise logical AND of floating point values. This corresponds
+      /// to X86::ANDPS or X86::ANDPD.
+      FAND,
+
+      /// FOR - Bitwise logical OR of floating point values. This corresponds
+      /// to X86::ORPS or X86::ORPD.
+      FOR,
+
+      /// FXOR - Bitwise logical XOR of floating point values. This corresponds
+      /// to X86::XORPS or X86::XORPD.
+      FXOR,
+
+      /// FSRL - Bitwise logical right shift of floating point values. This
+      /// corresponds to X86::PSRLDQ.
+      FSRL,
+
+      /// FILD, FILD_FLAG - This instruction implements SINT_TO_FP with the
+      /// integer source in memory and FP reg result.  This corresponds to the
+      /// X86::FILD*m instructions. It has three inputs (token chain, address,
+      /// and source type) and two outputs (FP value and token chain). FILD_FLAG
+      /// also produces a flag.
+      FILD,
+      FILD_FLAG,
+
+      /// FP_TO_INT*_IN_MEM - This instruction implements FP_TO_SINT with the
+      /// integer destination in memory and a FP reg source.  This corresponds
+      /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+      /// has two inputs (token chain and address) and two outputs (int value
+      /// and token chain).
+      FP_TO_INT16_IN_MEM,
+      FP_TO_INT32_IN_MEM,
+      FP_TO_INT64_IN_MEM,
+
+      /// FLD - This instruction implements an extending load to FP stack slots.
+      /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+      /// operand, ptr to load from, and a ValueType node indicating the type
+      /// to load to.
+      FLD,
+
+      /// FST - This instruction implements a truncating store to FP stack
+      /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+      /// chain operand, value to store, address, and a ValueType to store it
+      /// as.
+      FST,
+
+      /// FP_GET_RESULT - This corresponds to FpGETRESULT pseudo instruction
+      /// which copies from ST(0) to the destination. It takes a chain and
+      /// writes a RFP result and a chain.
+      FP_GET_RESULT,
+
+      /// FP_SET_RESULT - This corresponds to FpSETRESULT pseudo instruction
+      /// which copies the source operand to ST(0). It takes a chain+value and
+      /// returns a chain and a flag.
+      FP_SET_RESULT,
+
+      /// CALL/TAILCALL - These operations represent an abstract X86 call
+      /// instruction, which includes a bunch of information.  In particular the
+      /// operands of these node are:
+      ///
+      ///     #0 - The incoming token chain
+      ///     #1 - The callee
+      ///     #2 - The number of arg bytes the caller pushes on the stack.
+      ///     #3 - The number of arg bytes the callee pops off the stack.
+      ///     #4 - The value to pass in AL/AX/EAX (optional)
+      ///     #5 - The value to pass in DL/DX/EDX (optional)
+      ///
+      /// The result values of these nodes are:
+      ///
+      ///     #0 - The outgoing token chain
+      ///     #1 - The first register result value (optional)
+      ///     #2 - The second register result value (optional)
+      ///
+      /// The CALL vs TAILCALL distinction boils down to whether the callee is
+      /// known not to modify the caller's stack frame, as is standard with
+      /// LLVM.
+      CALL,
+      TAILCALL,
+      
+      /// RDTSC_DAG - This operation implements the lowering for 
+      /// readcyclecounter
+      RDTSC_DAG,
+
+      /// X86 compare and logical compare instructions.
+      CMP, TEST, COMI, UCOMI,
+
+      /// X86 SetCC. Operand 1 is condition code, and operand 2 is the flag
+      /// operand produced by a CMP instruction.
+      SETCC,
+
+      /// X86 conditional moves. Operand 1 and operand 2 are the two values
+      /// to select from (operand 1 is a R/W operand). Operand 3 is the
+      /// condition code, and operand 4 is the flag operand produced by a CMP
+      /// or TEST instruction. It also writes a flag result.
+      CMOV,
+
+      /// X86 conditional branches. Operand 1 is the chain operand, operand 2
+      /// is the block to branch if condition is true, operand 3 is the
+      /// condition code, and operand 4 is the flag operand produced by a CMP
+      /// or TEST instruction.
+      BRCOND,
+
+      /// Return with a flag operand. Operand 1 is the chain operand, operand
+      /// 2 is the number of bytes of stack to pop.
+      RET_FLAG,
+
+      /// REP_STOS - Repeat fill, corresponds to X86::REP_STOSx.
+      REP_STOS,
+
+      /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx.
+      REP_MOVS,
+
+      /// LOAD_PACK - Load a 128-bit packed float / double value. It has the same
+      /// operands as a normal load.
+      LOAD_PACK,
+
+      /// LOAD_UA - Load an unaligned 128-bit value. It has the same operands as
+      /// a normal load.
+      LOAD_UA,
+
+      /// GlobalBaseReg - On Darwin, this node represents the result of the popl
+      /// at function entry, used for PIC code.
+      GlobalBaseReg,
+
+      /// Wrapper - A wrapper node for TargetConstantPool,
+      /// TargetExternalSymbol, and TargetGlobalAddress.
+      Wrapper,
+
+      /// WrapperRIP - Special wrapper used under X86-64 PIC mode for RIP
+      /// relative displacements.
+      WrapperRIP,
+
+      /// S2VEC - X86 version of SCALAR_TO_VECTOR. The destination base does not
+      /// have to match the operand type.
+      S2VEC,
+
+      /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to
+      /// i32, corresponds to X86::PEXTRW.
+      PEXTRW,
+
+      /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector,
+      /// corresponds to X86::PINSRW.
+      PINSRW,
+
+      /// FMAX, FMIN - Floating point max and min.
+      ///
+      FMAX, FMIN,
+
+      /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal
+      /// approximation.  Note that these typically require refinement
+      /// in order to obtain suitable precision.
+      FRSQRT, FRCP,
+
+      // Thread Local Storage
+      TLSADDR, THREAD_POINTER,
+
+      // Exception Handling helpers
+      EH_RETURN
+    };
+  }
+
+ /// Define some predicates that are used for node matching.
+ namespace X86 {
+   /// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to PSHUFD.
+   bool isPSHUFDMask(SDNode *N);
+
+   /// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to PSHUFHW.
+   bool isPSHUFHWMask(SDNode *N);
+
+   /// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to PSHUFLW.
+   bool isPSHUFLWMask(SDNode *N);
+
+   /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to SHUFP*.
+   bool isSHUFPMask(SDNode *N);
+
+   /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+   bool isMOVHLPSMask(SDNode *N);
+
+   /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+   /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
+   /// <2, 3, 2, 3>
+   bool isMOVHLPS_v_undef_Mask(SDNode *N);
+
+   /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
+   bool isMOVLPMask(SDNode *N);
+
+   /// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to MOVHP{S|D}
+   /// as well as MOVLHPS.
+   bool isMOVHPMask(SDNode *N);
+
+   /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to UNPCKL.
+   bool isUNPCKLMask(SDNode *N, bool V2IsSplat = false);
+
+   /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to UNPCKH.
+   bool isUNPCKHMask(SDNode *N, bool V2IsSplat = false);
+
+   /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
+   /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
+   /// <0, 0, 1, 1>
+   bool isUNPCKL_v_undef_Mask(SDNode *N);
+
+   /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+   /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+   /// <2, 2, 3, 3>
+   bool isUNPCKH_v_undef_Mask(SDNode *N);
+
+   /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to MOVSS,
+   /// MOVSD, and MOVD, i.e. setting the lowest element.
+   bool isMOVLMask(SDNode *N);
+
+   /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+   bool isMOVSHDUPMask(SDNode *N);
+
+   /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+   bool isMOVSLDUPMask(SDNode *N);
+
+   /// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a splat of a single element.
+   bool isSplatMask(SDNode *N);
+
+   /// isSplatLoMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a splat of zero element.
+   bool isSplatLoMask(SDNode *N);
+
+   /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
+   /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
+   /// instructions.
+   unsigned getShuffleSHUFImmediate(SDNode *N);
+
+   /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
+   /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
+   /// instructions.
+   unsigned getShufflePSHUFHWImmediate(SDNode *N);
+
+   /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+   /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
+   /// instructions.
+   unsigned getShufflePSHUFLWImmediate(SDNode *N);
+ }
+
+  //===--------------------------------------------------------------------===//
+  //  X86TargetLowering - X86 Implementation of the TargetLowering interface
+  class X86TargetLowering : public TargetLowering {
+    int VarArgsFrameIndex;            // FrameIndex for start of varargs area.
+    int RegSaveFrameIndex;            // X86-64 vararg func register save area.
+    unsigned VarArgsGPOffset;         // X86-64 vararg func int reg offset.
+    unsigned VarArgsFPOffset;         // X86-64 vararg func fp reg offset.
+    int ReturnAddrIndex;              // FrameIndex for return slot.
+    int BytesToPopOnReturn;           // Number of arg bytes ret should pop.
+    int BytesCallerReserves;          // Number of arg bytes caller makes.
+  public:
+    X86TargetLowering(TargetMachine &TM);
+
+    // Return the number of bytes that a function should pop when it returns (in
+    // addition to the space used by the return address).
+    //
+    unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+
+    // Return the number of bytes that the caller reserves for arguments passed
+    // to this function.
+    unsigned getBytesCallerReserves() const { return BytesCallerReserves; }
+ 
+    /// getStackPtrReg - Return the stack pointer register we are using: either
+    /// ESP or RSP.
+    unsigned getStackPtrReg() const { return X86StackPtr; }
+    
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    ///
+    virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
+
+    virtual SDOperand PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    virtual MachineBasicBlock *InsertAtEndOfBasicBlock(MachineInstr *MI,
+                                                       MachineBasicBlock *MBB);
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    /// DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    /// computeMaskedBitsForTargetNode - Determine which of the bits specified 
+    /// in Mask are known to be either zero or one and return them in the 
+    /// KnownZero/KnownOne bitsets.
+    virtual void computeMaskedBitsForTargetNode(const SDOperand Op,
+                                                uint64_t Mask,
+                                                uint64_t &KnownZero, 
+                                                uint64_t &KnownOne,
+                                                const SelectionDAG &DAG,
+                                                unsigned Depth = 0) const;
+    
+    SDOperand getReturnAddressFrameIndex(SelectionDAG &DAG);
+
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+     
+    std::vector<unsigned> 
+      getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                        MVT::ValueType VT) const;
+    /// isOperandValidForConstraint - Return the specified operand (possibly
+    /// modified) if the specified SDOperand is valid for the specified target
+    /// constraint letter, otherwise return null.
+    SDOperand isOperandValidForConstraint(SDOperand Op, char ConstraintLetter,
+                                          SelectionDAG &DAG);
+    
+    /// getRegForInlineAsmConstraint - Given a physical register constraint
+    /// (e.g. {edx}), return the register number and the register class for the
+    /// register.  This should only be used for C_Register constraints.  On
+    /// error, this returns a register number of 0.
+    std::pair<unsigned, const TargetRegisterClass*> 
+      getRegForInlineAsmConstraint(const std::string &Constraint,
+                                   MVT::ValueType VT) const;
+    
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+
+    /// isShuffleMaskLegal - Targets can use this to indicate that they only
+    /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+    /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
+    /// values are assumed to be legal.
+    virtual bool isShuffleMaskLegal(SDOperand Mask, MVT::ValueType VT) const;
+
+    /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. Targets can use
+    /// this to indicate that there is a suitable VECTOR_SHUFFLE that can be
+    /// used to replace a VAND with a constant pool entry.
+    virtual bool isVectorClearMaskLegal(std::vector<SDOperand> &BVOps,
+                                        MVT::ValueType EVT,
+                                        SelectionDAG &DAG) const;
+  private:
+    /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const X86Subtarget *Subtarget;
+    const MRegisterInfo *RegInfo;
+
+    /// X86StackPtr - X86 physical register used as stack ptr.
+    unsigned X86StackPtr;
+
+    /// X86ScalarSSE - Select between SSE2 and x87 floating point ops.
+    bool X86ScalarSSE;
+
+    SDNode *LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode*TheCall,
+                            unsigned CallingConv, SelectionDAG &DAG);
+        
+    // C and StdCall Calling Convention implementation.
+    SDOperand LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
+                                bool isStdCall = false);
+    SDOperand LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG, unsigned CC);
+
+    // X86-64 C Calling Convention implementation.
+    SDOperand LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,unsigned CC);
+
+    // Fast and FastCall Calling Convention implementation.
+    SDOperand LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG, unsigned CC);
+
+    SDOperand LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerConstantPool(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerExternalSymbol(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerShift(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerSINT_TO_FP(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFP_TO_SINT(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFABS(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFNEG(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerSETCC(SDOperand Op, SelectionDAG &DAG, SDOperand Chain);
+    SDOperand LowerSELECT(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerBRCOND(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerMEMSET(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerMEMCPY(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerJumpTable(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerCALL(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerRET(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerDYNAMIC_STACKALLOC(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerREADCYCLCECOUNTER(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerVASTART(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerVACOPY(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerINTRINSIC_WO_CHAIN(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerRETURNADDR(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFRAMEADDR(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFRAME_TO_ARGS_OFFSET(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerEH_RETURN(SDOperand Op, SelectionDAG &DAG);
+  };
+}
+
+#endif    // X86ISELLOWERING_H
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
new file mode 100644
index 0000000..c0fa58d
--- /dev/null
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -0,0 +1,125 @@
+//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle X86'isms in a clean way.
+//
+// The add* functions below may be used with BuildMI to add entire memory
+// references in a single, typed, function call.  X86 memory references can be
+// very complex expressions (described in the README), so wrapping them up
+// behind an easier-to-use interface makes sense.  Descriptions of the
+// functions are included below.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Base, Scale, Index, Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRBUILDER_H
+#define X86INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// X86AddressMode - This struct holds a generalized full x86 address mode.
+/// The base register can be a frame index, which will eventually be replaced
+/// with BP or SP, with Disp adjusted accordingly.  The displacement may also
+/// include the offset of a global value.
+struct X86AddressMode {
+  enum {
+    RegBase,
+    FrameIndexBase
+  } BaseType;
+
+  union {
+    unsigned Reg;
+    int FrameIndex;
+  } Base;
+
+  unsigned Scale;
+  unsigned IndexReg;
+  unsigned Disp;
+  GlobalValue *GV;
+
+  X86AddressMode() : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0) {
+    Base.Reg = 0;
+  }
+};
+
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with no scale, index or displacement. An example is: DWORD PTR [EAX].
+///
+inline const MachineInstrBuilder &addDirectMem(const MachineInstrBuilder &MIB,
+                                               unsigned Reg) {
+  // Because memory references are always represented with four
+  // values, this adds: Reg, [1, NoReg, 0] to the instruction.
+  return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0);
+}
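+// A minimal usage sketch (the opcode, registers, and the TII reference are
+// illustrative assumptions, not part of this change): a load through the
+// address held in EBX, i.e. DWORD PTR [EBX], could be built as
+//   addDirectMem(BuildMI(TII.get(X86::MOV32rm), X86::EAX), X86::EBX);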
+
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no scale or index, but with a
+/// displacement. An example is: DWORD PTR [EAX + 4].
+///
+inline const MachineInstrBuilder &addRegOffset(const MachineInstrBuilder &MIB,
+                                               unsigned Reg, int Offset) {
+  return MIB.addReg(Reg).addImm(1).addReg(0).addImm(Offset);
+}
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg].
+inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
+                                            unsigned Reg1, unsigned Reg2) {
+  return MIB.addReg(Reg1).addImm(1).addReg(Reg2).addImm(0);
+}
+
+inline const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
+                                                 const X86AddressMode &AM) {
+  assert (AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
+
+  if (AM.BaseType == X86AddressMode::RegBase)
+    MIB.addReg(AM.Base.Reg);
+  else if (AM.BaseType == X86AddressMode::FrameIndexBase)
+    MIB.addFrameIndex(AM.Base.FrameIndex);
+  else
+    assert (0);
+  MIB.addImm(AM.Scale).addReg(AM.IndexReg);
+  if (AM.GV)
+    return MIB.addGlobalAddress(AM.GV, AM.Disp);
+  else
+    return MIB.addImm(AM.Disp);
+}
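+// A hedged usage sketch (registers, opcode, and the TII reference are
+// illustrative assumptions): describing DWORD PTR [EBX + 4*ECX + 12] and
+// attaching it to a load would look roughly like
+//   X86AddressMode AM;            // BaseType defaults to RegBase
+//   AM.Base.Reg = X86::EBX;
+//   AM.Scale    = 4;
+//   AM.IndexReg = X86::ECX;
+//   AM.Disp     = 12;
+//   addFullAddress(BuildMI(TII.get(X86::MOV32rm), X86::EAX), AM);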
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function.  This
+/// reference uses the FrameIndex as its base register until it is resolved.
+/// This allows a constant offset to be specified as well...
+///
+inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+  return MIB.addFrameIndex(FI).addImm(1).addReg(0).addImm(Offset);
+}
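+// For example (an illustrative sketch; DestReg, FI, and TII are assumed to be
+// in scope), reloading from stack slot FI adds the operands <FI, 1, NoReg, 0>:
+//   addFrameReference(BuildMI(TII.get(X86::MOV32rm), DestReg), FI);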
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool.  The
+/// reference uses the ConstantPoolIndex as its base register, which is
+/// retained until either machine code emission or assembly output.  This
+/// allows an optional offset to be added as well.
+///
+inline const MachineInstrBuilder &
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+                         int Offset = 0) {
+  return MIB.addConstantPoolIndex(CPI).addImm(1).addReg(0).addImm(Offset);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
new file mode 100644
index 0000000..11aeb07
--- /dev/null
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -0,0 +1,456 @@
+//==- X86InstrFPStack.td - Describe the X86 Instruction Set -------*- C++ -*-=//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 x87 FPU instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FPStack specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FpGet     : SDTypeProfile<1, 0, [SDTCisFP<0>]>;
+def SDTX86FpSet     : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+def SDTX86Fld       : SDTypeProfile<1, 2, [SDTCisFP<0>,
+                                           SDTCisPtrTy<1>, 
+                                           SDTCisVT<2, OtherVT>]>;
+def SDTX86Fst       : SDTypeProfile<0, 3, [SDTCisFP<0>,
+                                           SDTCisPtrTy<1>, 
+                                           SDTCisVT<2, OtherVT>]>;
+def SDTX86Fild      : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
+                                           SDTCisVT<2, OtherVT>]>;
+def SDTX86FpToIMem  : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+
+def X86fpget        : SDNode<"X86ISD::FP_GET_RESULT", SDTX86FpGet,
+                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+def X86fpset        : SDNode<"X86ISD::FP_SET_RESULT", SDTX86FpSet,
+                        [SDNPHasChain, SDNPOutFlag]>;
+def X86fld          : SDNode<"X86ISD::FLD",      SDTX86Fld,
+                        [SDNPHasChain]>;
+def X86fst          : SDNode<"X86ISD::FST",      SDTX86Fst,
+                        [SDNPHasChain, SDNPInFlag]>;
+def X86fild         : SDNode<"X86ISD::FILD",     SDTX86Fild,
+                        [SDNPHasChain]>;
+def X86fildflag     : SDNode<"X86ISD::FILD_FLAG",SDTX86Fild,
+                        [SDNPHasChain, SDNPOutFlag]>;
+def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
+                        [SDNPHasChain]>;
+def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
+                        [SDNPHasChain]>;
+def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem,
+                        [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// FPStack pattern fragments
+//===----------------------------------------------------------------------===//
+
+def fpimm0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimmneg0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-0.0);
+}]>;
+
+def fpimm1 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+1.0);
+}]>;
+
+def fpimmneg1 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-1.0);
+}]>;
+
+// Some 'special' instructions
+let usesCustomDAGSchedInserter = 1 in {  // Expanded by the scheduler.
+  def FP32_TO_INT16_IN_MEM : I<0, Pseudo,
+                              (ops i16mem:$dst, RFP32:$src),
+                              "#FP32_TO_INT16_IN_MEM PSEUDO!",
+                              [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
+  def FP32_TO_INT32_IN_MEM : I<0, Pseudo,
+                              (ops i32mem:$dst, RFP32:$src),
+                              "#FP32_TO_INT32_IN_MEM PSEUDO!",
+                              [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>;
+  def FP32_TO_INT64_IN_MEM : I<0, Pseudo,
+                              (ops i64mem:$dst, RFP32:$src),
+                              "#FP32_TO_INT64_IN_MEM PSEUDO!",
+                              [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>;
+  def FP64_TO_INT16_IN_MEM : I<0, Pseudo,
+                              (ops i16mem:$dst, RFP64:$src),
+                              "#FP64_TO_INT16_IN_MEM PSEUDO!",
+                              [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>;
+  def FP64_TO_INT32_IN_MEM : I<0, Pseudo,
+                              (ops i32mem:$dst, RFP64:$src),
+                              "#FP64_TO_INT32_IN_MEM PSEUDO!",
+                              [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>;
+  def FP64_TO_INT64_IN_MEM : I<0, Pseudo,
+                              (ops i64mem:$dst, RFP64:$src),
+                              "#FP64_TO_INT64_IN_MEM PSEUDO!",
+                              [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>;
+}
+
+let isTerminator = 1 in
+  let Defs = [FP0, FP1, FP2, FP3, FP4, FP5, FP6] in
+    def FP_REG_KILL  : I<0, Pseudo, (ops), "#FP_REG_KILL", []>;
+
+// All FP Stack operations are represented with three instructions here.  The
+// first two instructions, generated by the instruction selector, use "RFP32"
+// or "RFP64" registers: traditional register files to reference 32-bit or
+// 64-bit floating point values.  These sizes apply to the values, not the
+// registers, which are always 64 bits; RFP32 and RFP64 can be copied to
+// each other without losing information.  These instructions are all pseudo
+// instructions and use the "_Fp" suffix.
+// In some cases there are additional variants with a mixture of 32-bit and
+// 64-bit registers.
+// The third instruction is defined with FPI, which is the actual instruction
+// emitted by the assembler.  These use "RST" registers, although frequently
+// the actual register(s) used are implicit.  These are always 64 bits.
+// The FP stackifier pass converts one form to the other after register
+// allocation occurs.
+//
+// Note that the FpI instruction should have instruction selection info (e.g.
+// a pattern) and the FPI instruction should have emission info (e.g. opcode
+// encoding and asm printing info).
+
+// FPI - Floating Point Instruction template.
+class FPI<bits<8> o, Format F, dag ops, string asm> : I<o, F, ops, asm, []> {}
+
+// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
+class FpI_<dag ops, FPFormat fp, list<dag> pattern>
+  : X86Inst<0, Pseudo, NoImm, ops, ""> {
+  let FPForm = fp; let FPFormBits = FPForm.Value;
+  let Pattern = pattern;
+}
+
+// Random Pseudo Instructions.
+def FpGETRESULT32 : FpI_<(ops RFP32:$dst), SpecialFP,
+                      [(set RFP32:$dst, X86fpget)]>;           // FPR = ST(0)
+
+def FpGETRESULT64 : FpI_<(ops RFP64:$dst), SpecialFP,
+                      [(set RFP64:$dst, X86fpget)]>;           // FPR = ST(0)
+
+let noResults = 1 in {
+  def FpSETRESULT32 : FpI_<(ops RFP32:$src), SpecialFP,
+                        [(X86fpset RFP32:$src)]>, Imp<[], [ST0]>;// ST(0) = FPR
+
+  def FpSETRESULT64 : FpI_<(ops RFP64:$src), SpecialFP,
+                        [(X86fpset RFP64:$src)]>, Imp<[], [ST0]>;// ST(0) = FPR
+}
+// FpI - Floating Point Pseudo Instruction template. Predicated on FPStack.
+class FpI<dag ops, FPFormat fp, list<dag> pattern> :
+  FpI_<ops, fp, pattern>, Requires<[FPStack]>;
+
+// Register copies.  Just copies, the 64->32 version does not truncate.
+def MOV_Fp3232       : FpI<(ops RFP32:$dst, RFP32:$src), SpecialFP, []>; 
+def MOV_Fp3264       : FpI<(ops RFP64:$dst, RFP32:$src), SpecialFP, []>; 
+def MOV_Fp6432       : FpI<(ops RFP32:$dst, RFP64:$src), SpecialFP, []>; 
+def MOV_Fp6464       : FpI<(ops RFP64:$dst, RFP64:$src), SpecialFP, []>; 
+
+// Factoring for arithmetic.
+multiclass FPBinary_rr<SDNode OpNode> {
+// Register op register -> register
+// These are separated out because they have no reversed form.
+def _Fp32 : FpI<(ops RFP32:$dst, RFP32:$src1, RFP32:$src2), TwoArgFP,
+                [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
+def _Fp64 : FpI<(ops RFP64:$dst, RFP64:$src1, RFP64:$src2), TwoArgFP,
+                [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
+}
+// The FopST0 series are not included here because of the irregularities
+// in where the 'r' goes in assembly output.
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> {
+// ST(0) = ST(0) + [mem]
+def _Fp32m  : FpI<(ops RFP32:$dst, RFP32:$src1, f32mem:$src2), OneArgFPRW,
+                  [(set RFP32:$dst, 
+                    (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>;
+def _Fp64m  : FpI<(ops RFP64:$dst, RFP64:$src1, f64mem:$src2), OneArgFPRW,
+                  [(set RFP64:$dst, 
+                    (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>;
+def _Fp64m32: FpI<(ops RFP64:$dst, RFP64:$src1, f32mem:$src2), OneArgFPRW,
+                  [(set RFP64:$dst, 
+                    (OpNode RFP64:$src1, (extloadf32 addr:$src2)))]>;
+def _F32m  : FPI<0xD8, fp, (ops f32mem:$src), 
+                 !strconcat("f", !strconcat(asmstring, "{s} $src"))>;
+def _F64m  : FPI<0xDC, fp, (ops f64mem:$src), 
+                 !strconcat("f", !strconcat(asmstring, "{l} $src"))>;
+// ST(0) = ST(0) + [memint]
+def _FpI16m32 : FpI<(ops RFP32:$dst, RFP32:$src1, i16mem:$src2), OneArgFPRW,
+                    [(set RFP32:$dst, (OpNode RFP32:$src1,
+                                       (X86fild addr:$src2, i16)))]>;
+def _FpI32m32 : FpI<(ops RFP32:$dst, RFP32:$src1, i32mem:$src2), OneArgFPRW,
+                    [(set RFP32:$dst, (OpNode RFP32:$src1,
+                                       (X86fild addr:$src2, i32)))]>;
+def _FpI16m64 : FpI<(ops RFP64:$dst, RFP64:$src1, i16mem:$src2), OneArgFPRW,
+                    [(set RFP64:$dst, (OpNode RFP64:$src1,
+                                       (X86fild addr:$src2, i16)))]>;
+def _FpI32m64 : FpI<(ops RFP64:$dst, RFP64:$src1, i32mem:$src2), OneArgFPRW,
+                    [(set RFP64:$dst, (OpNode RFP64:$src1,
+                                       (X86fild addr:$src2, i32)))]>;
+def _FI16m  : FPI<0xDE, fp, (ops i16mem:$src), 
+                  !strconcat("fi", !strconcat(asmstring, "{s} $src"))>;
+def _FI32m  : FPI<0xDA, fp, (ops i32mem:$src), 
+                  !strconcat("fi", !strconcat(asmstring, "{l} $src"))>;
+}
+
+defm ADD : FPBinary_rr<fadd>;
+defm SUB : FPBinary_rr<fsub>;
+defm MUL : FPBinary_rr<fmul>;
+defm DIV : FPBinary_rr<fdiv>;
+defm ADD : FPBinary<fadd, MRM0m, "add">;
+defm SUB : FPBinary<fsub, MRM4m, "sub">;
+defm SUBR: FPBinary<fsub, MRM5m, "subr">;
+defm MUL : FPBinary<fmul, MRM1m, "mul">;
+defm DIV : FPBinary<fdiv, MRM6m, "div">;
+defm DIVR: FPBinary<fdiv, MRM7m, "divr">;
+
+class FPST0rInst<bits<8> o, string asm>
+  : FPI<o, AddRegFrm, (ops RST:$op), asm>, D8;
+class FPrST0Inst<bits<8> o, string asm>
+  : FPI<o, AddRegFrm, (ops RST:$op), asm>, DC;
+class FPrST0PInst<bits<8> o, string asm>
+  : FPI<o, AddRegFrm, (ops RST:$op), asm>, DE;
+
+// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
+// of some of the 'reverse' forms of the fsub and fdiv instructions.  As such,
+// we have to put some 'r's in and take them out of weird places.
+def ADD_FST0r   : FPST0rInst <0xC0, "fadd $op">;
+def ADD_FrST0   : FPrST0Inst <0xC0, "fadd {%st(0), $op|$op, %ST(0)}">;
+def ADD_FPrST0  : FPrST0PInst<0xC0, "faddp $op">;
+def SUBR_FST0r  : FPST0rInst <0xE8, "fsubr $op">;
+def SUB_FrST0   : FPrST0Inst <0xE8, "fsub{r} {%st(0), $op|$op, %ST(0)}">;
+def SUB_FPrST0  : FPrST0PInst<0xE8, "fsub{r}p $op">;
+def SUB_FST0r   : FPST0rInst <0xE0, "fsub $op">;
+def SUBR_FrST0  : FPrST0Inst <0xE0, "fsub{|r} {%st(0), $op|$op, %ST(0)}">;
+def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p $op">;
+def MUL_FST0r   : FPST0rInst <0xC8, "fmul $op">;
+def MUL_FrST0   : FPrST0Inst <0xC8, "fmul {%st(0), $op|$op, %ST(0)}">;
+def MUL_FPrST0  : FPrST0PInst<0xC8, "fmulp $op">;
+def DIVR_FST0r  : FPST0rInst <0xF8, "fdivr $op">;
+def DIV_FrST0   : FPrST0Inst <0xF8, "fdiv{r} {%st(0), $op|$op, %ST(0)}">;
+def DIV_FPrST0  : FPrST0PInst<0xF8, "fdiv{r}p $op">;
+def DIV_FST0r   : FPST0rInst <0xF0, "fdiv $op">;
+def DIVR_FrST0  : FPrST0Inst <0xF0, "fdiv{|r} {%st(0), $op|$op, %ST(0)}">;
+def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p $op">;
+
+// Unary operations.
+multiclass FPUnary<SDNode OpNode, bits<8> opcode, string asmstring> {
+def _Fp32  : FpI<(ops RFP32:$dst, RFP32:$src), OneArgFPRW,
+                 [(set RFP32:$dst, (OpNode RFP32:$src))]>;
+def _Fp64  : FpI<(ops RFP64:$dst, RFP64:$src), OneArgFPRW,
+                 [(set RFP64:$dst, (OpNode RFP64:$src))]>;
+def _F     : FPI<opcode, RawFrm, (ops), asmstring>, D9;
+}
+
+defm CHS : FPUnary<fneg, 0xE0, "fchs">;
+defm ABS : FPUnary<fabs, 0xE1, "fabs">;
+defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">;
+defm SIN : FPUnary<fsin, 0xFE, "fsin">;
+defm COS : FPUnary<fcos, 0xFF, "fcos">;
+
+def TST_Fp32  : FpI<(ops RFP32:$src), OneArgFP,
+                 []>;
+def TST_Fp64  : FpI<(ops RFP64:$src), OneArgFP,
+                 []>;
+def TST_F  : FPI<0xE4, RawFrm, (ops), "ftst">, D9;
+
+// Floating point cmovs.
+multiclass FPCMov<PatLeaf cc> {
+  def _Fp32  : FpI<(ops RFP32:$dst, RFP32:$src1, RFP32:$src2), CondMovFP,
+                     [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
+                                        cc))]>;
+  def _Fp64  : FpI<(ops RFP64:$dst, RFP64:$src1, RFP64:$src2), CondMovFP,
+                     [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
+                                        cc))]>;
+}
+let isTwoAddress = 1 in {
+defm CMOVB  : FPCMov<X86_COND_B>;
+defm CMOVBE : FPCMov<X86_COND_BE>;
+defm CMOVE  : FPCMov<X86_COND_E>;
+defm CMOVP  : FPCMov<X86_COND_P>;
+defm CMOVNB : FPCMov<X86_COND_AE>;
+defm CMOVNBE: FPCMov<X86_COND_A>;
+defm CMOVNE : FPCMov<X86_COND_NE>;
+defm CMOVNP : FPCMov<X86_COND_NP>;
+}
+
+// These are not factored because there's no clean way to pass DA/DB.
+def CMOVB_F  : FPI<0xC0, AddRegFrm, (ops RST:$op),
+                  "fcmovb {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVBE_F : FPI<0xD0, AddRegFrm, (ops RST:$op),
+                  "fcmovbe {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVE_F  : FPI<0xC8, AddRegFrm, (ops RST:$op),
+                  "fcmove {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVP_F  : FPI<0xD8, AddRegFrm, (ops RST:$op),
+                  "fcmovu  {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVNB_F : FPI<0xC0, AddRegFrm, (ops RST:$op),
+                  "fcmovnb {$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNBE_F: FPI<0xD0, AddRegFrm, (ops RST:$op),
+                  "fcmovnbe {$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNE_F : FPI<0xC8, AddRegFrm, (ops RST:$op),
+                  "fcmovne {$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNP_F : FPI<0xD8, AddRegFrm, (ops RST:$op),
+                  "fcmovnu {$op, %st(0)|%ST(0), $op}">, DB;
+
+// Floating point loads & stores.
+def LD_Fp32m   : FpI<(ops RFP32:$dst, f32mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (loadf32 addr:$src))]>;
+def LD_Fp64m   : FpI<(ops RFP64:$dst, f64mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (loadf64 addr:$src))]>;
+def ILD_Fp16m32: FpI<(ops RFP32:$dst, i16mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m32: FpI<(ops RFP32:$dst, i32mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m32: FpI<(ops RFP32:$dst, i64mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m64: FpI<(ops RFP64:$dst, i16mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m64: FpI<(ops RFP64:$dst, i32mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m64: FpI<(ops RFP64:$dst, i64mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
+
+def ST_Fp32m   : FpI<(ops f32mem:$op, RFP32:$src), OneArgFP,
+                  [(store RFP32:$src, addr:$op)]>;
+def ST_Fp64m32 : FpI<(ops f32mem:$op, RFP64:$src), OneArgFP,
+                  [(truncstoref32 RFP64:$src, addr:$op)]>;
+def ST_Fp64m   : FpI<(ops f64mem:$op, RFP64:$src), OneArgFP,
+                  [(store RFP64:$src, addr:$op)]>;
+
+def ST_FpP32m    : FpI<(ops f32mem:$op, RFP32:$src), OneArgFP, []>;
+def ST_FpP64m32  : FpI<(ops f32mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP64m    : FpI<(ops f64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp16m32  : FpI<(ops i16mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32  : FpI<(ops i32mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp64m32  : FpI<(ops i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp16m64  : FpI<(ops i16mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64  : FpI<(ops i32mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp64m64  : FpI<(ops i64mem:$op, RFP64:$src), OneArgFP, []>;
+
+def LD_F32m   : FPI<0xD9, MRM0m, (ops f32mem:$src), "fld{s} $src">;
+def LD_F64m   : FPI<0xDD, MRM0m, (ops f64mem:$src), "fld{l} $src">;
+def ILD_F16m  : FPI<0xDF, MRM0m, (ops i16mem:$src), "fild{s} $src">;
+def ILD_F32m  : FPI<0xDB, MRM0m, (ops i32mem:$src), "fild{l} $src">;
+def ILD_F64m  : FPI<0xDF, MRM5m, (ops i64mem:$src), "fild{ll} $src">;
+def ST_F32m   : FPI<0xD9, MRM2m, (ops f32mem:$dst), "fst{s} $dst">;
+def ST_F64m   : FPI<0xDD, MRM2m, (ops f64mem:$dst), "fst{l} $dst">;
+def ST_FP32m  : FPI<0xD9, MRM3m, (ops f32mem:$dst), "fstp{s} $dst">;
+def ST_FP64m  : FPI<0xDD, MRM3m, (ops f64mem:$dst), "fstp{l} $dst">;
+def IST_F16m  : FPI<0xDF, MRM2m, (ops i16mem:$dst), "fist{s} $dst">;
+def IST_F32m  : FPI<0xDB, MRM2m, (ops i32mem:$dst), "fist{l} $dst">;
+def IST_FP16m : FPI<0xDF, MRM3m, (ops i16mem:$dst), "fistp{s} $dst">;
+def IST_FP32m : FPI<0xDB, MRM3m, (ops i32mem:$dst), "fistp{l} $dst">;
+def IST_FP64m : FPI<0xDF, MRM7m, (ops i64mem:$dst), "fistp{ll} $dst">;
+
+// FISTTP requires SSE3 even though it's an FPStack op.
+def ISTT_Fp16m32 : FpI_<(ops i16mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i16mem RFP32:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp32m32 : FpI_<(ops i32mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i32mem RFP32:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp64m32 : FpI_<(ops i64mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i64mem RFP32:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp16m64 : FpI_<(ops i16mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i16mem RFP64:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp32m64 : FpI_<(ops i32mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i32mem RFP64:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp64m64 : FpI_<(ops i64mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i64mem RFP64:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+
+def ISTT_FP16m : FPI<0xDF, MRM1m, (ops i16mem:$dst), "fisttp{s} $dst">;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (ops i32mem:$dst), "fisttp{l} $dst">;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (ops i64mem:$dst), "fisttp{ll} $dst">;
+
+// FP Stack manipulation instructions.
+def LD_Frr   : FPI<0xC0, AddRegFrm, (ops RST:$op), "fld $op">, D9;
+def ST_Frr   : FPI<0xD0, AddRegFrm, (ops RST:$op), "fst $op">, DD;
+def ST_FPrr  : FPI<0xD8, AddRegFrm, (ops RST:$op), "fstp $op">, DD;
+def XCH_F    : FPI<0xC8, AddRegFrm, (ops RST:$op), "fxch $op">, D9;
+
+// Floating point constant loads.
+let isReMaterializable = 1 in {
+def LD_Fp032 : FpI<(ops RFP32:$dst), ZeroArgFP,
+                [(set RFP32:$dst, fpimm0)]>;
+def LD_Fp132 : FpI<(ops RFP32:$dst), ZeroArgFP,
+                [(set RFP32:$dst, fpimm1)]>;
+def LD_Fp064 : FpI<(ops RFP64:$dst), ZeroArgFP,
+                [(set RFP64:$dst, fpimm0)]>;
+def LD_Fp164 : FpI<(ops RFP64:$dst), ZeroArgFP,
+                [(set RFP64:$dst, fpimm1)]>;
+}
+
+def LD_F0 : FPI<0xEE, RawFrm, (ops), "fldz">, D9;
+def LD_F1 : FPI<0xE8, RawFrm, (ops), "fld1">, D9;
+
+
+// Floating point compares.
+def UCOM_Fpr32 : FpI<(ops RFP32:$lhs, RFP32:$rhs), CompareFP,
+                  []>;  // FPSW = cmp ST(0) with ST(i)
+def UCOM_FpIr32: FpI<(ops RFP32:$lhs, RFP32:$rhs), CompareFP,
+                  [(X86cmp RFP32:$lhs, RFP32:$rhs)]>; // CC = ST(0) cmp ST(i)
+def UCOM_Fpr64 : FpI<(ops RFP64:$lhs, RFP64:$rhs), CompareFP,
+                  []>;  // FPSW = cmp ST(0) with ST(i)
+def UCOM_FpIr64: FpI<(ops RFP64:$lhs, RFP64:$rhs), CompareFP,
+                  [(X86cmp RFP64:$lhs, RFP64:$rhs)]>; // CC = ST(0) cmp ST(i)
+
+def UCOM_Fr    : FPI<0xE0, AddRegFrm,    // FPSW = cmp ST(0) with ST(i)
+                    (ops RST:$reg),
+                    "fucom $reg">, DD, Imp<[ST0],[]>;
+def UCOM_FPr   : FPI<0xE8, AddRegFrm,    // FPSW = cmp ST(0) with ST(i), pop
+                    (ops RST:$reg),
+                    "fucomp $reg">, DD, Imp<[ST0],[]>;
+def UCOM_FPPr  : FPI<0xE9, RawFrm,       // cmp ST(0) with ST(1), pop, pop
+                    (ops),
+                    "fucompp">, DA, Imp<[ST0],[]>;
+
+def UCOM_FIr   : FPI<0xE8, AddRegFrm,     // CC = cmp ST(0) with ST(i)
+                    (ops RST:$reg),
+                    "fucomi {$reg, %st(0)|%ST(0), $reg}">, DB, Imp<[ST0],[]>;
+def UCOM_FIPr  : FPI<0xE8, AddRegFrm,     // CC = cmp ST(0) with ST(i), pop
+                    (ops RST:$reg),
+                    "fucomip {$reg, %st(0)|%ST(0), $reg}">, DF, Imp<[ST0],[]>;
+
+// Floating point flag ops.
+def FNSTSW8r  : I<0xE0, RawFrm,                  // AX = fp flags
+                  (ops), "fnstsw", []>, DF, Imp<[],[AX]>;
+
+def FNSTCW16m : I<0xD9, MRM7m,                   // [mem16] = X87 control word
+                  (ops i16mem:$dst), "fnstcw $dst", []>;
+def FLDCW16m  : I<0xD9, MRM5m,                   // X87 control word = [mem16]
+                  (ops i16mem:$dst), "fldcw $dst", []>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 values.
+def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
+
+// Required for CALLs which return f32 / f64 values.
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
+
+// Floating point constant -0.0 and -1.0
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStack]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStack]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStack]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStack]>;
+
+// Used to convert i64 to f64 since there isn't an SSE version.
+def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+
+def : Pat<(extloadf32 addr:$src), 
+           (MOV_Fp3264 (LD_Fp32m addr:$src))>, Requires<[FPStack]>;
+def : Pat<(fextend RFP32:$src), (MOV_Fp3264 RFP32:$src)>, Requires<[FPStack]>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
new file mode 100644
index 0000000..06b14fe
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -0,0 +1,567 @@
+//===- X86InstrInfo.cpp - X86 Instruction Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86.h"
+#include "X86GenInstrInfo.inc"
+#include "X86InstrBuilder.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/LiveVariables.h"
+using namespace llvm;
+
+X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
+  : TargetInstrInfo(X86Insts, sizeof(X86Insts)/sizeof(X86Insts[0])),
+    TM(tm), RI(tm, *this) {
+}
+
+bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
+                               unsigned& sourceReg,
+                               unsigned& destReg) const {
+  MachineOpCode oc = MI.getOpcode();
+  if (oc == X86::MOV8rr || oc == X86::MOV16rr ||
+      oc == X86::MOV32rr || oc == X86::MOV64rr ||
+      oc == X86::MOV16to16_ || oc == X86::MOV32to32_ ||
+      oc == X86::MOV_Fp3232  || oc == X86::MOVSSrr || oc == X86::MOVSDrr ||
+      oc == X86::MOV_Fp3264 || oc == X86::MOV_Fp6432 || oc == X86::MOV_Fp6464 ||
+      oc == X86::FsMOVAPSrr || oc == X86::FsMOVAPDrr ||
+      oc == X86::MOVAPSrr || oc == X86::MOVAPDrr ||
+      oc == X86::MOVSS2PSrr || oc == X86::MOVSD2PDrr ||
+      oc == X86::MOVPS2SSrr || oc == X86::MOVPD2SDrr ||
+      oc == X86::MMX_MOVD64rr || oc == X86::MMX_MOVQ64rr) {
+      assert(MI.getNumOperands() >= 2 &&
+             MI.getOperand(0).isRegister() &&
+             MI.getOperand(1).isRegister() &&
+             "invalid register-register move instruction");
+      sourceReg = MI.getOperand(1).getReg();
+      destReg = MI.getOperand(0).getReg();
+      return true;
+  }
+  return false;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(MachineInstr *MI, 
+                                           int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV16_rm:
+  case X86::MOV32rm:
+  case X86::MOV32_rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVAPDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+    if (MI->getOperand(1).isFrameIndex() && MI->getOperand(2).isImmediate() &&
+        MI->getOperand(3).isRegister() && MI->getOperand(4).isImmediate() &&
+        MI->getOperand(2).getImmedValue() == 1 &&
+        MI->getOperand(3).getReg() == 0 &&
+        MI->getOperand(4).getImmedValue() == 0) {
+      FrameIndex = MI->getOperand(1).getFrameIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
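+// Note (informal, not part of the original change): the operand pattern tested
+// above -- frame-index base, scale 1, no index register, displacement 0 -- is
+// exactly what addFrameReference() in X86InstrBuilder.h emits for a zero-offset
+// frame access, which is why matching it identifies a plain stack-slot reload.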
+
+unsigned X86InstrInfo::isStoreToStackSlot(MachineInstr *MI,
+                                          int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::MOV8mr:
+  case X86::MOV16mr:
+  case X86::MOV16_mr:
+  case X86::MOV32mr:
+  case X86::MOV32_mr:
+  case X86::MOV64mr:
+  case X86::ST_FpP64m:
+  case X86::MOVSSmr:
+  case X86::MOVSDmr:
+  case X86::MOVAPSmr:
+  case X86::MOVAPDmr:
+  case X86::MMX_MOVD64mr:
+  case X86::MMX_MOVQ64mr:
+  case X86::MMX_MOVNTQmr:
+    if (MI->getOperand(0).isFrameIndex() && MI->getOperand(1).isImmediate() &&
+        MI->getOperand(2).isRegister() && MI->getOperand(3).isImmediate() &&
+        MI->getOperand(1).getImmedValue() == 1 &&
+        MI->getOperand(2).getReg() == 0 &&
+        MI->getOperand(3).getImmedValue() == 0) {
+      FrameIndex = MI->getOperand(0).getFrameIndex();
+      return MI->getOperand(4).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+
+bool X86InstrInfo::isReallyTriviallyReMaterializable(MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV16_rm:
+  case X86::MOV32rm:
+  case X86::MOV32_rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVAPDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+    // Loads from constant pools are trivially rematerializable.
+    return MI->getOperand(1).isRegister() && MI->getOperand(2).isImmediate() &&
+           MI->getOperand(3).isRegister() && MI->getOperand(4).isConstantPoolIndex() &&
+           MI->getOperand(1).getReg() == 0 &&
+           MI->getOperand(2).getImmedValue() == 1 &&
+           MI->getOperand(3).getReg() == 0;
+  }
+  // All other instructions marked M_REMATERIALIZABLE are always trivially
+  // rematerializable.
+  return true;
+}
+
+/// convertToThreeAddress - This method must be implemented by targets that
+/// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
+/// may be able to convert a two-address instruction into a true
+/// three-address instruction on demand.  This allows the X86 target (for
+/// example) to convert ADD and SHL instructions into LEA instructions if they
+/// would require register copies due to two-addressness.
+///
+/// This method returns a null pointer if the transformation cannot be
+/// performed, otherwise it returns the new instruction.
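+///
+/// As an illustrative example (virtual register names are hypothetical): the
+/// two-address "shll $2, %reg1" handled by the SHL32ri case below becomes the
+/// three-address "%reg2 = LEA32r 0, 4, %reg1, 0", i.e. leal (,%reg1,4), %reg2,
+/// so the destination no longer has to be tied to the source.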
+///
+MachineInstr *
+X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+                                    MachineBasicBlock::iterator &MBBI,
+                                    LiveVariables &LV) const {
+  MachineInstr *MI = MBBI;
+  // All input instructions are two-address instructions.  Get the known operands.
+  unsigned Dest = MI->getOperand(0).getReg();
+  unsigned Src = MI->getOperand(1).getReg();
+
+  MachineInstr *NewMI = NULL;
+  // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's.  When
+  // we have better subtarget support, enable the 16-bit LEA generation here.
+  bool DisableLEA16 = true;
+
+  switch (MI->getOpcode()) {
+  default: return 0;
+  case X86::SHUFPSrri: {
+    assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
+    if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+    
+    unsigned A = MI->getOperand(0).getReg();
+    unsigned B = MI->getOperand(1).getReg();
+    unsigned C = MI->getOperand(2).getReg();
+    unsigned M = MI->getOperand(3).getImm();
+    if (B != C) return 0;
+    NewMI = BuildMI(get(X86::PSHUFDri), A).addReg(B).addImm(M);
+    break;
+  }
+  case X86::SHL64ri: {
+    assert(MI->getNumOperands() == 3 && "Unknown shift instruction!");
+    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+    // the flags produced by a shift yet, so this is safe.
+    unsigned Dest = MI->getOperand(0).getReg();
+    unsigned Src = MI->getOperand(1).getReg();
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    if (ShAmt == 0 || ShAmt >= 4) return 0;
+    
+    NewMI = BuildMI(get(X86::LEA64r), Dest)
+      .addReg(0).addImm(1 << ShAmt).addReg(Src).addImm(0);
+    break;
+  }
+  case X86::SHL32ri: {
+    assert(MI->getNumOperands() == 3 && "Unknown shift instruction!");
+    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+    // the flags produced by a shift yet, so this is safe.
+    unsigned Dest = MI->getOperand(0).getReg();
+    unsigned Src = MI->getOperand(1).getReg();
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    if (ShAmt == 0 || ShAmt >= 4) return 0;
+    
+    unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit() ?
+      X86::LEA64_32r : X86::LEA32r;
+    NewMI = BuildMI(get(Opc), Dest)
+      .addReg(0).addImm(1 << ShAmt).addReg(Src).addImm(0);
+    break;
+  }
+  case X86::SHL16ri: {
+    assert(MI->getNumOperands() == 3 && "Unknown shift instruction!");
+    if (DisableLEA16) return 0;
+    
+    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+    // the flags produced by a shift yet, so this is safe.
+    unsigned Dest = MI->getOperand(0).getReg();
+    unsigned Src = MI->getOperand(1).getReg();
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    if (ShAmt == 0 || ShAmt >= 4) return 0;
+    
+    NewMI = BuildMI(get(X86::LEA16r), Dest)
+      .addReg(0).addImm(1 << ShAmt).addReg(Src).addImm(0);
+    break;
+  }
+  }
+
+  // FIXME: None of these instructions are promotable to LEAs without
+  // additional information.  In particular, LEA doesn't set the flags that
+  // add and inc do.  :(
+  if (0)
+  switch (MI->getOpcode()) {
+  case X86::INC32r:
+  case X86::INC64_32r:
+    assert(MI->getNumOperands() == 2 && "Unknown inc instruction!");
+    NewMI = addRegOffset(BuildMI(get(X86::LEA32r), Dest), Src, 1);
+    break;
+  case X86::INC16r:
+  case X86::INC64_16r:
+    if (DisableLEA16) return 0;
+    assert(MI->getNumOperands() == 2 && "Unknown inc instruction!");
+    NewMI = addRegOffset(BuildMI(get(X86::LEA16r), Dest), Src, 1);
+    break;
+  case X86::DEC32r:
+  case X86::DEC64_32r:
+    assert(MI->getNumOperands() == 2 && "Unknown dec instruction!");
+    NewMI = addRegOffset(BuildMI(get(X86::LEA32r), Dest), Src, -1);
+    break;
+  case X86::DEC16r:
+  case X86::DEC64_16r:
+    if (DisableLEA16) return 0;
+    assert(MI->getNumOperands() == 2 && "Unknown dec instruction!");
+    NewMI = addRegOffset(BuildMI(get(X86::LEA16r), Dest), Src, -1);
+    break;
+  case X86::ADD32rr:
+    assert(MI->getNumOperands() == 3 && "Unknown add instruction!");
+    NewMI = addRegReg(BuildMI(get(X86::LEA32r), Dest), Src,
+                     MI->getOperand(2).getReg());
+    break;
+  case X86::ADD16rr:
+    if (DisableLEA16) return 0;
+    assert(MI->getNumOperands() == 3 && "Unknown add instruction!");
+    NewMI = addRegReg(BuildMI(get(X86::LEA16r), Dest), Src,
+                     MI->getOperand(2).getReg());
+    break;
+  case X86::ADD32ri:
+  case X86::ADD32ri8:
+    assert(MI->getNumOperands() == 3 && "Unknown add instruction!");
+    if (MI->getOperand(2).isImmediate())
+      NewMI = addRegOffset(BuildMI(get(X86::LEA32r), Dest), Src,
+                          MI->getOperand(2).getImmedValue());
+    break;
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+    if (DisableLEA16) return 0;
+    assert(MI->getNumOperands() == 3 && "Unknown add instruction!");
+    if (MI->getOperand(2).isImmediate())
+      NewMI = addRegOffset(BuildMI(get(X86::LEA16r), Dest), Src,
+                          MI->getOperand(2).getImmedValue());
+    break;
+  case X86::SHL16ri:
+    if (DisableLEA16) return 0;
+  case X86::SHL32ri:
+    assert(MI->getNumOperands() == 3 && MI->getOperand(2).isImmediate() &&
+           "Unknown shl instruction!");
+    unsigned ShAmt = MI->getOperand(2).getImmedValue();
+    if (ShAmt == 1 || ShAmt == 2 || ShAmt == 3) {
+      X86AddressMode AM;
+      AM.Scale = 1 << ShAmt;
+      AM.IndexReg = Src;
+      unsigned Opc = MI->getOpcode() == X86::SHL32ri ? X86::LEA32r :X86::LEA16r;
+      NewMI = addFullAddress(BuildMI(get(Opc), Dest), AM);
+    }
+    break;
+  }
+
+  if (NewMI) {
+    NewMI->copyKillDeadInfo(MI);
+    LV.instructionChanged(MI, NewMI);  // Update live variables
+    MFI->insert(MBBI, NewMI);          // Insert the new inst    
+  }
+  return NewMI;
+}
+
+/// commuteInstruction - We have a few instructions that must be hacked on to
+/// commute them.
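+///
+/// For example (restating the SHLD/SHRD identity used in the cases below):
+/// "A = SHLD32rri8 B, C, I" computes the same value as
+/// "A = SHRD32rri8 C, B, (32-I)", so the two source operands can be swapped by
+/// also replacing the shift amount.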
+///
+MachineInstr *X86InstrInfo::commuteInstruction(MachineInstr *MI) const {
+  // FIXME: Can commute cmoves by changing the condition!
+  switch (MI->getOpcode()) {
+  case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
+  case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
+  case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
+  case X86::SHLD32rri8:{// A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
+    unsigned Opc;
+    unsigned Size;
+    switch (MI->getOpcode()) {
+    default: assert(0 && "Unreachable!");
+    case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
+    case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
+    case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
+    case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
+    }
+    unsigned Amt = MI->getOperand(3).getImmedValue();
+    unsigned A = MI->getOperand(0).getReg();
+    unsigned B = MI->getOperand(1).getReg();
+    unsigned C = MI->getOperand(2).getReg();
+    bool BisKill = MI->getOperand(1).isKill();
+    bool CisKill = MI->getOperand(2).isKill();
+    return BuildMI(get(Opc), A).addReg(C, false, false, CisKill)
+      .addReg(B, false, false, BisKill).addImm(Size-Amt);
+  }
+  default:
+    return TargetInstrInfo::commuteInstruction(MI);
+  }
+}
+
+static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) {
+  switch (BrOpc) {
+  default: return X86::COND_INVALID;
+  case X86::JE:  return X86::COND_E;
+  case X86::JNE: return X86::COND_NE;
+  case X86::JL:  return X86::COND_L;
+  case X86::JLE: return X86::COND_LE;
+  case X86::JG:  return X86::COND_G;
+  case X86::JGE: return X86::COND_GE;
+  case X86::JB:  return X86::COND_B;
+  case X86::JBE: return X86::COND_BE;
+  case X86::JA:  return X86::COND_A;
+  case X86::JAE: return X86::COND_AE;
+  case X86::JS:  return X86::COND_S;
+  case X86::JNS: return X86::COND_NS;
+  case X86::JP:  return X86::COND_P;
+  case X86::JNP: return X86::COND_NP;
+  case X86::JO:  return X86::COND_O;
+  case X86::JNO: return X86::COND_NO;
+  }
+}
+
+unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
+  switch (CC) {
+  default: assert(0 && "Illegal condition code!");
+  case X86::COND_E:  return X86::JE;
+  case X86::COND_NE: return X86::JNE;
+  case X86::COND_L:  return X86::JL;
+  case X86::COND_LE: return X86::JLE;
+  case X86::COND_G:  return X86::JG;
+  case X86::COND_GE: return X86::JGE;
+  case X86::COND_B:  return X86::JB;
+  case X86::COND_BE: return X86::JBE;
+  case X86::COND_A:  return X86::JA;
+  case X86::COND_AE: return X86::JAE;
+  case X86::COND_S:  return X86::JS;
+  case X86::COND_NS: return X86::JNS;
+  case X86::COND_P:  return X86::JP;
+  case X86::COND_NP: return X86::JNP;
+  case X86::COND_O:  return X86::JO;
+  case X86::COND_NO: return X86::JNO;
+  }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified condition,
+/// e.g. turning COND_E to COND_NE.
+X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
+  switch (CC) {
+  default: assert(0 && "Illegal condition code!");
+  case X86::COND_E:  return X86::COND_NE;
+  case X86::COND_NE: return X86::COND_E;
+  case X86::COND_L:  return X86::COND_GE;
+  case X86::COND_LE: return X86::COND_G;
+  case X86::COND_G:  return X86::COND_LE;
+  case X86::COND_GE: return X86::COND_L;
+  case X86::COND_B:  return X86::COND_AE;
+  case X86::COND_BE: return X86::COND_A;
+  case X86::COND_A:  return X86::COND_BE;
+  case X86::COND_AE: return X86::COND_B;
+  case X86::COND_S:  return X86::COND_NS;
+  case X86::COND_NS: return X86::COND_S;
+  case X86::COND_P:  return X86::COND_NP;
+  case X86::COND_NP: return X86::COND_P;
+  case X86::COND_O:  return X86::COND_NO;
+  case X86::COND_NO: return X86::COND_O;
+  }
+}
+
+// For purposes of branch analysis do not count FP_REG_KILL as a terminator.
+bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  if (MI->getOpcode() == X86::FP_REG_KILL)
+    return false;
+
+  const TargetInstrDescriptor *TID = MI->getInstrDescriptor();
+  if (TID->Flags & M_TERMINATOR_FLAG) {
+    // Conditional branch is a special case.
+    if ((TID->Flags & M_BRANCH_FLAG) != 0 && (TID->Flags & M_BARRIER_FLAG) == 0)
+      return true;
+    if ((TID->Flags & M_PREDICABLE) == 0)
+      return true;
+    return !isPredicated(MI);
+  }
+  return false;
+}
+
+bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, 
+                                 MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 std::vector<MachineOperand> &Cond) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+  
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (!isBranch(LastInst->getOpcode()))
+      return true;
+    
+    // If the block ends with a branch there are 3 possibilities:
+    // it's an unconditional, conditional, or indirect branch.
+    
+    if (LastInst->getOpcode() == X86::JMP) {
+      TBB = LastInst->getOperand(0).getMachineBasicBlock();
+      return false;
+    }
+    X86::CondCode BranchCode = GetCondFromBranchOpc(LastInst->getOpcode());
+    if (BranchCode == X86::COND_INVALID)
+      return true;  // Can't handle indirect branch.
+
+    // Otherwise, block ends with fall-through condbranch.
+    TBB = LastInst->getOperand(0).getMachineBasicBlock();
+    Cond.push_back(MachineOperand::CreateImm(BranchCode));
+    return false;
+  }
+  
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+  
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with X86::JMP and a conditional branch, handle it.
+  X86::CondCode BranchCode = GetCondFromBranchOpc(SecondLastInst->getOpcode());
+  if (BranchCode != X86::COND_INVALID && LastInst->getOpcode() == X86::JMP) {
+    TBB = SecondLastInst->getOperand(0).getMachineBasicBlock();
+    Cond.push_back(MachineOperand::CreateImm(BranchCode));
+    FBB = LastInst->getOperand(0).getMachineBasicBlock();
+    return false;
+  }
+
+  // If the block ends with two X86::JMPs, handle it.  The second one is not
+  // executed, so remove it.
+  if (SecondLastInst->getOpcode() == X86::JMP && 
+      LastInst->getOpcode() == X86::JMP) {
+    TBB = SecondLastInst->getOperand(0).getMachineBasicBlock();
+    I = LastInst;
+    I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  if (I->getOpcode() != X86::JMP && 
+      GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+    return 0;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  
+  I = MBB.end();
+  
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+    return 1;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+unsigned
+X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                           MachineBasicBlock *FBB,
+                           const std::vector<MachineOperand> &Cond) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "X86 branch conditions have one component!");
+
+  if (FBB == 0) { // One way branch.
+    if (Cond.empty()) {
+      // Unconditional branch?
+      BuildMI(&MBB, get(X86::JMP)).addMBB(TBB);
+    } else {
+      // Conditional branch.
+      unsigned Opc = GetCondBranchFromCond((X86::CondCode)Cond[0].getImm());
+      BuildMI(&MBB, get(Opc)).addMBB(TBB);
+    }
+    return 1;
+  }
+  
+  // Two-way Conditional branch.
+  unsigned Opc = GetCondBranchFromCond((X86::CondCode)Cond[0].getImm());
+  BuildMI(&MBB, get(Opc)).addMBB(TBB);
+  BuildMI(&MBB, get(X86::JMP)).addMBB(FBB);
+  return 2;
+}
+
+bool X86InstrInfo::BlockHasNoFallThrough(MachineBasicBlock &MBB) const {
+  if (MBB.empty()) return false;
+  
+  switch (MBB.back().getOpcode()) {
+  case X86::RET:     // Return.
+  case X86::RETI:
+  case X86::TAILJMPd:
+  case X86::TAILJMPr:
+  case X86::TAILJMPm:
+  case X86::JMP:     // Uncond branch.
+  case X86::JMP32r:  // Indirect branch.
+  case X86::JMP32m:  // Indirect branch through mem.
+    return true;
+  default: return false;
+  }
+}
+
+bool X86InstrInfo::
+ReverseBranchCondition(std::vector<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1 && "Invalid X86 branch condition!");
+  Cond[0].setImm(GetOppositeBranchCondition((X86::CondCode)Cond[0].getImm()));
+  return false;
+}
+
+const TargetRegisterClass *X86InstrInfo::getPointerRegClass() const {
+  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+  if (Subtarget->is64Bit())
+    return &X86::GR64RegClass;
+  else
+    return &X86::GR32RegClass;
+}
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
new file mode 100644
index 0000000..ec30cc7
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -0,0 +1,287 @@
+//===- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*- ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRUCTIONINFO_H
+#define X86INSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "X86RegisterInfo.h"
+
+namespace llvm {
+  class X86RegisterInfo;
+  class X86TargetMachine;
+
+namespace X86 {
+  // X86 specific condition code. These correspond to X86_*_COND in
+  // X86InstrInfo.td. They must be kept in sync.
+  enum CondCode {
+    COND_A  = 0,
+    COND_AE = 1,
+    COND_B  = 2,
+    COND_BE = 3,
+    COND_E  = 4,
+    COND_G  = 5,
+    COND_GE = 6,
+    COND_L  = 7,
+    COND_LE = 8,
+    COND_NE = 9,
+    COND_NO = 10,
+    COND_NP = 11,
+    COND_NS = 12,
+    COND_O  = 13,
+    COND_P  = 14,
+    COND_S  = 15,
+    COND_INVALID
+  };
+  
+  // Turn condition code into conditional branch opcode.
+  unsigned GetCondBranchFromCond(CondCode CC);
+  
+  /// GetOppositeBranchCondition - Return the inverse of the specified cond,
+  /// e.g. turning COND_E to COND_NE.
+  CondCode GetOppositeBranchCondition(X86::CondCode CC);
+
+}
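+
+// For example, GetCondBranchFromCond(X86::COND_E) is expected to return the
+// JE opcode and GetOppositeBranchCondition(X86::COND_E) to return
+// X86::COND_NE; the branch analysis code in X86InstrInfo.cpp uses these
+// helpers to map between condition codes and conditional branch opcodes.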
+  
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+  enum {
+    //===------------------------------------------------------------------===//
+    // Instruction types.  These are the standard/most common forms for X86
+    // instructions.
+    //
+
+    // Pseudo - This represents an instruction that is a pseudo instruction
+    // or one that has not been implemented yet.  It is illegal to code generate
+    // it, but tolerated for intermediate implementation stages.
+    Pseudo         = 0,
+
+    /// RawFrm - This form is for instructions that don't have any operands, so
+    /// they are just a fixed opcode value, like 'leave'.
+    RawFrm         = 1,
+
+    /// AddRegFrm - This form is used for instructions like 'push r32' that have
+    /// their one register operand added to their opcode.
+    AddRegFrm      = 2,
+
+    /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
+    /// to specify a destination, which in this case is a register.
+    ///
+    MRMDestReg     = 3,
+
+    /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
+    /// to specify a destination, which in this case is memory.
+    ///
+    MRMDestMem     = 4,
+
+    /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
+    /// to specify a source, which in this case is a register.
+    ///
+    MRMSrcReg      = 5,
+
+    /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
+    /// to specify a source, which in this case is memory.
+    ///
+    MRMSrcMem      = 6,
+
+    /// MRM[0-7][rm] - These forms are used to represent instructions that use
+    /// a Mod/RM byte, and use the middle field to hold extended opcode
+    /// information.  In the Intel manual these are represented as /0, /1, ...
+    ///
+
+    // First, instructions that operate on a register r/m operand...
+    MRM0r = 16,  MRM1r = 17,  MRM2r = 18,  MRM3r = 19, // Format /0 /1 /2 /3
+    MRM4r = 20,  MRM5r = 21,  MRM6r = 22,  MRM7r = 23, // Format /4 /5 /6 /7
+
+    // Next, instructions that operate on a memory r/m operand...
+    MRM0m = 24,  MRM1m = 25,  MRM2m = 26,  MRM3m = 27, // Format /0 /1 /2 /3
+    MRM4m = 28,  MRM5m = 29,  MRM6m = 30,  MRM7m = 31, // Format /4 /5 /6 /7
+
+    // MRMInitReg - This form is used for instructions whose source and
+    // destination are the same register.
+    MRMInitReg = 32,
+
+    FormMask       = 63,
+
+    //===------------------------------------------------------------------===//
+    // Actual flags...
+
+    // OpSize - Set if this instruction requires an operand size prefix (0x66),
+    // which most often indicates that the instruction operates on 16 bit data
+    // instead of 32 bit data.
+    OpSize      = 1 << 6,
+
+    // AdSize - Set if this instruction requires an address size prefix (0x67),
+    // which most often indicates that the instruction uses 16 bit addresses
+    // instead of 32 bit addresses (or 32 bit addresses instead of 64 bit
+    // addresses in 64 bit mode).
+    AdSize      = 1 << 7,
+
+    //===------------------------------------------------------------------===//
+    // Op0Mask - There are several prefix bytes that are used to form two byte
+    // opcodes.  These are currently 0x0F, 0xF3, and 0xD8-0xDF.  This mask is
+    // used to obtain the setting of this field.  If no bits in this field are
+    // set, there is no prefix byte for obtaining a multibyte opcode.
+    //
+    Op0Shift    = 8,
+    Op0Mask     = 0xF << Op0Shift,
+
+    // TB - TwoByte - Set if this instruction has a two byte opcode, which
+    // starts with a 0x0F byte before the real opcode.
+    TB          = 1 << Op0Shift,
+
+    // REP - The 0xF3 prefix byte indicating repetition of the following
+    // instruction.
+    REP         = 2 << Op0Shift,
+
+    // D8-DF - These escape opcodes are used by the floating point unit.  These
+    // values must remain sequential.
+    D8 = 3 << Op0Shift,   D9 = 4 << Op0Shift,
+    DA = 5 << Op0Shift,   DB = 6 << Op0Shift,
+    DC = 7 << Op0Shift,   DD = 8 << Op0Shift,
+    DE = 9 << Op0Shift,   DF = 10 << Op0Shift,
+
+    // XS, XD - These prefix codes are for single and double precision scalar
+    // floating point operations performed in the SSE registers.
+    XD = 11 << Op0Shift,  XS = 12 << Op0Shift,
+
+    // T8, TA - Prefix after the 0x0F prefix.
+    T8 = 13 << Op0Shift,  TA = 14 << Op0Shift,
+
+    //===------------------------------------------------------------------===//
+    // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
+    // They are used to specify GPRs and SSE registers, 64-bit operand size,
+    // etc. We only care about the REX.W and REX.R bits, and only the former is
+    // statically determined.
+    //
+    REXShift    = 12,
+    REX_W       = 1 << REXShift,
+
+    //===------------------------------------------------------------------===//
+    // This three-bit field describes the size of an immediate operand.  Zero is
+    // unused so that we can tell if we forgot to set a value.
+    ImmShift = 13,
+    ImmMask  = 7 << ImmShift,
+    Imm8     = 1 << ImmShift,
+    Imm16    = 2 << ImmShift,
+    Imm32    = 3 << ImmShift,
+    Imm64    = 4 << ImmShift,
+
+    //===------------------------------------------------------------------===//
+    // FP Instruction Classification...  Zero is non-fp instruction.
+
+    // FPTypeMask - Mask for all of the FP types...
+    FPTypeShift = 16,
+    FPTypeMask  = 7 << FPTypeShift,
+
+    // NotFP - The default, set for instructions that do not use FP registers.
+    NotFP      = 0 << FPTypeShift,
+
+    // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), e.g. fld0
+    ZeroArgFP  = 1 << FPTypeShift,
+
+    // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
+    OneArgFP   = 2 << FPTypeShift,
+
+    // OneArgFPRW - 1 arg FP instruction which implicitly reads ST(0) and writes a
+    // result back to ST(0).  For example, fcos, fsqrt, etc.
+    //
+    OneArgFPRW = 3 << FPTypeShift,
+
+    // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
+    // explicit argument, storing the result to either ST(0) or the implicit
+    // argument.  For example: fadd, fsub, fmul, etc...
+    TwoArgFP   = 4 << FPTypeShift,
+
+    // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
+    // explicit argument, but have no destination.  Example: fucom, fucomi, ...
+    CompareFP  = 5 << FPTypeShift,
+
+    // CondMovFP - "2 operand" floating point conditional move instructions.
+    CondMovFP  = 6 << FPTypeShift,
+
+    // SpecialFP - Special instruction forms.  Dispatch by opcode explicitly.
+    SpecialFP  = 7 << FPTypeShift,
+
+    // Bits 19 -> 23 are unused
+    OpcodeShift   = 24,
+    OpcodeMask    = 0xFF << OpcodeShift
+  };
+}
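+
+// As a sketch (hypothetical snippet) of how the packed TSFlags word is meant
+// to be decoded with the masks above:
+//
+//   unsigned Form    = TSFlags & X86II::FormMask;      // e.g. X86II::MRMSrcMem
+//   unsigned ImmType = (TSFlags & X86II::ImmMask) >> X86II::ImmShift; // 0-4
+//   unsigned FPType  = TSFlags & X86II::FPTypeMask;    // e.g. X86II::TwoArgFP
+//   bool     HasRexW = (TSFlags & X86II::REX_W) != 0;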
+
+class X86InstrInfo : public TargetInstrInfo {
+  X86TargetMachine &TM;
+  const X86RegisterInfo RI;
+public:
+  X86InstrInfo(X86TargetMachine &tm);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const MRegisterInfo &getRegisterInfo() const { return RI; }
+
+  // Return true if the instruction is a register to register move and
+  // leave the source and dest operands in the passed parameters.
+  //
+  bool isMoveInstr(const MachineInstr& MI, unsigned& sourceReg,
+                   unsigned& destReg) const;
+  unsigned isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const;
+  unsigned isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const;
+  bool isReallyTriviallyReMaterializable(MachineInstr *MI) const;
+  
+  /// convertToThreeAddress - This method must be implemented by targets that
+  /// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
+  /// may be able to convert a two-address instruction into a true
+  /// three-address instruction on demand.  This allows the X86 target (for
+  /// example) to convert ADD and SHL instructions into LEA instructions if they
+  /// would require register copies due to two-addressness.
+  ///
+  /// This method returns a null pointer if the transformation cannot be
+  /// performed, otherwise it returns the new instruction.
+  ///
+  virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+                                              MachineBasicBlock::iterator &MBBI,
+                                              LiveVariables &LV) const;
+
+  /// commuteInstruction - We have a few instructions that must be hacked on to
+  /// commute them.
+  ///
+  virtual MachineInstr *commuteInstruction(MachineInstr *MI) const;
+
+  // Branch analysis.
+  virtual bool isUnpredicatedTerminator(const MachineInstr* MI) const;
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             std::vector<MachineOperand> &Cond) const;
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                                const std::vector<MachineOperand> &Cond) const;
+  virtual bool BlockHasNoFallThrough(MachineBasicBlock &MBB) const;
+  virtual bool ReverseBranchCondition(std::vector<MachineOperand> &Cond) const;
+
+  const TargetRegisterClass *getPointerRegClass() const;
+
+  // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
+  // specified opcode number.
+  //
+  unsigned char getBaseOpcodeFor(const TargetInstrDescriptor *TID) const {
+    return TID->TSFlags >> X86II::OpcodeShift;
+  }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
new file mode 100644
index 0000000..b24f644
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -0,0 +1,2674 @@
+//===- X86InstrInfo.td - Describe the X86 Instruction Set -------*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 specific DAG Nodes.
+//
+
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                   SDTCisInt<0>, SDTCisInt<3>]>;
+
+def SDTX86CmpTest : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+
+def SDTX86Cmov    : SDTypeProfile<1, 3,
+                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+                                   SDTCisVT<3, i8>]>;
+
+def SDTX86BrCond  : SDTypeProfile<0, 2,
+                                  [SDTCisVT<0, OtherVT>, SDTCisVT<1, i8>]>;
+
+def SDTX86SetCC   : SDTypeProfile<1, 1,
+                                  [SDTCisVT<0, i8>, SDTCisVT<1, i8>]>;
+
+def SDTX86Ret     : SDTypeProfile<0, 1, [SDTCisVT<0, i16>]>;
+
+def SDT_X86CallSeqStart : SDTypeProfile<0, 1, [ SDTCisVT<0, i32> ]>;
+def SDT_X86CallSeqEnd   : SDTypeProfile<0, 2, [ SDTCisVT<0, i32>,
+                                                SDTCisVT<1, i32> ]>;
+
+def SDT_X86Call   : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
+
+def SDTX86RepStr  : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
+
+def SDTX86RdTsc   : SDTypeProfile<0, 0, []>;
+
+def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def SDT_X86TLSADDR : SDTypeProfile<1, 1, [SDTCisPtrTy<0>, SDTCisInt<1>]>;
+
+def SDT_X86TLSTP : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
+
+def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def X86shld    : SDNode<"X86ISD::SHLD",     SDTIntShiftDOp>;
+def X86shrd    : SDNode<"X86ISD::SHRD",     SDTIntShiftDOp>;
+
+def X86cmp     : SDNode<"X86ISD::CMP" ,     SDTX86CmpTest,
+                        [SDNPHasChain, SDNPOutFlag]>;
+
+def X86cmov    : SDNode<"X86ISD::CMOV",     SDTX86Cmov,    
+                        [SDNPInFlag, SDNPOutFlag]>;
+def X86brcond  : SDNode<"X86ISD::BRCOND",   SDTX86BrCond,
+                        [SDNPHasChain, SDNPInFlag]>;
+def X86setcc   : SDNode<"X86ISD::SETCC",    SDTX86SetCC,
+                        [SDNPInFlag, SDNPOutFlag]>;
+
+def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
+                        [SDNPHasChain, SDNPOptInFlag]>;
+
+def X86callseq_start :
+                 SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
+                        [SDNPHasChain, SDNPOutFlag]>;
+def X86callseq_end :
+                 SDNode<"ISD::CALLSEQ_END",   SDT_X86CallSeqEnd,
+                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+
+def X86call    : SDNode<"X86ISD::CALL",     SDT_X86Call,
+                        [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+
+def X86tailcall: SDNode<"X86ISD::TAILCALL",     SDT_X86Call,
+                        [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+
+def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
+                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
+                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+
+def X86rdtsc   : SDNode<"X86ISD::RDTSC_DAG",SDTX86RdTsc,
+                        [SDNPHasChain, SDNPOutFlag]>;
+
+def X86Wrapper    : SDNode<"X86ISD::Wrapper",     SDTX86Wrapper>;
+def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP",  SDTX86Wrapper>;
+
+def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
+                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+def X86TLStp : SDNode<"X86ISD::THREAD_POINTER", SDT_X86TLSTP, []>;
+
+def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
+                        [SDNPHasChain]>;
+
+
+//===----------------------------------------------------------------------===//
+// X86 Operand Definitions.
+//
+
+// *mem - Operand definitions for the funky X86 addressing mode operands.
+//
+class X86MemOperand<string printMethod> : Operand<iPTR> {
+  let PrintMethod = printMethod;
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm);
+}
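+
+// The four sub-operands are, in order, the base register, the scale
+// immediate, the index register and the displacement that together form an
+// X86 [base + scale*index + disp] address.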
+
+def i8mem   : X86MemOperand<"printi8mem">;
+def i16mem  : X86MemOperand<"printi16mem">;
+def i32mem  : X86MemOperand<"printi32mem">;
+def i64mem  : X86MemOperand<"printi64mem">;
+def i128mem : X86MemOperand<"printi128mem">;
+def f32mem  : X86MemOperand<"printf32mem">;
+def f64mem  : X86MemOperand<"printf64mem">;
+def f128mem : X86MemOperand<"printf128mem">;
+
+def lea32mem : Operand<i32> {
+  let PrintMethod = "printi32mem";
+  let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
+}
+
+def SSECC : Operand<i8> {
+  let PrintMethod = "printSSECC";
+}
+
+def piclabel: Operand<i32> {
+  let PrintMethod = "printPICLabel";
+}
+
+// A couple of more-descriptive operand definitions.
+// 16-bits but only 8 bits are significant.
+def i16i8imm  : Operand<i16>;
+// 32-bits but only 8 bits are significant.
+def i32i8imm  : Operand<i32>;
+
+// Branch targets have OtherVT type.
+def brtarget : Operand<OtherVT>;
+
+//===----------------------------------------------------------------------===//
+// X86 Complex Pattern Definitions.
+//
+
+// Define X86 specific addressing mode.
+def addr      : ComplexPattern<iPTR, 4, "SelectAddr", [], []>;
+def lea32addr : ComplexPattern<i32, 4, "SelectLEAAddr",
+                               [add, mul, shl, or, frameindex], []>;
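+
+// Both complex patterns produce four result operands, matching the
+// (base, scale, index, disp) layout of the memory operands defined above.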
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<6> val> {
+  bits<6> Value = val;
+}
+
+def Pseudo     : Format<0>; def RawFrm     : Format<1>;
+def AddRegFrm  : Format<2>; def MRMDestReg : Format<3>;
+def MRMDestMem : Format<4>; def MRMSrcReg  : Format<5>;
+def MRMSrcMem  : Format<6>;
+def MRM0r  : Format<16>; def MRM1r  : Format<17>; def MRM2r  : Format<18>;
+def MRM3r  : Format<19>; def MRM4r  : Format<20>; def MRM5r  : Format<21>;
+def MRM6r  : Format<22>; def MRM7r  : Format<23>;
+def MRM0m  : Format<24>; def MRM1m  : Format<25>; def MRM2m  : Format<26>;
+def MRM3m  : Format<27>; def MRM4m  : Format<28>; def MRM5m  : Format<29>;
+def MRM6m  : Format<30>; def MRM7m  : Format<31>;
+def MRMInitReg : Format<32>;
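+
+// Note that these Format values mirror the form encodings in the X86II
+// namespace in X86InstrInfo.h (Pseudo = 0 ... MRMInitReg = 32) and are
+// presumably expected to stay in sync with them.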
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Predicate Definitions.
+def HasMMX       : Predicate<"Subtarget->hasMMX()">;
+def HasSSE1      : Predicate<"Subtarget->hasSSE1()">;
+def HasSSE2      : Predicate<"Subtarget->hasSSE2()">;
+def HasSSE3      : Predicate<"Subtarget->hasSSE3()">;
+def HasSSSE3     : Predicate<"Subtarget->hasSSSE3()">;
+def FPStack      : Predicate<"!Subtarget->hasSSE2()">;
+def In32BitMode  : Predicate<"!Subtarget->is64Bit()">;
+def In64BitMode  : Predicate<"Subtarget->is64Bit()">;
+def SmallCode    : Predicate<"TM.getCodeModel() == CodeModel::Small">;
+def NotSmallCode : Predicate<"TM.getCodeModel() != CodeModel::Small">;
+def IsStatic     : Predicate<"TM.getRelocationModel() == Reloc::Static">;
+
+//===----------------------------------------------------------------------===//
+// X86 specific pattern fragments.
+//
+
+// ImmType - This specifies the immediate type used by an instruction. This is
+// part of the ad-hoc solution used to emit machine instruction encodings by our
+// machine code emitter.
+class ImmType<bits<3> val> {
+  bits<3> Value = val;
+}
+def NoImm  : ImmType<0>;
+def Imm8   : ImmType<1>;
+def Imm16  : ImmType<2>;
+def Imm32  : ImmType<3>;
+def Imm64  : ImmType<4>;
+
+// FPFormat - This specifies what form this FP instruction has.  This is used by
+// the Floating-Point stackifier pass.
+class FPFormat<bits<3> val> {
+  bits<3> Value = val;
+}
+def NotFP      : FPFormat<0>;
+def ZeroArgFP  : FPFormat<1>;
+def OneArgFP   : FPFormat<2>;
+def OneArgFPRW : FPFormat<3>;
+def TwoArgFP   : FPFormat<4>;
+def CompareFP  : FPFormat<5>;
+def CondMovFP  : FPFormat<6>;
+def SpecialFP  : FPFormat<7>;
+
+
+class X86Inst<bits<8> opcod, Format f, ImmType i, dag ops, string AsmStr>
+  : Instruction {
+  let Namespace = "X86";
+
+  bits<8> Opcode = opcod;
+  Format Form = f;
+  bits<6> FormBits = Form.Value;
+  ImmType ImmT = i;
+  bits<3> ImmTypeBits = ImmT.Value;
+
+  dag OperandList = ops;
+  string AsmString = AsmStr;
+
+  //
+  // Attributes specific to X86 instructions...
+  //
+  bit hasOpSizePrefix = 0;  // Does this inst have a 0x66 prefix?
+  bit hasAdSizePrefix = 0;  // Does this inst have a 0x67 prefix?
+
+  bits<4> Prefix = 0;       // Which prefix byte does this inst have?
+  bit hasREX_WPrefix  = 0;  // Does this inst require the REX.W prefix?
+  FPFormat FPForm;          // What flavor of FP instruction is this?
+  bits<3> FPFormBits = 0;
+}
+
+
+// Prefix byte classes which are used to indicate to the ad-hoc machine code
+// emitter that various prefix bytes are required.
+class OpSize { bit hasOpSizePrefix = 1; }
+class AdSize { bit hasAdSizePrefix = 1; }
+class REX_W  { bit hasREX_WPrefix = 1; }
+class TB     { bits<4> Prefix = 1; }
+class REP    { bits<4> Prefix = 2; }
+class D8     { bits<4> Prefix = 3; }
+class D9     { bits<4> Prefix = 4; }
+class DA     { bits<4> Prefix = 5; }
+class DB     { bits<4> Prefix = 6; }
+class DC     { bits<4> Prefix = 7; }
+class DD     { bits<4> Prefix = 8; }
+class DE     { bits<4> Prefix = 9; }
+class DF     { bits<4> Prefix = 10; }
+class XD     { bits<4> Prefix = 11; }
+class XS     { bits<4> Prefix = 12; }
+class T8     { bits<4> Prefix = 13; }
+class TA     { bits<4> Prefix = 14; }
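+
+// The Prefix values above (TB = 1 ... TA = 14) mirror the Op0 field encodings
+// in the X86II namespace in X86InstrInfo.h and presumably must stay in sync
+// with them.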
+
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments...
+//
+
+// X86 specific condition code. These correspond to CondCode in
+// X86InstrInfo.h. They must be kept in sync.
+def X86_COND_A   : PatLeaf<(i8 0)>;
+def X86_COND_AE  : PatLeaf<(i8 1)>;
+def X86_COND_B   : PatLeaf<(i8 2)>;
+def X86_COND_BE  : PatLeaf<(i8 3)>;
+def X86_COND_E   : PatLeaf<(i8 4)>;
+def X86_COND_G   : PatLeaf<(i8 5)>;
+def X86_COND_GE  : PatLeaf<(i8 6)>;
+def X86_COND_L   : PatLeaf<(i8 7)>;
+def X86_COND_LE  : PatLeaf<(i8 8)>;
+def X86_COND_NE  : PatLeaf<(i8 9)>;
+def X86_COND_NO  : PatLeaf<(i8 10)>;
+def X86_COND_NP  : PatLeaf<(i8 11)>;
+def X86_COND_NS  : PatLeaf<(i8 12)>;
+def X86_COND_O   : PatLeaf<(i8 13)>;
+def X86_COND_P   : PatLeaf<(i8 14)>;
+def X86_COND_S   : PatLeaf<(i8 15)>;
+
+def i16immSExt8  : PatLeaf<(i16 imm), [{
+  // i16immSExt8 predicate - True if the 16-bit immediate fits in an 8-bit
+  // sign extended field.
+  return (int16_t)N->getValue() == (int8_t)N->getValue();
+}]>;
+
+def i32immSExt8  : PatLeaf<(i32 imm), [{
+  // i32immSExt8 predicate - True if the 32-bit immediate fits in an 8-bit
+  // sign extended field.
+  return (int32_t)N->getValue() == (int8_t)N->getValue();
+}]>;
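+
+// For example, an i32 immediate of 127 satisfies i32immSExt8 (truncating to
+// i8 and sign extending yields 127 again), while 128 does not (0x80 sign
+// extends to -128).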
+
+// Helper fragments for loads.
+def loadi8  : PatFrag<(ops node:$ptr), (i8  (load node:$ptr))>;
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (load node:$ptr))>;
+def loadi32 : PatFrag<(ops node:$ptr), (i32 (load node:$ptr))>;
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
+
+def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
+def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
+
+def sextloadi16i1  : PatFrag<(ops node:$ptr), (i16 (sextloadi1 node:$ptr))>;
+def sextloadi32i1  : PatFrag<(ops node:$ptr), (i32 (sextloadi1 node:$ptr))>;
+def sextloadi16i8  : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
+def sextloadi32i8  : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
+def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+
+def zextloadi8i1   : PatFrag<(ops node:$ptr), (i8  (zextloadi1 node:$ptr))>;
+def zextloadi16i1  : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
+def zextloadi32i1  : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
+def zextloadi16i8  : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def zextloadi32i8  : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
+def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+
+def extloadi8i1    : PatFrag<(ops node:$ptr), (i8  (extloadi1 node:$ptr))>;
+def extloadi16i1   : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
+def extloadi32i1   : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
+def extloadi16i8   : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
+def extloadi32i8   : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
+def extloadi32i16  : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+
+//===----------------------------------------------------------------------===//
+// Instruction templates...
+//
+
+class I<bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+  : X86Inst<o, f, NoImm, ops, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii8 <bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+  : X86Inst<o, f, Imm8 , ops, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii16<bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+  : X86Inst<o, f, Imm16, ops, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii32<bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+  : X86Inst<o, f, Imm32, ops, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction list...
+//
+
+// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+def ADJCALLSTACKDOWN : I<0, Pseudo, (ops i32imm:$amt), "#ADJCALLSTACKDOWN",
+                         [(X86callseq_start imm:$amt)]>, Imp<[ESP],[ESP]>;
+def ADJCALLSTACKUP   : I<0, Pseudo, (ops i32imm:$amt1, i32imm:$amt2),
+                         "#ADJCALLSTACKUP",
+                         [(X86callseq_end imm:$amt1, imm:$amt2)]>,
+                         Imp<[ESP],[ESP]>;
+def IMPLICIT_USE     : I<0, Pseudo, (ops variable_ops), "#IMPLICIT_USE", []>;
+def IMPLICIT_DEF     : I<0, Pseudo, (ops variable_ops), "#IMPLICIT_DEF", []>;
+def IMPLICIT_DEF_GR8  : I<0, Pseudo, (ops GR8:$dst),
+                         "#IMPLICIT_DEF $dst",
+                         [(set GR8:$dst, (undef))]>;
+def IMPLICIT_DEF_GR16  : I<0, Pseudo, (ops GR16:$dst),
+                         "#IMPLICIT_DEF $dst",
+                         [(set GR16:$dst, (undef))]>;
+def IMPLICIT_DEF_GR32  : I<0, Pseudo, (ops GR32:$dst),
+                         "#IMPLICIT_DEF $dst",
+                         [(set GR32:$dst, (undef))]>;
+
+// Nop
+def NOOP : I<0x90, RawFrm, (ops), "nop", []>;
+
+// Truncate
+def TRUNC_32_to8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR32_:$src),
+                     "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}", []>;
+def TRUNC_16_to8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR16_:$src),
+                     "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}", []>;
+def TRUNC_32to16 : I<0x89, MRMDestReg, (ops GR16:$dst, GR32:$src),
+                     "mov{w} {${src:subreg16}, $dst|$dst, ${src:subreg16}}",
+                     [(set GR16:$dst, (trunc GR32:$src))]>;
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions...
+//
+
+// Return instructions.
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, noResults = 1 in {
+  def RET    : I<0xC3, RawFrm, (ops), "ret", [(X86retflag 0)]>;
+  def RETI   : Ii16<0xC2, RawFrm, (ops i16imm:$amt), "ret $amt",
+                    [(X86retflag imm:$amt)]>;
+}
+
+// All branches are RawFrm, Void, Branch, and Terminators
+let isBranch = 1, isTerminator = 1, noResults = 1 in
+  class IBr<bits<8> opcode, dag ops, string asm, list<dag> pattern> :
+        I<opcode, RawFrm, ops, asm, pattern>;
+
+// Unconditional branch
+let isBranch = 1, isBarrier = 1 in
+  def JMP : IBr<0xE9, (ops brtarget:$dst), "jmp $dst", [(br bb:$dst)]>;
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, noResults = 1, isBarrier = 1 in {
+  def JMP32r     : I<0xFF, MRM4r, (ops GR32:$dst), "jmp{l} {*}$dst",
+                     [(brind GR32:$dst)]>;
+  def JMP32m     : I<0xFF, MRM4m, (ops i32mem:$dst), "jmp{l} {*}$dst",
+                     [(brind (loadi32 addr:$dst))]>;
+}
+
+// Conditional branches
+def JE  : IBr<0x84, (ops brtarget:$dst), "je $dst",
+              [(X86brcond bb:$dst, X86_COND_E)]>, TB;
+def JNE : IBr<0x85, (ops brtarget:$dst), "jne $dst",
+              [(X86brcond bb:$dst, X86_COND_NE)]>, TB;
+def JL  : IBr<0x8C, (ops brtarget:$dst), "jl $dst",
+              [(X86brcond bb:$dst, X86_COND_L)]>, TB;
+def JLE : IBr<0x8E, (ops brtarget:$dst), "jle $dst",
+              [(X86brcond bb:$dst, X86_COND_LE)]>, TB;
+def JG  : IBr<0x8F, (ops brtarget:$dst), "jg $dst",
+              [(X86brcond bb:$dst, X86_COND_G)]>, TB;
+def JGE : IBr<0x8D, (ops brtarget:$dst), "jge $dst",
+              [(X86brcond bb:$dst, X86_COND_GE)]>, TB;
+
+def JB  : IBr<0x82, (ops brtarget:$dst), "jb $dst",
+              [(X86brcond bb:$dst, X86_COND_B)]>, TB;
+def JBE : IBr<0x86, (ops brtarget:$dst), "jbe $dst",
+              [(X86brcond bb:$dst, X86_COND_BE)]>, TB;
+def JA  : IBr<0x87, (ops brtarget:$dst), "ja $dst",
+              [(X86brcond bb:$dst, X86_COND_A)]>, TB;
+def JAE : IBr<0x83, (ops brtarget:$dst), "jae $dst",
+              [(X86brcond bb:$dst, X86_COND_AE)]>, TB;
+
+def JS  : IBr<0x88, (ops brtarget:$dst), "js $dst",
+              [(X86brcond bb:$dst, X86_COND_S)]>, TB;
+def JNS : IBr<0x89, (ops brtarget:$dst), "jns $dst",
+              [(X86brcond bb:$dst, X86_COND_NS)]>, TB;
+def JP  : IBr<0x8A, (ops brtarget:$dst), "jp $dst",
+              [(X86brcond bb:$dst, X86_COND_P)]>, TB;
+def JNP : IBr<0x8B, (ops brtarget:$dst), "jnp $dst",
+              [(X86brcond bb:$dst, X86_COND_NP)]>, TB;
+def JO  : IBr<0x80, (ops brtarget:$dst), "jo $dst",
+              [(X86brcond bb:$dst, X86_COND_O)]>, TB;
+def JNO : IBr<0x81, (ops brtarget:$dst), "jno $dst",
+              [(X86brcond bb:$dst, X86_COND_NO)]>, TB;
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+let isCall = 1, noResults = 1 in
+  // All calls clobber the non-callee saved registers...
+  let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+              MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+              XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7] in {
+    def CALLpcrel32 : I<0xE8, RawFrm, (ops i32imm:$dst, variable_ops),
+                        "call ${dst:call}", []>;
+    def CALL32r     : I<0xFF, MRM2r, (ops GR32:$dst, variable_ops),
+                        "call {*}$dst", [(X86call GR32:$dst)]>;
+    def CALL32m     : I<0xFF, MRM2m, (ops i32mem:$dst, variable_ops),
+                        "call {*}$dst", []>;
+  }
+
+// Tail call stuff.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, noResults = 1 in
+  def TAILJMPd : IBr<0xE9, (ops i32imm:$dst), "jmp ${dst:call}  # TAIL CALL",
+                 []>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, noResults = 1 in
+  def TAILJMPr : I<0xFF, MRM4r, (ops GR32:$dst), "jmp {*}$dst  # TAIL CALL",
+                 []>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, noResults = 1 in
+  def TAILJMPm : I<0xFF, MRM4m, (ops i32mem:$dst),
+                   "jmp {*}$dst  # TAIL CALL", []>;
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions...
+//
+def LEAVE    : I<0xC9, RawFrm,
+                 (ops), "leave", []>, Imp<[EBP,ESP],[EBP,ESP]>;
+def POP32r   : I<0x58, AddRegFrm,
+                 (ops GR32:$reg), "pop{l} $reg", []>, Imp<[ESP],[ESP]>;
+
+def PUSH32r  : I<0x50, AddRegFrm,
+                 (ops GR32:$reg), "push{l} $reg", []>, Imp<[ESP],[ESP]>;
+
+def MovePCtoStack : I<0, Pseudo, (ops piclabel:$label),
+                      "call $label", []>;
+
+let isTwoAddress = 1 in                               // GR32 = bswap GR32
+  def BSWAP32r : I<0xC8, AddRegFrm,
+                   (ops GR32:$dst, GR32:$src),
+                   "bswap{l} $dst", 
+                   [(set GR32:$dst, (bswap GR32:$src))]>, TB;
+
+def XCHG8rr  : I<0x86, MRMDestReg,                    // xchg GR8, GR8
+                 (ops GR8:$src1, GR8:$src2),
+                 "xchg{b} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG16rr : I<0x87, MRMDestReg,                    // xchg GR16, GR16
+                 (ops GR16:$src1, GR16:$src2),
+                 "xchg{w} {$src2|$src1}, {$src1|$src2}", []>, OpSize;
+def XCHG32rr : I<0x87, MRMDestReg,                    // xchg GR32, GR32
+                 (ops GR32:$src1, GR32:$src2),
+                 "xchg{l} {$src2|$src1}, {$src1|$src2}", []>;
+
+def XCHG8mr  : I<0x86, MRMDestMem,
+                 (ops i8mem:$src1, GR8:$src2),
+                 "xchg{b} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG16mr : I<0x87, MRMDestMem,
+                 (ops i16mem:$src1, GR16:$src2),
+                 "xchg{w} {$src2|$src1}, {$src1|$src2}", []>, OpSize;
+def XCHG32mr : I<0x87, MRMDestMem,
+                 (ops i32mem:$src1, GR32:$src2),
+                 "xchg{l} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG8rm  : I<0x86, MRMSrcMem,
+                 (ops GR8:$src1, i8mem:$src2),
+                 "xchg{b} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG16rm : I<0x87, MRMSrcMem,
+                 (ops GR16:$src1, i16mem:$src2),
+                 "xchg{w} {$src2|$src1}, {$src1|$src2}", []>, OpSize;
+def XCHG32rm : I<0x87, MRMSrcMem,
+                 (ops GR32:$src1, i32mem:$src2),
+                 "xchg{l} {$src2|$src1}, {$src1|$src2}", []>;
+
+def LEA16r   : I<0x8D, MRMSrcMem,
+                 (ops GR16:$dst, i32mem:$src),
+                 "lea{w} {$src|$dst}, {$dst|$src}", []>, OpSize;
+def LEA32r   : I<0x8D, MRMSrcMem,
+                 (ops GR32:$dst, lea32mem:$src),
+                 "lea{l} {$src|$dst}, {$dst|$src}",
+                 [(set GR32:$dst, lea32addr:$src)]>, Requires<[In32BitMode]>;
+
+def REP_MOVSB : I<0xA4, RawFrm, (ops), "{rep;movsb|rep movsb}",
+                  [(X86rep_movs i8)]>,
+                Imp<[ECX,EDI,ESI], [ECX,EDI,ESI]>, REP;
+def REP_MOVSW : I<0xA5, RawFrm, (ops), "{rep;movsw|rep movsw}",
+                  [(X86rep_movs i16)]>,
+                Imp<[ECX,EDI,ESI], [ECX,EDI,ESI]>, REP, OpSize;
+def REP_MOVSD : I<0xA5, RawFrm, (ops), "{rep;movsl|rep movsd}",
+                  [(X86rep_movs i32)]>,
+                Imp<[ECX,EDI,ESI], [ECX,EDI,ESI]>, REP;
+
+def REP_STOSB : I<0xAA, RawFrm, (ops), "{rep;stosb|rep stosb}",
+                  [(X86rep_stos i8)]>,
+                Imp<[AL,ECX,EDI], [ECX,EDI]>, REP;
+def REP_STOSW : I<0xAB, RawFrm, (ops), "{rep;stosw|rep stosw}",
+                  [(X86rep_stos i16)]>,
+                Imp<[AX,ECX,EDI], [ECX,EDI]>, REP, OpSize;
+def REP_STOSD : I<0xAB, RawFrm, (ops), "{rep;stosl|rep stosd}",
+                  [(X86rep_stos i32)]>,
+                Imp<[EAX,ECX,EDI], [ECX,EDI]>, REP;
+
+def RDTSC : I<0x31, RawFrm, (ops), "rdtsc", [(X86rdtsc)]>,
+            TB, Imp<[],[RAX,RDX]>;
+
+//===----------------------------------------------------------------------===//
+//  Input/Output Instructions...
+//
+def IN8rr  : I<0xEC, RawFrm, (ops),
+               "in{b} {%dx, %al|%AL, %DX}",
+               []>,  Imp<[DX], [AL]>;
+def IN16rr : I<0xED, RawFrm, (ops),
+               "in{w} {%dx, %ax|%AX, %DX}",
+               []>,  Imp<[DX], [AX]>, OpSize;
+def IN32rr : I<0xED, RawFrm, (ops),
+               "in{l} {%dx, %eax|%EAX, %DX}",
+               []>, Imp<[DX],[EAX]>;
+
+def IN8ri  : Ii8<0xE4, RawFrm, (ops i16i8imm:$port),
+                  "in{b} {$port, %al|%AL, $port}",
+                 []>,
+             Imp<[], [AL]>;
+def IN16ri : Ii8<0xE5, RawFrm, (ops i16i8imm:$port),
+                  "in{w} {$port, %ax|%AX, $port}",
+                 []>,
+             Imp<[], [AX]>, OpSize;
+def IN32ri : Ii8<0xE5, RawFrm, (ops i16i8imm:$port),
+                  "in{l} {$port, %eax|%EAX, $port}",
+                 []>,
+             Imp<[],[EAX]>;
+
+def OUT8rr  : I<0xEE, RawFrm, (ops),
+                "out{b} {%al, %dx|%DX, %AL}",
+                []>,  Imp<[DX,  AL], []>;
+def OUT16rr : I<0xEF, RawFrm, (ops),
+                "out{w} {%ax, %dx|%DX, %AX}",
+                []>,  Imp<[DX,  AX], []>, OpSize;
+def OUT32rr : I<0xEF, RawFrm, (ops),
+                "out{l} {%eax, %dx|%DX, %EAX}",
+                []>, Imp<[DX, EAX], []>;
+
+def OUT8ir  : Ii8<0xE6, RawFrm, (ops i16i8imm:$port),
+                   "out{b} {%al, $port|$port, %AL}",
+                   []>,
+              Imp<[AL], []>;
+def OUT16ir : Ii8<0xE7, RawFrm, (ops i16i8imm:$port),
+                   "out{w} {%ax, $port|$port, %AX}",
+                   []>,
+              Imp<[AX], []>, OpSize;
+def OUT32ir : Ii8<0xE7, RawFrm, (ops i16i8imm:$port),
+                   "out{l} {%eax, $port|$port, %EAX}",
+                   []>,
+              Imp<[EAX], []>;
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions...
+//
+def MOV8rr  : I<0x88, MRMDestReg, (ops GR8 :$dst, GR8 :$src),
+                "mov{b} {$src, $dst|$dst, $src}", []>;
+def MOV16rr : I<0x89, MRMDestReg, (ops GR16:$dst, GR16:$src),
+                "mov{w} {$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32rr : I<0x89, MRMDestReg, (ops GR32:$dst, GR32:$src),
+                "mov{l} {$src, $dst|$dst, $src}", []>;
+let isReMaterializable = 1 in {
+def MOV8ri  : Ii8 <0xB0, AddRegFrm, (ops GR8 :$dst, i8imm :$src),
+                   "mov{b} {$src, $dst|$dst, $src}",
+                   [(set GR8:$dst, imm:$src)]>;
+def MOV16ri : Ii16<0xB8, AddRegFrm, (ops GR16:$dst, i16imm:$src),
+                   "mov{w} {$src, $dst|$dst, $src}",
+                   [(set GR16:$dst, imm:$src)]>, OpSize;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (ops GR32:$dst, i32imm:$src),
+                   "mov{l} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, imm:$src)]>;
+}
+def MOV8mi  : Ii8 <0xC6, MRM0m, (ops i8mem :$dst, i8imm :$src),
+                   "mov{b} {$src, $dst|$dst, $src}",
+                   [(store (i8 imm:$src), addr:$dst)]>;
+def MOV16mi : Ii16<0xC7, MRM0m, (ops i16mem:$dst, i16imm:$src),
+                   "mov{w} {$src, $dst|$dst, $src}",
+                   [(store (i16 imm:$src), addr:$dst)]>, OpSize;
+def MOV32mi : Ii32<0xC7, MRM0m, (ops i32mem:$dst, i32imm:$src),
+                   "mov{l} {$src, $dst|$dst, $src}",
+                   [(store (i32 imm:$src), addr:$dst)]>;
+
+def MOV8rm  : I<0x8A, MRMSrcMem, (ops GR8 :$dst, i8mem :$src),
+                "mov{b} {$src, $dst|$dst, $src}",
+                [(set GR8:$dst, (load addr:$src))]>;
+def MOV16rm : I<0x8B, MRMSrcMem, (ops GR16:$dst, i16mem:$src),
+                "mov{w} {$src, $dst|$dst, $src}",
+                [(set GR16:$dst, (load addr:$src))]>, OpSize;
+def MOV32rm : I<0x8B, MRMSrcMem, (ops GR32:$dst, i32mem:$src),
+                "mov{l} {$src, $dst|$dst, $src}",
+                [(set GR32:$dst, (load addr:$src))]>;
+
+def MOV8mr  : I<0x88, MRMDestMem, (ops i8mem :$dst, GR8 :$src),
+                "mov{b} {$src, $dst|$dst, $src}",
+                [(store GR8:$src, addr:$dst)]>;
+def MOV16mr : I<0x89, MRMDestMem, (ops i16mem:$dst, GR16:$src),
+                "mov{w} {$src, $dst|$dst, $src}",
+                [(store GR16:$src, addr:$dst)]>, OpSize;
+def MOV32mr : I<0x89, MRMDestMem, (ops i32mem:$dst, GR32:$src),
+                "mov{l} {$src, $dst|$dst, $src}",
+                [(store GR32:$src, addr:$dst)]>;
+                
+//===----------------------------------------------------------------------===//
+//  Fixed-Register Multiplication and Division Instructions...
+//
+
+// Extra precision multiplication
+def MUL8r  : I<0xF6, MRM4r, (ops GR8:$src), "mul{b} $src",
+               // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+               // This probably ought to be moved to a def : Pat<> if the
+               // syntax can be accepted.
+               [(set AL, (mul AL, GR8:$src))]>,
+             Imp<[AL],[AX]>;               // AL,AH = AL*GR8
+def MUL16r : I<0xF7, MRM4r, (ops GR16:$src), "mul{w} $src", []>,
+             Imp<[AX],[AX,DX]>, OpSize;    // AX,DX = AX*GR16
+def MUL32r : I<0xF7, MRM4r, (ops GR32:$src), "mul{l} $src", []>,
+             Imp<[EAX],[EAX,EDX]>;         // EAX,EDX = EAX*GR32
+def MUL8m  : I<0xF6, MRM4m, (ops i8mem :$src),
+               "mul{b} $src",
+               // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+               // This probably ought to be moved to a def : Pat<> if the
+               // syntax can be accepted.
+               [(set AL, (mul AL, (loadi8 addr:$src)))]>,
+             Imp<[AL],[AX]>;          // AL,AH = AL*[mem8]
+def MUL16m : I<0xF7, MRM4m, (ops i16mem:$src),
+               "mul{w} $src", []>, Imp<[AX],[AX,DX]>,
+               OpSize; // AX,DX = AX*[mem16]
+def MUL32m : I<0xF7, MRM4m, (ops i32mem:$src),
+               "mul{l} $src", []>, Imp<[EAX],[EAX,EDX]>;// EAX,EDX = EAX*[mem32]
+
+def IMUL8r  : I<0xF6, MRM5r, (ops GR8:$src), "imul{b} $src", []>,
+              Imp<[AL],[AX]>;               // AL,AH = AL*GR8
+def IMUL16r : I<0xF7, MRM5r, (ops GR16:$src), "imul{w} $src", []>,
+              Imp<[AX],[AX,DX]>, OpSize;    // AX,DX = AX*GR16
+def IMUL32r : I<0xF7, MRM5r, (ops GR32:$src), "imul{l} $src", []>,
+              Imp<[EAX],[EAX,EDX]>;         // EAX,EDX = EAX*GR32
+def IMUL8m  : I<0xF6, MRM5m, (ops i8mem :$src),
+                "imul{b} $src", []>, Imp<[AL],[AX]>;        // AL,AH = AL*[mem8]
+def IMUL16m : I<0xF7, MRM5m, (ops i16mem:$src),
+                "imul{w} $src", []>, Imp<[AX],[AX,DX]>,
+                OpSize; // AX,DX = AX*[mem16]
+def IMUL32m : I<0xF7, MRM5m, (ops i32mem:$src),
+                "imul{l} $src", []>,
+                Imp<[EAX],[EAX,EDX]>;  // EAX,EDX = EAX*[mem32]
+
+// Unsigned division/remainder.
+def DIV8r  : I<0xF6, MRM6r, (ops GR8:$src),          // AX/r8 = AL,AH
+               "div{b} $src", []>, Imp<[AX],[AX]>;
+def DIV16r : I<0xF7, MRM6r, (ops GR16:$src),         // DX:AX/r16 = AX,DX
+               "div{w} $src", []>, Imp<[AX,DX],[AX,DX]>, OpSize;
+def DIV32r : I<0xF7, MRM6r, (ops GR32:$src),         // EDX:EAX/r32 = EAX,EDX
+               "div{l} $src", []>, Imp<[EAX,EDX],[EAX,EDX]>;
+def DIV8m  : I<0xF6, MRM6m, (ops i8mem:$src),       // AX/[mem8] = AL,AH
+               "div{b} $src", []>, Imp<[AX],[AX]>;
+def DIV16m : I<0xF7, MRM6m, (ops i16mem:$src),      // DX:AX/[mem16] = AX,DX
+               "div{w} $src", []>, Imp<[AX,DX],[AX,DX]>, OpSize;
+def DIV32m : I<0xF7, MRM6m, (ops i32mem:$src),      // EDX:EAX/[mem32] = EAX,EDX
+               "div{l} $src", []>, Imp<[EAX,EDX],[EAX,EDX]>;
+
+// Signed division/remainder.
+def IDIV8r : I<0xF6, MRM7r, (ops GR8:$src),          // AX/r8 = AL,AH
+               "idiv{b} $src", []>, Imp<[AX],[AX]>;
+def IDIV16r: I<0xF7, MRM7r, (ops GR16:$src),         // DX:AX/r16 = AX,DX
+               "idiv{w} $src", []>, Imp<[AX,DX],[AX,DX]>, OpSize;
+def IDIV32r: I<0xF7, MRM7r, (ops GR32:$src),         // EDX:EAX/r32 = EAX,EDX
+               "idiv{l} $src", []>, Imp<[EAX,EDX],[EAX,EDX]>;
+def IDIV8m : I<0xF6, MRM7m, (ops i8mem:$src),      // AX/[mem8] = AL,AH
+               "idiv{b} $src", []>, Imp<[AX],[AX]>;
+def IDIV16m: I<0xF7, MRM7m, (ops i16mem:$src),     // DX:AX/[mem16] = AX,DX
+               "idiv{w} $src", []>, Imp<[AX,DX],[AX,DX]>, OpSize;
+def IDIV32m: I<0xF7, MRM7m, (ops i32mem:$src),     // EDX:EAX/[mem32] = EAX,EDX
+               "idiv{l} $src", []>, Imp<[EAX,EDX],[EAX,EDX]>;
+
+
+//===----------------------------------------------------------------------===//
+//  Two address Instructions...
+//
+let isTwoAddress = 1 in {
+
+// Conditional moves
+def CMOVB16rr : I<0x42, MRMSrcReg,       // if <u, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovb {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_B))]>,
+                  TB, OpSize;
+def CMOVB16rm : I<0x42, MRMSrcMem,       // if <u, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovb {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_B))]>,
+                  TB, OpSize;
+def CMOVB32rr : I<0x42, MRMSrcReg,       // if <u, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovb {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_B))]>,
+                   TB;
+def CMOVB32rm : I<0x42, MRMSrcMem,       // if <u, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovb {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_B))]>,
+                   TB;
+
+def CMOVAE16rr: I<0x43, MRMSrcReg,       // if >=u, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovae {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_AE))]>,
+                   TB, OpSize;
+def CMOVAE16rm: I<0x43, MRMSrcMem,       // if >=u, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovae {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_AE))]>,
+                   TB, OpSize;
+def CMOVAE32rr: I<0x43, MRMSrcReg,       // if >=u, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovae {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_AE))]>,
+                   TB;
+def CMOVAE32rm: I<0x43, MRMSrcMem,       // if >=u, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovae {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_AE))]>,
+                   TB;
+
+def CMOVE16rr : I<0x44, MRMSrcReg,       // if ==, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmove {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_E))]>,
+                   TB, OpSize;
+def CMOVE16rm : I<0x44, MRMSrcMem,       // if ==, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmove {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_E))]>,
+                   TB, OpSize;
+def CMOVE32rr : I<0x44, MRMSrcReg,       // if ==, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmove {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_E))]>,
+                   TB;
+def CMOVE32rm : I<0x44, MRMSrcMem,       // if ==, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmove {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_E))]>,
+                   TB;
+
+def CMOVNE16rr: I<0x45, MRMSrcReg,       // if !=, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovne {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_NE))]>,
+                   TB, OpSize;
+def CMOVNE16rm: I<0x45, MRMSrcMem,       // if !=, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovne {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_NE))]>,
+                   TB, OpSize;
+def CMOVNE32rr: I<0x45, MRMSrcReg,       // if !=, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovne {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_NE))]>,
+                   TB;
+def CMOVNE32rm: I<0x45, MRMSrcMem,       // if !=, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovne {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_NE))]>,
+                   TB;
+
+def CMOVBE16rr: I<0x46, MRMSrcReg,       // if <=u, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovbe {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_BE))]>,
+                   TB, OpSize;
+def CMOVBE16rm: I<0x46, MRMSrcMem,       // if <=u, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovbe {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_BE))]>,
+                   TB, OpSize;
+def CMOVBE32rr: I<0x46, MRMSrcReg,       // if <=u, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovbe {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_BE))]>,
+                   TB;
+def CMOVBE32rm: I<0x46, MRMSrcMem,       // if <=u, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovbe {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_BE))]>,
+                   TB;
+
+def CMOVA16rr : I<0x47, MRMSrcReg,       // if >u, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmova {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_A))]>,
+                   TB, OpSize;
+def CMOVA16rm : I<0x47, MRMSrcMem,       // if >u, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmova {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_A))]>,
+                   TB, OpSize;
+def CMOVA32rr : I<0x47, MRMSrcReg,       // if >u, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmova {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_A))]>,
+                   TB;
+def CMOVA32rm : I<0x47, MRMSrcMem,       // if >u, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmova {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_A))]>,
+                   TB;
+
+def CMOVL16rr : I<0x4C, MRMSrcReg,       // if <s, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovl {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_L))]>,
+                   TB, OpSize;
+def CMOVL16rm : I<0x4C, MRMSrcMem,       // if <s, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovl {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_L))]>,
+                   TB, OpSize;
+def CMOVL32rr : I<0x4C, MRMSrcReg,       // if <s, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovl {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_L))]>,
+                   TB;
+def CMOVL32rm : I<0x4C, MRMSrcMem,       // if <s, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovl {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_L))]>,
+                   TB;
+
+def CMOVGE16rr: I<0x4D, MRMSrcReg,       // if >=s, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovge {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_GE))]>,
+                   TB, OpSize;
+def CMOVGE16rm: I<0x4D, MRMSrcMem,       // if >=s, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovge {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_GE))]>,
+                   TB, OpSize;
+def CMOVGE32rr: I<0x4D, MRMSrcReg,       // if >=s, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovge {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_GE))]>,
+                   TB;
+def CMOVGE32rm: I<0x4D, MRMSrcMem,       // if >=s, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovge {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_GE))]>,
+                   TB;
+
+def CMOVLE16rr: I<0x4E, MRMSrcReg,       // if <=s, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovle {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_LE))]>,
+                   TB, OpSize;
+def CMOVLE16rm: I<0x4E, MRMSrcMem,       // if <=s, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovle {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_LE))]>,
+                   TB, OpSize;
+def CMOVLE32rr: I<0x4E, MRMSrcReg,       // if <=s, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovle {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_LE))]>,
+                   TB;
+def CMOVLE32rm: I<0x4E, MRMSrcMem,       // if <=s, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovle {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_LE))]>,
+                   TB;
+
+def CMOVG16rr : I<0x4F, MRMSrcReg,       // if >s, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovg {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_G))]>,
+                   TB, OpSize;
+def CMOVG16rm : I<0x4F, MRMSrcMem,       // if >s, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovg {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_G))]>,
+                   TB, OpSize;
+def CMOVG32rr : I<0x4F, MRMSrcReg,       // if >s, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovg {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_G))]>,
+                   TB;
+def CMOVG32rm : I<0x4F, MRMSrcMem,       // if >s, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovg {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_G))]>,
+                   TB;
+
+def CMOVS16rr : I<0x48, MRMSrcReg,       // if signed, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovs {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_S))]>,
+                  TB, OpSize;
+def CMOVS16rm : I<0x48, MRMSrcMem,       // if signed, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovs {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_S))]>,
+                  TB, OpSize;
+def CMOVS32rr : I<0x48, MRMSrcReg,       // if signed, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovs {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_S))]>,
+                  TB;
+def CMOVS32rm : I<0x48, MRMSrcMem,       // if signed, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovs {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_S))]>,
+                  TB;
+
+def CMOVNS16rr: I<0x49, MRMSrcReg,       // if !signed, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovns {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_NS))]>,
+                  TB, OpSize;
+def CMOVNS16rm: I<0x49, MRMSrcMem,       // if !signed, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovns {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_NS))]>,
+                  TB, OpSize;
+def CMOVNS32rr: I<0x49, MRMSrcReg,       // if !signed, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovns {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_NS))]>,
+                  TB;
+def CMOVNS32rm: I<0x49, MRMSrcMem,       // if !signed, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovns {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_NS))]>,
+                  TB;
+
+def CMOVP16rr : I<0x4A, MRMSrcReg,       // if parity, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovp {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_P))]>,
+                  TB, OpSize;
+def CMOVP16rm : I<0x4A, MRMSrcMem,       // if parity, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovp {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_P))]>,
+                  TB, OpSize;
+def CMOVP32rr : I<0x4A, MRMSrcReg,       // if parity, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovp {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_P))]>,
+                  TB;
+def CMOVP32rm : I<0x4A, MRMSrcMem,       // if parity, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovp {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_P))]>,
+                  TB;
+
+def CMOVNP16rr : I<0x4B, MRMSrcReg,       // if !parity, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovnp {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                    X86_COND_NP))]>,
+                  TB, OpSize;
+def CMOVNP16rm : I<0x4B, MRMSrcMem,       // if !parity, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovnp {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                    X86_COND_NP))]>,
+                  TB, OpSize;
+def CMOVNP32rr : I<0x4B, MRMSrcReg,       // if !parity, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovnp {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                    X86_COND_NP))]>,
+                  TB;
+def CMOVNP32rm : I<0x4B, MRMSrcMem,       // if !parity, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovnp {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                    X86_COND_NP))]>,
+                  TB;
+
+
+// Unary instructions
+let CodeSize = 2 in {
+def NEG8r  : I<0xF6, MRM3r, (ops GR8 :$dst, GR8 :$src), "neg{b} $dst",
+               [(set GR8:$dst, (ineg GR8:$src))]>;
+def NEG16r : I<0xF7, MRM3r, (ops GR16:$dst, GR16:$src), "neg{w} $dst",
+               [(set GR16:$dst, (ineg GR16:$src))]>, OpSize;
+def NEG32r : I<0xF7, MRM3r, (ops GR32:$dst, GR32:$src), "neg{l} $dst",
+               [(set GR32:$dst, (ineg GR32:$src))]>;
+let isTwoAddress = 0 in {
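+// These memory forms update the operand in place; there is no destination
+// register to tie to a source, so the two-address constraint does not apply.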
+  def NEG8m  : I<0xF6, MRM3m, (ops i8mem :$dst), "neg{b} $dst",
+                 [(store (ineg (loadi8 addr:$dst)), addr:$dst)]>;
+  def NEG16m : I<0xF7, MRM3m, (ops i16mem:$dst), "neg{w} $dst",
+                 [(store (ineg (loadi16 addr:$dst)), addr:$dst)]>, OpSize;
+  def NEG32m : I<0xF7, MRM3m, (ops i32mem:$dst), "neg{l} $dst",
+                 [(store (ineg (loadi32 addr:$dst)), addr:$dst)]>;
+}
+
+def NOT8r  : I<0xF6, MRM2r, (ops GR8 :$dst, GR8 :$src), "not{b} $dst",
+               [(set GR8:$dst, (not GR8:$src))]>;
+def NOT16r : I<0xF7, MRM2r, (ops GR16:$dst, GR16:$src), "not{w} $dst",
+               [(set GR16:$dst, (not GR16:$src))]>, OpSize;
+def NOT32r : I<0xF7, MRM2r, (ops GR32:$dst, GR32:$src), "not{l} $dst",
+               [(set GR32:$dst, (not GR32:$src))]>;
+let isTwoAddress = 0 in {
+  def NOT8m  : I<0xF6, MRM2m, (ops i8mem :$dst), "not{b} $dst",
+                 [(store (not (loadi8 addr:$dst)), addr:$dst)]>;
+  def NOT16m : I<0xF7, MRM2m, (ops i16mem:$dst), "not{w} $dst",
+                 [(store (not (loadi16 addr:$dst)), addr:$dst)]>, OpSize;
+  def NOT32m : I<0xF7, MRM2m, (ops i32mem:$dst), "not{l} $dst",
+                 [(store (not (loadi32 addr:$dst)), addr:$dst)]>;
+}
+} // CodeSize
+
+// TODO: inc/dec is slow for P4, but fast for Pentium-M.
+let CodeSize = 2 in
+def INC8r  : I<0xFE, MRM0r, (ops GR8 :$dst, GR8 :$src), "inc{b} $dst",
+               [(set GR8:$dst, (add GR8:$src, 1))]>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in {  // Can xform into LEA.
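+// The one-byte 0x40/0x48 register encodings are only valid in 32-bit mode;
+// in 64-bit mode those opcode bytes are REX prefixes, hence the
+// Requires<[In32BitMode]> predicate.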
+def INC16r : I<0x40, AddRegFrm, (ops GR16:$dst, GR16:$src), "inc{w} $dst",
+               [(set GR16:$dst, (add GR16:$src, 1))]>,
+             OpSize, Requires<[In32BitMode]>;
+def INC32r : I<0x40, AddRegFrm, (ops GR32:$dst, GR32:$src), "inc{l} $dst",
+               [(set GR32:$dst, (add GR32:$src, 1))]>, Requires<[In32BitMode]>;
+}
+let isTwoAddress = 0, CodeSize = 2 in {
+  def INC8m  : I<0xFE, MRM0m, (ops i8mem :$dst), "inc{b} $dst",
+               [(store (add (loadi8 addr:$dst), 1), addr:$dst)]>;
+  def INC16m : I<0xFF, MRM0m, (ops i16mem:$dst), "inc{w} $dst",
+               [(store (add (loadi16 addr:$dst), 1), addr:$dst)]>, OpSize;
+  def INC32m : I<0xFF, MRM0m, (ops i32mem:$dst), "inc{l} $dst",
+               [(store (add (loadi32 addr:$dst), 1), addr:$dst)]>;
+}
+
+let CodeSize = 2 in
+def DEC8r  : I<0xFE, MRM1r, (ops GR8 :$dst, GR8 :$src), "dec{b} $dst",
+               [(set GR8:$dst, (add GR8:$src, -1))]>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in {   // Can xform into LEA.
+def DEC16r : I<0x48, AddRegFrm, (ops GR16:$dst, GR16:$src), "dec{w} $dst",
+               [(set GR16:$dst, (add GR16:$src, -1))]>,
+             OpSize, Requires<[In32BitMode]>;
+def DEC32r : I<0x48, AddRegFrm, (ops GR32:$dst, GR32:$src), "dec{l} $dst",
+               [(set GR32:$dst, (add GR32:$src, -1))]>, Requires<[In32BitMode]>;
+}
+
+let isTwoAddress = 0, CodeSize = 2 in {
+  def DEC8m  : I<0xFE, MRM1m, (ops i8mem :$dst), "dec{b} $dst",
+               [(store (add (loadi8 addr:$dst), -1), addr:$dst)]>;
+  def DEC16m : I<0xFF, MRM1m, (ops i16mem:$dst), "dec{w} $dst",
+               [(store (add (loadi16 addr:$dst), -1), addr:$dst)]>, OpSize;
+  def DEC32m : I<0xFF, MRM1m, (ops i32mem:$dst), "dec{l} $dst",
+               [(store (add (loadi32 addr:$dst), -1), addr:$dst)]>;
+}
+
+// Logical operators...
+let isCommutable = 1 in {   // X = AND Y, Z   --> X = AND Z, Y
+def AND8rr   : I<0x20, MRMDestReg,
+                (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+                "and{b} {$src2, $dst|$dst, $src2}",
+                [(set GR8:$dst, (and GR8:$src1, GR8:$src2))]>;
+def AND16rr  : I<0x21, MRMDestReg,
+                 (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                 "and{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (and GR16:$src1, GR16:$src2))]>, OpSize;
+def AND32rr  : I<0x21, MRMDestReg, 
+                 (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                 "and{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (and GR32:$src1, GR32:$src2))]>;
+}
+
+def AND8rm   : I<0x22, MRMSrcMem, 
+                 (ops GR8 :$dst, GR8 :$src1, i8mem :$src2),
+                 "and{b} {$src2, $dst|$dst, $src2}",
+                [(set GR8:$dst, (and GR8:$src1, (load addr:$src2)))]>;
+def AND16rm  : I<0x23, MRMSrcMem, 
+                 (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                 "and{w} {$src2, $dst|$dst, $src2}",
+                [(set GR16:$dst, (and GR16:$src1, (load addr:$src2)))]>, OpSize;
+def AND32rm  : I<0x23, MRMSrcMem,
+                 (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                 "and{l} {$src2, $dst|$dst, $src2}",
+                [(set GR32:$dst, (and GR32:$src1, (load addr:$src2)))]>;
+
+def AND8ri   : Ii8<0x80, MRM4r, 
+                   (ops GR8 :$dst, GR8 :$src1, i8imm :$src2),
+                   "and{b} {$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (and GR8:$src1, imm:$src2))]>;
+def AND16ri  : Ii16<0x81, MRM4r, 
+                    (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+                    "and{w} {$src2, $dst|$dst, $src2}",
+                    [(set GR16:$dst, (and GR16:$src1, imm:$src2))]>, OpSize;
+def AND32ri  : Ii32<0x81, MRM4r, 
+                    (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                    "and{l} {$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (and GR32:$src1, imm:$src2))]>;
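+// The 0x83 (ri8) forms below encode the immediate as a single sign-extended
+// byte; i16immSExt8/i32immSExt8 only match constants that fit in a signed
+// 8-bit value, giving a shorter encoding than the full-width 0x81 forms.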
+def AND16ri8 : Ii8<0x83, MRM4r, 
+                   (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+                   "and{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (and GR16:$src1, i16immSExt8:$src2))]>,
+                   OpSize;
+def AND32ri8 : Ii8<0x83, MRM4r, 
+                   (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "and{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (and GR32:$src1, i32immSExt8:$src2))]>;
+
+let isTwoAddress = 0 in {
+  def AND8mr   : I<0x20, MRMDestMem,
+                   (ops i8mem :$dst, GR8 :$src),
+                   "and{b} {$src, $dst|$dst, $src}",
+                   [(store (and (load addr:$dst), GR8:$src), addr:$dst)]>;
+  def AND16mr  : I<0x21, MRMDestMem,
+                   (ops i16mem:$dst, GR16:$src),
+                   "and{w} {$src, $dst|$dst, $src}",
+                   [(store (and (load addr:$dst), GR16:$src), addr:$dst)]>,
+                   OpSize;
+  def AND32mr  : I<0x21, MRMDestMem,
+                   (ops i32mem:$dst, GR32:$src),
+                   "and{l} {$src, $dst|$dst, $src}",
+                   [(store (and (load addr:$dst), GR32:$src), addr:$dst)]>;
+  def AND8mi   : Ii8<0x80, MRM4m,
+                     (ops i8mem :$dst, i8imm :$src),
+                     "and{b} {$src, $dst|$dst, $src}",
+                      [(store (and (loadi8 addr:$dst), imm:$src), addr:$dst)]>;
+  def AND16mi  : Ii16<0x81, MRM4m,
+                      (ops i16mem:$dst, i16imm:$src),
+                      "and{w} {$src, $dst|$dst, $src}",
+                      [(store (and (loadi16 addr:$dst), imm:$src), addr:$dst)]>,
+                      OpSize;
+  def AND32mi  : Ii32<0x81, MRM4m,
+                      (ops i32mem:$dst, i32imm:$src),
+                      "and{l} {$src, $dst|$dst, $src}",
+                      [(store (and (loadi32 addr:$dst), imm:$src), addr:$dst)]>;
+  def AND16mi8 : Ii8<0x83, MRM4m,
+                     (ops i16mem:$dst, i16i8imm :$src),
+                     "and{w} {$src, $dst|$dst, $src}",
+                [(store (and (load addr:$dst), i16immSExt8:$src), addr:$dst)]>,
+                     OpSize;
+  def AND32mi8 : Ii8<0x83, MRM4m,
+                     (ops i32mem:$dst, i32i8imm :$src),
+                     "and{l} {$src, $dst|$dst, $src}",
+                [(store (and (load addr:$dst), i32immSExt8:$src), addr:$dst)]>;
+}
+
+
+let isCommutable = 1 in {   // X = OR Y, Z   --> X = OR Z, Y
+def OR8rr    : I<0x08, MRMDestReg, (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+                 "or{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (or GR8:$src1, GR8:$src2))]>;
+def OR16rr   : I<0x09, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                 "or{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (or GR16:$src1, GR16:$src2))]>, OpSize;
+def OR32rr   : I<0x09, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                 "or{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (or GR32:$src1, GR32:$src2))]>;
+}
+def OR8rm    : I<0x0A, MRMSrcMem , (ops GR8 :$dst, GR8 :$src1, i8mem :$src2),
+                 "or{b} {$src2, $dst|$dst, $src2}",
+                [(set GR8:$dst, (or GR8:$src1, (load addr:$src2)))]>;
+def OR16rm   : I<0x0B, MRMSrcMem , (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                 "or{w} {$src2, $dst|$dst, $src2}",
+                [(set GR16:$dst, (or GR16:$src1, (load addr:$src2)))]>, OpSize;
+def OR32rm   : I<0x0B, MRMSrcMem , (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                 "or{l} {$src2, $dst|$dst, $src2}",
+                [(set GR32:$dst, (or GR32:$src1, (load addr:$src2)))]>;
+
+def OR8ri    : Ii8 <0x80, MRM1r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+                    "or{b} {$src2, $dst|$dst, $src2}",
+                    [(set GR8:$dst, (or GR8:$src1, imm:$src2))]>;
+def OR16ri   : Ii16<0x81, MRM1r, (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+                    "or{w} {$src2, $dst|$dst, $src2}", 
+                    [(set GR16:$dst, (or GR16:$src1, imm:$src2))]>, OpSize;
+def OR32ri   : Ii32<0x81, MRM1r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                    "or{l} {$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (or GR32:$src1, imm:$src2))]>;
+
+def OR16ri8  : Ii8<0x83, MRM1r, (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+                   "or{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (or GR16:$src1, i16immSExt8:$src2))]>, OpSize;
+def OR32ri8  : Ii8<0x83, MRM1r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "or{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (or GR32:$src1, i32immSExt8:$src2))]>;
+let isTwoAddress = 0 in {
+  def OR8mr  : I<0x08, MRMDestMem, (ops i8mem:$dst, GR8:$src),
+                 "or{b} {$src, $dst|$dst, $src}",
+                 [(store (or (load addr:$dst), GR8:$src), addr:$dst)]>;
+  def OR16mr : I<0x09, MRMDestMem, (ops i16mem:$dst, GR16:$src),
+                 "or{w} {$src, $dst|$dst, $src}",
+                 [(store (or (load addr:$dst), GR16:$src), addr:$dst)]>, OpSize;
+  def OR32mr : I<0x09, MRMDestMem, (ops i32mem:$dst, GR32:$src),
+                 "or{l} {$src, $dst|$dst, $src}",
+                 [(store (or (load addr:$dst), GR32:$src), addr:$dst)]>;
+  def OR8mi    : Ii8<0x80, MRM1m, (ops i8mem :$dst, i8imm:$src),
+                 "or{b} {$src, $dst|$dst, $src}",
+                 [(store (or (loadi8 addr:$dst), imm:$src), addr:$dst)]>;
+  def OR16mi   : Ii16<0x81, MRM1m, (ops i16mem:$dst, i16imm:$src),
+                 "or{w} {$src, $dst|$dst, $src}",
+                 [(store (or (loadi16 addr:$dst), imm:$src), addr:$dst)]>,
+                 OpSize;
+  def OR32mi   : Ii32<0x81, MRM1m, (ops i32mem:$dst, i32imm:$src),
+                 "or{l} {$src, $dst|$dst, $src}",
+                 [(store (or (loadi32 addr:$dst), imm:$src), addr:$dst)]>;
+  def OR16mi8  : Ii8<0x83, MRM1m, (ops i16mem:$dst, i16i8imm:$src),
+                 "or{w} {$src, $dst|$dst, $src}",
+                 [(store (or (load addr:$dst), i16immSExt8:$src), addr:$dst)]>,
+                     OpSize;
+  def OR32mi8  : Ii8<0x83, MRM1m, (ops i32mem:$dst, i32i8imm:$src),
+                 "or{l} {$src, $dst|$dst, $src}",
+                 [(store (or (load addr:$dst), i32immSExt8:$src), addr:$dst)]>;
+}
+
+
+let isCommutable = 1 in {   // X = XOR Y, Z   --> X = XOR Z, Y
+def XOR8rr   : I<0x30, MRMDestReg,
+                 (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+                 "xor{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (xor GR8:$src1, GR8:$src2))]>;
+def XOR16rr  : I<0x31, MRMDestReg, 
+                 (ops GR16:$dst, GR16:$src1, GR16:$src2), 
+                 "xor{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (xor GR16:$src1, GR16:$src2))]>, OpSize;
+def XOR32rr  : I<0x31, MRMDestReg, 
+                 (ops GR32:$dst, GR32:$src1, GR32:$src2), 
+                 "xor{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (xor GR32:$src1, GR32:$src2))]>;
+}
+
+def XOR8rm   : I<0x32, MRMSrcMem , 
+                 (ops GR8 :$dst, GR8:$src1, i8mem :$src2), 
+                 "xor{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (xor GR8:$src1, (load addr:$src2)))]>;
+def XOR16rm  : I<0x33, MRMSrcMem , 
+                 (ops GR16:$dst, GR16:$src1, i16mem:$src2), 
+                 "xor{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (xor GR16:$src1, (load addr:$src2)))]>, OpSize;
+def XOR32rm  : I<0x33, MRMSrcMem , 
+                 (ops GR32:$dst, GR32:$src1, i32mem:$src2), 
+                 "xor{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (xor GR32:$src1, (load addr:$src2)))]>;
+
+def XOR8ri   : Ii8<0x80, MRM6r, 
+                   (ops GR8:$dst, GR8:$src1, i8imm:$src2), 
+                   "xor{b} {$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (xor GR8:$src1, imm:$src2))]>;
+def XOR16ri  : Ii16<0x81, MRM6r, 
+                    (ops GR16:$dst, GR16:$src1, i16imm:$src2), 
+                    "xor{w} {$src2, $dst|$dst, $src2}",
+                    [(set GR16:$dst, (xor GR16:$src1, imm:$src2))]>, OpSize;
+def XOR32ri  : Ii32<0x81, MRM6r, 
+                    (ops GR32:$dst, GR32:$src1, i32imm:$src2), 
+                    "xor{l} {$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (xor GR32:$src1, imm:$src2))]>;
+def XOR16ri8 : Ii8<0x83, MRM6r, 
+                   (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+                   "xor{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (xor GR16:$src1, i16immSExt8:$src2))]>,
+                   OpSize;
+def XOR32ri8 : Ii8<0x83, MRM6r, 
+                   (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "xor{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (xor GR32:$src1, i32immSExt8:$src2))]>;
+let isTwoAddress = 0 in {
+  def XOR8mr   : I<0x30, MRMDestMem,
+                   (ops i8mem :$dst, GR8 :$src),
+                   "xor{b} {$src, $dst|$dst, $src}",
+                   [(store (xor (load addr:$dst), GR8:$src), addr:$dst)]>;
+  def XOR16mr  : I<0x31, MRMDestMem,
+                   (ops i16mem:$dst, GR16:$src),
+                   "xor{w} {$src, $dst|$dst, $src}",
+                   [(store (xor (load addr:$dst), GR16:$src), addr:$dst)]>,
+                   OpSize;
+  def XOR32mr  : I<0x31, MRMDestMem,
+                   (ops i32mem:$dst, GR32:$src),
+                   "xor{l} {$src, $dst|$dst, $src}",
+                   [(store (xor (load addr:$dst), GR32:$src), addr:$dst)]>;
+  def XOR8mi   : Ii8<0x80, MRM6m,
+                     (ops i8mem :$dst, i8imm :$src),
+                     "xor{b} {$src, $dst|$dst, $src}",
+                    [(store (xor (loadi8 addr:$dst), imm:$src), addr:$dst)]>;
+  def XOR16mi  : Ii16<0x81, MRM6m,
+                      (ops i16mem:$dst, i16imm:$src),
+                      "xor{w} {$src, $dst|$dst, $src}",
+                   [(store (xor (loadi16 addr:$dst), imm:$src), addr:$dst)]>,
+                      OpSize;
+  def XOR32mi  : Ii32<0x81, MRM6m,
+                      (ops i32mem:$dst, i32imm:$src),
+                      "xor{l} {$src, $dst|$dst, $src}",
+                   [(store (xor (loadi32 addr:$dst), imm:$src), addr:$dst)]>;
+  def XOR16mi8 : Ii8<0x83, MRM6m,
+                     (ops i16mem:$dst, i16i8imm :$src),
+                     "xor{w} {$src, $dst|$dst, $src}",
+                 [(store (xor (load addr:$dst), i16immSExt8:$src), addr:$dst)]>,
+                     OpSize;
+  def XOR32mi8 : Ii8<0x83, MRM6m,
+                     (ops i32mem:$dst, i32i8imm :$src),
+                     "xor{l} {$src, $dst|$dst, $src}",
+                 [(store (xor (load addr:$dst), i32immSExt8:$src), addr:$dst)]>;
+}
+
+// Shift instructions
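+// The *rCL forms take their shift count implicitly in CL; the Imp<[CL],[]>
+// annotation records that implicit use of CL.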
+def SHL8rCL  : I<0xD2, MRM4r, (ops GR8 :$dst, GR8 :$src),
+                 "shl{b} {%cl, $dst|$dst, %CL}",
+                 [(set GR8:$dst, (shl GR8:$src, CL))]>, Imp<[CL],[]>;
+def SHL16rCL : I<0xD3, MRM4r, (ops GR16:$dst, GR16:$src),
+                 "shl{w} {%cl, $dst|$dst, %CL}",
+                 [(set GR16:$dst, (shl GR16:$src, CL))]>, Imp<[CL],[]>, OpSize;
+def SHL32rCL : I<0xD3, MRM4r, (ops GR32:$dst, GR32:$src),
+                 "shl{l} {%cl, $dst|$dst, %CL}",
+                 [(set GR32:$dst, (shl GR32:$src, CL))]>, Imp<[CL],[]>;
+
+def SHL8ri   : Ii8<0xC0, MRM4r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+                   "shl{b} {$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>;
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
+def SHL16ri  : Ii8<0xC1, MRM4r, (ops GR16:$dst, GR16:$src1, i8imm:$src2),
+                   "shl{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def SHL32ri  : Ii8<0xC1, MRM4r, (ops GR32:$dst, GR32:$src1, i8imm:$src2),
+                   "shl{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>;
+}
+
+// Shift left by one. Not used because (add x, x) is slightly cheaper.
+def SHL8r1   : I<0xD0, MRM4r, (ops GR8 :$dst, GR8 :$src1),
+                 "shl{b} $dst", []>;
+def SHL16r1  : I<0xD1, MRM4r, (ops GR16:$dst, GR16:$src1),
+                 "shl{w} $dst", []>, OpSize;
+def SHL32r1  : I<0xD1, MRM4r, (ops GR32:$dst, GR32:$src1),
+                 "shl{l} $dst", []>;
+
+let isTwoAddress = 0 in {
+  def SHL8mCL  : I<0xD2, MRM4m, (ops i8mem :$dst),
+                   "shl{b} {%cl, $dst|$dst, %CL}",
+                   [(store (shl (loadi8 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def SHL16mCL : I<0xD3, MRM4m, (ops i16mem:$dst),
+                   "shl{w} {%cl, $dst|$dst, %CL}",
+                   [(store (shl (loadi16 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>, OpSize;
+  def SHL32mCL : I<0xD3, MRM4m, (ops i32mem:$dst),
+                   "shl{l} {%cl, $dst|$dst, %CL}",
+                   [(store (shl (loadi32 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def SHL8mi   : Ii8<0xC0, MRM4m, (ops i8mem :$dst, i8imm:$src),
+                     "shl{b} {$src, $dst|$dst, $src}",
+                  [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+  def SHL16mi  : Ii8<0xC1, MRM4m, (ops i16mem:$dst, i8imm:$src),
+                     "shl{w} {$src, $dst|$dst, $src}",
+                 [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+                     OpSize;
+  def SHL32mi  : Ii8<0xC1, MRM4m, (ops i32mem:$dst, i8imm:$src),
+                     "shl{l} {$src, $dst|$dst, $src}",
+                 [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+  // Shift by 1
+  def SHL8m1   : I<0xD0, MRM4m, (ops i8mem :$dst),
+                   "shl{b} $dst",
+                  [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+  def SHL16m1  : I<0xD1, MRM4m, (ops i16mem:$dst),
+                   "shl{w} $dst",
+                 [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+                     OpSize;
+  def SHL32m1  : I<0xD1, MRM4m, (ops i32mem:$dst),
+                   "shl{l} $dst",
+                 [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+def SHR8rCL  : I<0xD2, MRM5r, (ops GR8 :$dst, GR8 :$src),
+                 "shr{b} {%cl, $dst|$dst, %CL}",
+                 [(set GR8:$dst, (srl GR8:$src, CL))]>, Imp<[CL],[]>;
+def SHR16rCL : I<0xD3, MRM5r, (ops GR16:$dst, GR16:$src),
+                 "shr{w} {%cl, $dst|$dst, %CL}",
+                 [(set GR16:$dst, (srl GR16:$src, CL))]>, Imp<[CL],[]>, OpSize;
+def SHR32rCL : I<0xD3, MRM5r, (ops GR32:$dst, GR32:$src),
+                 "shr{l} {%cl, $dst|$dst, %CL}",
+                 [(set GR32:$dst, (srl GR32:$src, CL))]>, Imp<[CL],[]>;
+
+def SHR8ri   : Ii8<0xC0, MRM5r, (ops GR8:$dst, GR8:$src1, i8imm:$src2),
+                   "shr{b} {$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))]>;
+def SHR16ri  : Ii8<0xC1, MRM5r, (ops GR16:$dst, GR16:$src1, i8imm:$src2),
+                   "shr{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def SHR32ri  : Ii8<0xC1, MRM5r, (ops GR32:$dst, GR32:$src1, i8imm:$src2),
+                   "shr{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))]>;
+
+// Shift by 1
+def SHR8r1   : I<0xD0, MRM5r, (ops GR8:$dst, GR8:$src1),
+                 "shr{b} $dst",
+                 [(set GR8:$dst, (srl GR8:$src1, (i8 1)))]>;
+def SHR16r1  : I<0xD1, MRM5r, (ops GR16:$dst, GR16:$src1),
+                 "shr{w} $dst",
+                 [(set GR16:$dst, (srl GR16:$src1, (i8 1)))]>, OpSize;
+def SHR32r1  : I<0xD1, MRM5r, (ops GR32:$dst, GR32:$src1),
+                 "shr{l} $dst",
+                 [(set GR32:$dst, (srl GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+  def SHR8mCL  : I<0xD2, MRM5m, (ops i8mem :$dst),
+                   "shr{b} {%cl, $dst|$dst, %CL}",
+                   [(store (srl (loadi8 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def SHR16mCL : I<0xD3, MRM5m, (ops i16mem:$dst),
+                   "shr{w} {%cl, $dst|$dst, %CL}",
+                   [(store (srl (loadi16 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>, OpSize;
+  def SHR32mCL : I<0xD3, MRM5m, (ops i32mem:$dst),
+                   "shr{l} {%cl, $dst|$dst, %CL}",
+                   [(store (srl (loadi32 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def SHR8mi   : Ii8<0xC0, MRM5m, (ops i8mem :$dst, i8imm:$src),
+                     "shr{b} {$src, $dst|$dst, $src}",
+                  [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+  def SHR16mi  : Ii8<0xC1, MRM5m, (ops i16mem:$dst, i8imm:$src),
+                     "shr{w} {$src, $dst|$dst, $src}",
+                 [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+                     OpSize;
+  def SHR32mi  : Ii8<0xC1, MRM5m, (ops i32mem:$dst, i8imm:$src),
+                     "shr{l} {$src, $dst|$dst, $src}",
+                 [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+  // Shift by 1
+  def SHR8m1   : I<0xD0, MRM5m, (ops i8mem :$dst),
+                   "shr{b} $dst",
+                  [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+  def SHR16m1  : I<0xD1, MRM5m, (ops i16mem:$dst),
+                   "shr{w} $dst",
+                 [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,OpSize;
+  def SHR32m1  : I<0xD1, MRM5m, (ops i32mem:$dst),
+                   "shr{l} $dst",
+                 [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+def SAR8rCL  : I<0xD2, MRM7r, (ops GR8 :$dst, GR8 :$src),
+                 "sar{b} {%cl, $dst|$dst, %CL}",
+                 [(set GR8:$dst, (sra GR8:$src, CL))]>, Imp<[CL],[]>;
+def SAR16rCL : I<0xD3, MRM7r, (ops GR16:$dst, GR16:$src),
+                 "sar{w} {%cl, $dst|$dst, %CL}",
+                 [(set GR16:$dst, (sra GR16:$src, CL))]>, Imp<[CL],[]>, OpSize;
+def SAR32rCL : I<0xD3, MRM7r, (ops GR32:$dst, GR32:$src),
+                 "sar{l} {%cl, $dst|$dst, %CL}",
+                 [(set GR32:$dst, (sra GR32:$src, CL))]>, Imp<[CL],[]>;
+
+def SAR8ri   : Ii8<0xC0, MRM7r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+                   "sar{b} {$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))]>;
+def SAR16ri  : Ii8<0xC1, MRM7r, (ops GR16:$dst, GR16:$src1, i8imm:$src2),
+                   "sar{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))]>,
+                   OpSize;
+def SAR32ri  : Ii8<0xC1, MRM7r, (ops GR32:$dst, GR32:$src1, i8imm:$src2),
+                   "sar{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))]>;
+
+// Shift by 1
+def SAR8r1   : I<0xD0, MRM7r, (ops GR8 :$dst, GR8 :$src1),
+                 "sar{b} $dst",
+                 [(set GR8:$dst, (sra GR8:$src1, (i8 1)))]>;
+def SAR16r1  : I<0xD1, MRM7r, (ops GR16:$dst, GR16:$src1),
+                 "sar{w} $dst",
+                 [(set GR16:$dst, (sra GR16:$src1, (i8 1)))]>, OpSize;
+def SAR32r1  : I<0xD1, MRM7r, (ops GR32:$dst, GR32:$src1),
+                 "sar{l} $dst",
+                 [(set GR32:$dst, (sra GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+  def SAR8mCL  : I<0xD2, MRM7m, (ops i8mem :$dst),
+                   "sar{b} {%cl, $dst|$dst, %CL}",
+                   [(store (sra (loadi8 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def SAR16mCL : I<0xD3, MRM7m, (ops i16mem:$dst),
+                   "sar{w} {%cl, $dst|$dst, %CL}",
+                   [(store (sra (loadi16 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>, OpSize;
+  def SAR32mCL : I<0xD3, MRM7m, (ops i32mem:$dst), 
+                   "sar{l} {%cl, $dst|$dst, %CL}",
+                   [(store (sra (loadi32 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def SAR8mi   : Ii8<0xC0, MRM7m, (ops i8mem :$dst, i8imm:$src),
+                     "sar{b} {$src, $dst|$dst, $src}",
+                  [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+  def SAR16mi  : Ii8<0xC1, MRM7m, (ops i16mem:$dst, i8imm:$src),
+                     "sar{w} {$src, $dst|$dst, $src}",
+                 [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+                     OpSize;
+  def SAR32mi  : Ii8<0xC1, MRM7m, (ops i32mem:$dst, i8imm:$src),
+                     "sar{l} {$src, $dst|$dst, $src}",
+                 [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+  // Shift by 1
+  def SAR8m1   : I<0xD0, MRM7m, (ops i8mem :$dst),
+                   "sar{b} $dst",
+                  [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+  def SAR16m1  : I<0xD1, MRM7m, (ops i16mem:$dst),
+                   "sar{w} $dst",
+                 [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+                     OpSize;
+  def SAR32m1  : I<0xD1, MRM7m, (ops i32mem:$dst),
+                   "sar{l} $dst",
+                 [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+// Rotate instructions
+// FIXME: provide shorter instructions when imm8 == 1
+def ROL8rCL  : I<0xD2, MRM0r, (ops GR8 :$dst, GR8 :$src),
+                 "rol{b} {%cl, $dst|$dst, %CL}",
+                 [(set GR8:$dst, (rotl GR8:$src, CL))]>, Imp<[CL],[]>;
+def ROL16rCL : I<0xD3, MRM0r, (ops GR16:$dst, GR16:$src),
+                 "rol{w} {%cl, $dst|$dst, %CL}",
+                 [(set GR16:$dst, (rotl GR16:$src, CL))]>, Imp<[CL],[]>, OpSize;
+def ROL32rCL : I<0xD3, MRM0r, (ops GR32:$dst, GR32:$src),
+                 "rol{l} {%cl, $dst|$dst, %CL}",
+                 [(set GR32:$dst, (rotl GR32:$src, CL))]>, Imp<[CL],[]>;
+
+def ROL8ri   : Ii8<0xC0, MRM0r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+                   "rol{b} {$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>;
+def ROL16ri  : Ii8<0xC1, MRM0r, (ops GR16:$dst, GR16:$src1, i8imm:$src2),
+                   "rol{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def ROL32ri  : Ii8<0xC1, MRM0r, (ops GR32:$dst, GR32:$src1, i8imm:$src2),
+                   "rol{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>;
+
+// Rotate by 1
+def ROL8r1   : I<0xD0, MRM0r, (ops GR8 :$dst, GR8 :$src1),
+                 "rol{b} $dst",
+                 [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))]>;
+def ROL16r1  : I<0xD1, MRM0r, (ops GR16:$dst, GR16:$src1),
+                 "rol{w} $dst",
+                 [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))]>, OpSize;
+def ROL32r1  : I<0xD1, MRM0r, (ops GR32:$dst, GR32:$src1),
+                 "rol{l} $dst",
+                 [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+  def ROL8mCL  : I<0xD2, MRM0m, (ops i8mem :$dst),
+                   "rol{b} {%cl, $dst|$dst, %CL}",
+                   [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def ROL16mCL : I<0xD3, MRM0m, (ops i16mem:$dst),
+                   "rol{w} {%cl, $dst|$dst, %CL}",
+                   [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>, OpSize;
+  def ROL32mCL : I<0xD3, MRM0m, (ops i32mem:$dst),
+                   "rol{l} {%cl, $dst|$dst, %CL}",
+                   [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def ROL8mi   : Ii8<0xC0, MRM0m, (ops i8mem :$dst, i8imm:$src),
+                     "rol{b} {$src, $dst|$dst, $src}",
+                 [(store (rotl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+  def ROL16mi  : Ii8<0xC1, MRM0m, (ops i16mem:$dst, i8imm:$src),
+                     "rol{w} {$src, $dst|$dst, $src}",
+                [(store (rotl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+                     OpSize;
+  def ROL32mi  : Ii8<0xC1, MRM0m, (ops i32mem:$dst, i8imm:$src),
+                     "rol{l} {$src, $dst|$dst, $src}",
+                [(store (rotl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+  // Rotate by 1
+  def ROL8m1   : I<0xD0, MRM0m, (ops i8mem :$dst),
+                   "rol{b} $dst",
+                 [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+  def ROL16m1  : I<0xD1, MRM0m, (ops i16mem:$dst),
+                   "rol{w} $dst",
+                [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+                     OpSize;
+  def ROL32m1  : I<0xD1, MRM0m, (ops i32mem:$dst),
+                   "rol{l} $dst",
+                [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+def ROR8rCL  : I<0xD2, MRM1r, (ops GR8 :$dst, GR8 :$src),
+                 "ror{b} {%cl, $dst|$dst, %CL}",
+                 [(set GR8:$dst, (rotr GR8:$src, CL))]>, Imp<[CL],[]>;
+def ROR16rCL : I<0xD3, MRM1r, (ops GR16:$dst, GR16:$src),
+                 "ror{w} {%cl, $dst|$dst, %CL}",
+                 [(set GR16:$dst, (rotr GR16:$src, CL))]>, Imp<[CL],[]>, OpSize;
+def ROR32rCL : I<0xD3, MRM1r, (ops GR32:$dst, GR32:$src),
+                 "ror{l} {%cl, $dst|$dst, %CL}",
+                 [(set GR32:$dst, (rotr GR32:$src, CL))]>, Imp<[CL],[]>;
+
+def ROR8ri   : Ii8<0xC0, MRM1r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+                   "ror{b} {$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))]>;
+def ROR16ri  : Ii8<0xC1, MRM1r, (ops GR16:$dst, GR16:$src1, i8imm:$src2),
+                   "ror{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def ROR32ri  : Ii8<0xC1, MRM1r, (ops GR32:$dst, GR32:$src1, i8imm:$src2),
+                   "ror{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))]>;
+
+// Rotate by 1
+def ROR8r1   : I<0xD0, MRM1r, (ops GR8 :$dst, GR8 :$src1),
+                 "ror{b} $dst",
+                 [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>;
+def ROR16r1  : I<0xD1, MRM1r, (ops GR16:$dst, GR16:$src1),
+                 "ror{w} $dst",
+                 [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize;
+def ROR32r1  : I<0xD1, MRM1r, (ops GR32:$dst, GR32:$src1),
+                 "ror{l} $dst",
+                 [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+  def ROR8mCL  : I<0xD2, MRM1m, (ops i8mem :$dst),
+                   "ror{b} {%cl, $dst|$dst, %CL}",
+                   [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def ROR16mCL : I<0xD3, MRM1m, (ops i16mem:$dst),
+                   "ror{w} {%cl, $dst|$dst, %CL}",
+                   [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>, OpSize;
+  def ROR32mCL : I<0xD3, MRM1m, (ops i32mem:$dst), 
+                   "ror{l} {%cl, $dst|$dst, %CL}",
+                   [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def ROR8mi   : Ii8<0xC0, MRM1m, (ops i8mem :$dst, i8imm:$src),
+                     "ror{b} {$src, $dst|$dst, $src}",
+                 [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+  def ROR16mi  : Ii8<0xC1, MRM1m, (ops i16mem:$dst, i8imm:$src),
+                     "ror{w} {$src, $dst|$dst, $src}",
+                [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+                     OpSize;
+  def ROR32mi  : Ii8<0xC1, MRM1m, (ops i32mem:$dst, i8imm:$src),
+                     "ror{l} {$src, $dst|$dst, $src}",
+                [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+  // Rotate by 1
+  def ROR8m1   : I<0xD0, MRM1m, (ops i8mem :$dst),
+                   "ror{b} $dst",
+                 [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+  def ROR16m1  : I<0xD1, MRM1m, (ops i16mem:$dst),
+                   "ror{w} $dst",
+                [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+                     OpSize;
+  def ROR32m1  : I<0xD1, MRM1m, (ops i32mem:$dst),
+                   "ror{l} $dst",
+                [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+
+
+// Double shift instructions (generalizations of rotate)
+def SHLD32rrCL : I<0xA5, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                   "shld{l} {%cl, $src2, $dst|$dst, $src2, %CL}",
+                   [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>,
+                   Imp<[CL],[]>, TB;
+def SHRD32rrCL : I<0xAD, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                   "shrd{l} {%cl, $src2, $dst|$dst, $src2, %CL}",
+                   [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>,
+                   Imp<[CL],[]>, TB;
+def SHLD16rrCL : I<0xA5, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                   "shld{w} {%cl, $src2, $dst|$dst, $src2, %CL}",
+                   [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>,
+                   Imp<[CL],[]>, TB, OpSize;
+def SHRD16rrCL : I<0xAD, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                   "shrd{w} {%cl, $src2, $dst|$dst, $src2, %CL}",
+                   [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>,
+                   Imp<[CL],[]>, TB, OpSize;
+
+let isCommutable = 1 in {  // These instructions commute to each other.
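+// Swapping the two source operands of SHLD and replacing the amount with
+// (operand size - amount) yields the matching SHRD, and vice versa, which is
+// the identity the commuting support relies on here.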
+def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
+                     (ops GR32:$dst, GR32:$src1, GR32:$src2, i8imm:$src3),
+                     "shld{l} {$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
+                                      (i8 imm:$src3)))]>,
+                 TB;
+def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
+                     (ops GR32:$dst, GR32:$src1, GR32:$src2, i8imm:$src3),
+                     "shrd{l} {$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
+                                      (i8 imm:$src3)))]>,
+                 TB;
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
+                     (ops GR16:$dst, GR16:$src1, GR16:$src2, i8imm:$src3),
+                     "shld{w} {$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
+                                      (i8 imm:$src3)))]>,
+                     TB, OpSize;
+def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
+                     (ops GR16:$dst, GR16:$src1, GR16:$src2, i8imm:$src3),
+                     "shrd{w} {$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
+                                      (i8 imm:$src3)))]>,
+                     TB, OpSize;
+}
+
+let isTwoAddress = 0 in {
+  def SHLD32mrCL : I<0xA5, MRMDestMem, (ops i32mem:$dst, GR32:$src2),
+                     "shld{l} {%cl, $src2, $dst|$dst, $src2, %CL}",
+                     [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
+                       addr:$dst)]>,
+                     Imp<[CL],[]>, TB;
+  def SHRD32mrCL : I<0xAD, MRMDestMem, (ops i32mem:$dst, GR32:$src2),
+                    "shrd{l} {%cl, $src2, $dst|$dst, $src2, %CL}",
+                    [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
+                      addr:$dst)]>,
+                    Imp<[CL],[]>, TB;
+  def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
+                      (ops i32mem:$dst, GR32:$src2, i8imm:$src3),
+                      "shld{l} {$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
+                                        (i8 imm:$src3)), addr:$dst)]>,
+                      TB;
+  def SHRD32mri8 : Ii8<0xAC, MRMDestMem, 
+                       (ops i32mem:$dst, GR32:$src2, i8imm:$src3),
+                       "shrd{l} {$src3, $src2, $dst|$dst, $src2, $src3}",
+                       [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
+                                         (i8 imm:$src3)), addr:$dst)]>,
+                       TB;
+
+  def SHLD16mrCL : I<0xA5, MRMDestMem, (ops i16mem:$dst, GR16:$src2),
+                     "shld{w} {%cl, $src2, $dst|$dst, $src2, %CL}",
+                     [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
+                       addr:$dst)]>,
+                     Imp<[CL],[]>, TB, OpSize;
+  def SHRD16mrCL : I<0xAD, MRMDestMem, (ops i16mem:$dst, GR16:$src2),
+                    "shrd{w} {%cl, $src2, $dst|$dst, $src2, %CL}",
+                    [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
+                      addr:$dst)]>,
+                    Imp<[CL],[]>, TB, OpSize;
+  def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
+                      (ops i16mem:$dst, GR16:$src2, i8imm:$src3),
+                      "shld{w} {$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(store (X86shld (loadi16 addr:$dst), GR16:$src2,
+                                        (i8 imm:$src3)), addr:$dst)]>,
+                      TB, OpSize;
+  def SHRD16mri8 : Ii8<0xAC, MRMDestMem, 
+                       (ops i16mem:$dst, GR16:$src2, i8imm:$src3),
+                       "shrd{w} {$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
+                                        (i8 imm:$src3)), addr:$dst)]>,
+                       TB, OpSize;
+}
+
+
+// Arithmetic.
+let isCommutable = 1 in {   // X = ADD Y, Z   --> X = ADD Z, Y
+def ADD8rr   : I<0x00, MRMDestReg, (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+                 "add{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (add GR8:$src1, GR8:$src2))]>;
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
+def ADD16rr  : I<0x01, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                 "add{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (add GR16:$src1, GR16:$src2))]>, OpSize;
+def ADD32rr  : I<0x01, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                 "add{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (add GR32:$src1, GR32:$src2))]>;
+} // end isConvertibleToThreeAddress
+} // end isCommutable
+def ADD8rm   : I<0x02, MRMSrcMem, (ops GR8 :$dst, GR8 :$src1, i8mem :$src2),
+                 "add{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (add GR8:$src1, (load addr:$src2)))]>;
+def ADD16rm  : I<0x03, MRMSrcMem, (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                 "add{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (add GR16:$src1, (load addr:$src2)))]>, OpSize;
+def ADD32rm  : I<0x03, MRMSrcMem, (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                 "add{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (add GR32:$src1, (load addr:$src2)))]>;
+
+def ADD8ri   : Ii8<0x80, MRM0r, (ops GR8:$dst, GR8:$src1, i8imm:$src2),
+                   "add{b} {$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (add GR8:$src1, imm:$src2))]>;
+
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
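+// LEA can write its result to a register other than the source, so converting
+// these adds lets the two-address pass avoid a copy when the source is live
+// out.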
+def ADD16ri  : Ii16<0x81, MRM0r, (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+                    "add{w} {$src2, $dst|$dst, $src2}",
+                    [(set GR16:$dst, (add GR16:$src1, imm:$src2))]>, OpSize;
+def ADD32ri  : Ii32<0x81, MRM0r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                    "add{l} {$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (add GR32:$src1, imm:$src2))]>;
+def ADD16ri8 : Ii8<0x83, MRM0r, (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+                   "add{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (add GR16:$src1, i16immSExt8:$src2))]>,
+                   OpSize;
+def ADD32ri8 : Ii8<0x83, MRM0r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "add{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (add GR32:$src1, i32immSExt8:$src2))]>;
+}
+
+let isTwoAddress = 0 in {
+  def ADD8mr   : I<0x00, MRMDestMem, (ops i8mem :$dst, GR8 :$src2),
+                   "add{b} {$src2, $dst|$dst, $src2}",
+                   [(store (add (load addr:$dst), GR8:$src2), addr:$dst)]>;
+  def ADD16mr  : I<0x01, MRMDestMem, (ops i16mem:$dst, GR16:$src2),
+                   "add{w} {$src2, $dst|$dst, $src2}",
+                   [(store (add (load addr:$dst), GR16:$src2), addr:$dst)]>,
+                   OpSize;
+  def ADD32mr  : I<0x01, MRMDestMem, (ops i32mem:$dst, GR32:$src2),
+                   "add{l} {$src2, $dst|$dst, $src2}",
+                   [(store (add (load addr:$dst), GR32:$src2), addr:$dst)]>;
+  def ADD8mi   : Ii8<0x80, MRM0m, (ops i8mem :$dst, i8imm :$src2),
+                     "add{b} {$src2, $dst|$dst, $src2}",
+                   [(store (add (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+  def ADD16mi  : Ii16<0x81, MRM0m, (ops i16mem:$dst, i16imm:$src2),
+                      "add{w} {$src2, $dst|$dst, $src2}",
+                  [(store (add (loadi16 addr:$dst), imm:$src2), addr:$dst)]>,
+                   OpSize;
+  def ADD32mi  : Ii32<0x81, MRM0m, (ops i32mem:$dst, i32imm:$src2),
+                      "add{l} {$src2, $dst|$dst, $src2}",
+                  [(store (add (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+  def ADD16mi8 : Ii8<0x83, MRM0m, (ops i16mem:$dst, i16i8imm :$src2),
+                     "add{w} {$src2, $dst|$dst, $src2}",
+                [(store (add (load addr:$dst), i16immSExt8:$src2), addr:$dst)]>,
+                   OpSize;
+  def ADD32mi8 : Ii8<0x83, MRM0m, (ops i32mem:$dst, i32i8imm :$src2),
+                     "add{l} {$src2, $dst|$dst, $src2}",
+                [(store (add (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+
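+// ADC (and SBB below) match the adde/sube nodes, which consume the carry
+// produced by a preceding addc/subc, e.g. when an i64 add is expanded into a
+// 32-bit low/high pair.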
+let isCommutable = 1 in {  // X = ADC Y, Z --> X = ADC Z, Y
+def ADC32rr  : I<0x11, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                 "adc{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (adde GR32:$src1, GR32:$src2))]>;
+}
+def ADC32rm  : I<0x13, MRMSrcMem , (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                 "adc{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (adde GR32:$src1, (load addr:$src2)))]>;
+def ADC32ri  : Ii32<0x81, MRM2r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                    "adc{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (adde GR32:$src1, imm:$src2))]>;
+def ADC32ri8 : Ii8<0x83, MRM2r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "adc{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (adde GR32:$src1, i32immSExt8:$src2))]>;
+
+let isTwoAddress = 0 in {
+  def ADC32mr  : I<0x11, MRMDestMem, (ops i32mem:$dst, GR32:$src2),
+                   "adc{l} {$src2, $dst|$dst, $src2}",
+                   [(store (adde (load addr:$dst), GR32:$src2), addr:$dst)]>;
+  def ADC32mi  : Ii32<0x81, MRM2m, (ops i32mem:$dst, i32imm:$src2),
+                      "adc{l} {$src2, $dst|$dst, $src2}",
+                  [(store (adde (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+  def ADC32mi8 : Ii8<0x83, MRM2m, (ops i32mem:$dst, i32i8imm :$src2),
+                     "adc{l} {$src2, $dst|$dst, $src2}",
+             [(store (adde (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+
+def SUB8rr   : I<0x28, MRMDestReg, (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+                 "sub{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (sub GR8:$src1, GR8:$src2))]>;
+def SUB16rr  : I<0x29, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                 "sub{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (sub GR16:$src1, GR16:$src2))]>, OpSize;
+def SUB32rr  : I<0x29, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                 "sub{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (sub GR32:$src1, GR32:$src2))]>;
+def SUB8rm   : I<0x2A, MRMSrcMem, (ops GR8 :$dst, GR8 :$src1, i8mem :$src2),
+                 "sub{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (sub GR8:$src1, (load addr:$src2)))]>;
+def SUB16rm  : I<0x2B, MRMSrcMem, (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                 "sub{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (sub GR16:$src1, (load addr:$src2)))]>, OpSize;
+def SUB32rm  : I<0x2B, MRMSrcMem, (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                 "sub{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (sub GR32:$src1, (load addr:$src2)))]>;
+
+def SUB8ri   : Ii8 <0x80, MRM5r, (ops GR8:$dst, GR8:$src1, i8imm:$src2),
+                    "sub{b} {$src2, $dst|$dst, $src2}",
+                    [(set GR8:$dst, (sub GR8:$src1, imm:$src2))]>;
+def SUB16ri  : Ii16<0x81, MRM5r, (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+                    "sub{w} {$src2, $dst|$dst, $src2}",
+                    [(set GR16:$dst, (sub GR16:$src1, imm:$src2))]>, OpSize;
+def SUB32ri  : Ii32<0x81, MRM5r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                    "sub{l} {$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (sub GR32:$src1, imm:$src2))]>;
+def SUB16ri8 : Ii8<0x83, MRM5r, (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+                   "sub{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (sub GR16:$src1, i16immSExt8:$src2))]>,
+                   OpSize;
+def SUB32ri8 : Ii8<0x83, MRM5r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "sub{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (sub GR32:$src1, i32immSExt8:$src2))]>;
+let isTwoAddress = 0 in {
+  def SUB8mr   : I<0x28, MRMDestMem, (ops i8mem :$dst, GR8 :$src2),
+                   "sub{b} {$src2, $dst|$dst, $src2}",
+                   [(store (sub (load addr:$dst), GR8:$src2), addr:$dst)]>;
+  def SUB16mr  : I<0x29, MRMDestMem, (ops i16mem:$dst, GR16:$src2),
+                   "sub{w} {$src2, $dst|$dst, $src2}",
+                   [(store (sub (load addr:$dst), GR16:$src2), addr:$dst)]>,
+                   OpSize;
+  def SUB32mr  : I<0x29, MRMDestMem, (ops i32mem:$dst, GR32:$src2), 
+                   "sub{l} {$src2, $dst|$dst, $src2}",
+                   [(store (sub (load addr:$dst), GR32:$src2), addr:$dst)]>;
+  def SUB8mi   : Ii8<0x80, MRM5m, (ops i8mem :$dst, i8imm:$src2), 
+                     "sub{b} {$src2, $dst|$dst, $src2}",
+                   [(store (sub (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+  def SUB16mi  : Ii16<0x81, MRM5m, (ops i16mem:$dst, i16imm:$src2), 
+                      "sub{w} {$src2, $dst|$dst, $src2}",
+                  [(store (sub (loadi16 addr:$dst), imm:$src2), addr:$dst)]>,
+                   OpSize;
+  def SUB32mi  : Ii32<0x81, MRM5m, (ops i32mem:$dst, i32imm:$src2), 
+                      "sub{l} {$src2, $dst|$dst, $src2}",
+                  [(store (sub (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+  def SUB16mi8 : Ii8<0x83, MRM5m, (ops i16mem:$dst, i16i8imm :$src2), 
+                     "sub{w} {$src2, $dst|$dst, $src2}",
+                [(store (sub (load addr:$dst), i16immSExt8:$src2), addr:$dst)]>,
+                   OpSize;
+  def SUB32mi8 : Ii8<0x83, MRM5m, (ops i32mem:$dst, i32i8imm :$src2), 
+                     "sub{l} {$src2, $dst|$dst, $src2}",
+                [(store (sub (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+
+def SBB32rr    : I<0x19, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "sbb{l} {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (sube GR32:$src1, GR32:$src2))]>;
+
+let isTwoAddress = 0 in {
+  def SBB32mr  : I<0x19, MRMDestMem, (ops i32mem:$dst, GR32:$src2), 
+                   "sbb{l} {$src2, $dst|$dst, $src2}",
+                   [(store (sube (load addr:$dst), GR32:$src2), addr:$dst)]>;
+  def SBB8mi  : Ii8<0x80, MRM3m, (ops i8mem:$dst, i8imm:$src2), 
+                      "sbb{b} {$src2, $dst|$dst, $src2}",
+                   [(store (sube (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+  def SBB32mi  : Ii32<0x81, MRM3m, (ops i32mem:$dst, i32imm:$src2), 
+                      "sbb{l} {$src2, $dst|$dst, $src2}",
+                  [(store (sube (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+  def SBB32mi8 : Ii8<0x83, MRM3m, (ops i32mem:$dst, i32i8imm :$src2), 
+                     "sbb{l} {$src2, $dst|$dst, $src2}",
+             [(store (sube (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+def SBB32rm  : I<0x1B, MRMSrcMem, (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                    "sbb{l} {$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (sube GR32:$src1, (load addr:$src2)))]>;
+def SBB32ri  : Ii32<0x81, MRM3r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                    "sbb{l} {$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (sube GR32:$src1, imm:$src2))]>;
+def SBB32ri8 : Ii8<0x83, MRM3r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "sbb{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (sube GR32:$src1, i32immSExt8:$src2))]>;
+
+let isCommutable = 1 in {  // X = IMUL Y, Z --> X = IMUL Z, Y
+def IMUL16rr : I<0xAF, MRMSrcReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                 "imul{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (mul GR16:$src1, GR16:$src2))]>, TB, OpSize;
+def IMUL32rr : I<0xAF, MRMSrcReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                 "imul{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (mul GR32:$src1, GR32:$src2))]>, TB;
+}
+def IMUL16rm : I<0xAF, MRMSrcMem, (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                 "imul{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (mul GR16:$src1, (load addr:$src2)))]>,
+                 TB, OpSize;
+def IMUL32rm : I<0xAF, MRMSrcMem, (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                 "imul{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (mul GR32:$src1, (load addr:$src2)))]>, TB;
+
+} // end Two Address instructions
+
+// Surprisingly enough, these are not two-address instructions!
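+// The 69/6B immediate forms encode a separate destination register, so the
+// result does not have to be tied to the first source operand.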
+def IMUL16rri  : Ii16<0x69, MRMSrcReg,                      // GR16 = GR16*I16
+                      (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+                      "imul{w} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR16:$dst, (mul GR16:$src1, imm:$src2))]>, OpSize;
+def IMUL32rri  : Ii32<0x69, MRMSrcReg,                      // GR32 = GR32*I32
+                      (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                      "imul{l} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR32:$dst, (mul GR32:$src1, imm:$src2))]>;
+def IMUL16rri8 : Ii8<0x6B, MRMSrcReg,                       // GR16 = GR16*I8
+                     (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+                     "imul{w} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR16:$dst, (mul GR16:$src1, i16immSExt8:$src2))]>,
+                     OpSize;
+def IMUL32rri8 : Ii8<0x6B, MRMSrcReg,                       // GR32 = GR32*I8
+                     (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                     "imul{l} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR32:$dst, (mul GR32:$src1, i32immSExt8:$src2))]>;
+
+def IMUL16rmi  : Ii16<0x69, MRMSrcMem,                      // GR16 = [mem16]*I16
+                      (ops GR16:$dst, i16mem:$src1, i16imm:$src2),
+                      "imul{w} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR16:$dst, (mul (load addr:$src1), imm:$src2))]>,
+                      OpSize;
+def IMUL32rmi  : Ii32<0x69, MRMSrcMem,                      // GR32 = [mem32]*I32
+                      (ops GR32:$dst, i32mem:$src1, i32imm:$src2),
+                      "imul{l} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR32:$dst, (mul (load addr:$src1), imm:$src2))]>;
+def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem,                       // GR16 = [mem16]*I8
+                     (ops GR16:$dst, i16mem:$src1, i16i8imm :$src2),
+                     "imul{w} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                  [(set GR16:$dst, (mul (load addr:$src1), i16immSExt8:$src2))]>,
+                     OpSize;
+def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem,                       // GR32 = [mem32]*I8
+                     (ops GR32:$dst, i32mem:$src1, i32i8imm: $src2),
+                     "imul{l} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                  [(set GR32:$dst, (mul (load addr:$src1), i32immSExt8:$src2))]>;
+
+//===----------------------------------------------------------------------===//
+// Test instructions are just like AND, except they don't generate a result.
+//
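+// e.g. "testl %ecx, %eax" sets SF/ZF/PF from (%eax & %ecx) and discards the
+// AND result itself.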
+let isCommutable = 1 in {   // TEST X, Y   --> TEST Y, X
+def TEST8rr  : I<0x84, MRMDestReg, (ops GR8:$src1, GR8:$src2),
+                 "test{b} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp (and GR8:$src1, GR8:$src2), 0)]>;
+def TEST16rr : I<0x85, MRMDestReg, (ops GR16:$src1, GR16:$src2),
+                 "test{w} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp (and GR16:$src1, GR16:$src2), 0)]>, OpSize;
+def TEST32rr : I<0x85, MRMDestReg, (ops GR32:$src1, GR32:$src2),
+                 "test{l} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp (and GR32:$src1, GR32:$src2), 0)]>;
+}
+
+def TEST8rm  : I<0x84, MRMSrcMem, (ops GR8 :$src1, i8mem :$src2),
+                 "test{b} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp (and GR8:$src1, (loadi8 addr:$src2)), 0)]>;
+def TEST16rm : I<0x85, MRMSrcMem, (ops GR16:$src1, i16mem:$src2),
+                 "test{w} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp (and GR16:$src1, (loadi16 addr:$src2)), 0)]>,
+               OpSize;
+def TEST32rm : I<0x85, MRMSrcMem, (ops GR32:$src1, i32mem:$src2),
+                 "test{l} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp (and GR32:$src1, (loadi32 addr:$src2)), 0)]>;
+
+def TEST8ri  : Ii8 <0xF6, MRM0r,                     // flags = GR8  & imm8
+                    (ops GR8:$src1, i8imm:$src2),
+                    "test{b} {$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and GR8:$src1, imm:$src2), 0)]>;
+def TEST16ri : Ii16<0xF7, MRM0r,                     // flags = GR16 & imm16
+                    (ops GR16:$src1, i16imm:$src2),
+                    "test{w} {$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and GR16:$src1, imm:$src2), 0)]>, OpSize;
+def TEST32ri : Ii32<0xF7, MRM0r,                     // flags = GR32 & imm32
+                    (ops GR32:$src1, i32imm:$src2),
+                    "test{l} {$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and GR32:$src1, imm:$src2), 0)]>;
+
+def TEST8mi  : Ii8 <0xF6, MRM0m,                     // flags = [mem8]  & imm8
+                    (ops i8mem:$src1, i8imm:$src2),
+                    "test{b} {$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and (loadi8 addr:$src1), imm:$src2), 0)]>;
+def TEST16mi : Ii16<0xF7, MRM0m,                     // flags = [mem16] & imm16
+                    (ops i16mem:$src1, i16imm:$src2),
+                    "test{w} {$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and (loadi16 addr:$src1), imm:$src2), 0)]>,
+               OpSize;
+def TEST32mi : Ii32<0xF7, MRM0m,                     // flags = [mem32] & imm32
+                    (ops i32mem:$src1, i32imm:$src2),
+                    "test{l} {$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and (loadi32 addr:$src1), imm:$src2), 0)]>;
+
+
+// Condition code ops, incl. set if equal/not equal/...
+def SAHF     : I<0x9E, RawFrm, (ops), "sahf", []>, Imp<[AH],[]>;  // flags = AH
+def LAHF     : I<0x9F, RawFrm, (ops), "lahf", []>, Imp<[],[AH]>;  // AH = flags
+
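+// SETcc writes a single byte (0 or 1) to a GR8 register or a memory byte,
+// depending on the condition encoded in EFLAGS.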
+def SETEr    : I<0x94, MRM0r, 
+                 (ops GR8   :$dst),
+                 "sete $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_E))]>,
+               TB;                        // GR8 = ==
+def SETEm    : I<0x94, MRM0m, 
+                 (ops i8mem:$dst),
+                 "sete $dst",
+                 [(store (X86setcc X86_COND_E), addr:$dst)]>,
+               TB;                        // [mem8] = ==
+def SETNEr   : I<0x95, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setne $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_NE))]>,
+               TB;                        // GR8 = !=
+def SETNEm   : I<0x95, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setne $dst",
+                 [(store (X86setcc X86_COND_NE), addr:$dst)]>,
+               TB;                        // [mem8] = !=
+def SETLr    : I<0x9C, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setl $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_L))]>,
+               TB;                        // GR8 = <  signed
+def SETLm    : I<0x9C, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setl $dst",
+                 [(store (X86setcc X86_COND_L), addr:$dst)]>,
+               TB;                        // [mem8] = <  signed
+def SETGEr   : I<0x9D, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setge $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_GE))]>,
+               TB;                        // GR8 = >= signed
+def SETGEm   : I<0x9D, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setge $dst",
+                 [(store (X86setcc X86_COND_GE), addr:$dst)]>,
+               TB;                        // [mem8] = >= signed
+def SETLEr   : I<0x9E, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setle $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_LE))]>,
+               TB;                        // GR8 = <= signed
+def SETLEm   : I<0x9E, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setle $dst",
+                 [(store (X86setcc X86_COND_LE), addr:$dst)]>,
+               TB;                        // [mem8] = <= signed
+def SETGr    : I<0x9F, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setg $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_G))]>,
+               TB;                        // GR8 = >  signed
+def SETGm    : I<0x9F, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setg $dst",
+                 [(store (X86setcc X86_COND_G), addr:$dst)]>,
+               TB;                        // [mem8] = >  signed
+
+def SETBr    : I<0x92, MRM0r,
+                 (ops GR8   :$dst),
+                 "setb $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_B))]>,
+               TB;                        // GR8 = <  unsign
+def SETBm    : I<0x92, MRM0m,
+                 (ops i8mem:$dst),
+                 "setb $dst",
+                 [(store (X86setcc X86_COND_B), addr:$dst)]>,
+               TB;                        // [mem8] = <  unsign
+def SETAEr   : I<0x93, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setae $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_AE))]>,
+               TB;                        // GR8 = >= unsign
+def SETAEm   : I<0x93, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setae $dst",
+                 [(store (X86setcc X86_COND_AE), addr:$dst)]>,
+               TB;                        // [mem8] = >= unsign
+def SETBEr   : I<0x96, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setbe $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_BE))]>,
+               TB;                        // GR8 = <= unsign
+def SETBEm   : I<0x96, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setbe $dst",
+                 [(store (X86setcc X86_COND_BE), addr:$dst)]>,
+               TB;                        // [mem8] = <= unsign
+def SETAr    : I<0x97, MRM0r, 
+                 (ops GR8   :$dst),
+                 "seta $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_A))]>,
+               TB;                        // GR8 = >  unsign
+def SETAm    : I<0x97, MRM0m, 
+                 (ops i8mem:$dst),
+                 "seta $dst",
+                 [(store (X86setcc X86_COND_A), addr:$dst)]>,
+               TB;                        // [mem8] = >  unsign
+
+def SETSr    : I<0x98, MRM0r, 
+                 (ops GR8   :$dst),
+                 "sets $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_S))]>,
+               TB;                        // GR8 = <sign bit>
+def SETSm    : I<0x98, MRM0m, 
+                 (ops i8mem:$dst),
+                 "sets $dst",
+                 [(store (X86setcc X86_COND_S), addr:$dst)]>,
+               TB;                        // [mem8] = <sign bit>
+def SETNSr   : I<0x99, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setns $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_NS))]>,
+               TB;                        // GR8 = !<sign bit>
+def SETNSm   : I<0x99, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setns $dst",
+                 [(store (X86setcc X86_COND_NS), addr:$dst)]>,
+               TB;                        // [mem8] = !<sign bit>
+def SETPr    : I<0x9A, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setp $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_P))]>,
+               TB;                        // GR8 = parity
+def SETPm    : I<0x9A, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setp $dst",
+                 [(store (X86setcc X86_COND_P), addr:$dst)]>,
+               TB;                        // [mem8] = parity
+def SETNPr   : I<0x9B, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setnp $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_NP))]>,
+               TB;                        // GR8 = not parity
+def SETNPm   : I<0x9B, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setnp $dst",
+                 [(store (X86setcc X86_COND_NP), addr:$dst)]>,
+               TB;                        // [mem8] = not parity
+
+// Integer comparisons
+def CMP8rr  : I<0x38, MRMDestReg,
+                (ops GR8 :$src1, GR8 :$src2),
+                "cmp{b} {$src2, $src1|$src1, $src2}",
+                [(X86cmp GR8:$src1, GR8:$src2)]>;
+def CMP16rr : I<0x39, MRMDestReg,
+                (ops GR16:$src1, GR16:$src2),
+                "cmp{w} {$src2, $src1|$src1, $src2}",
+                [(X86cmp GR16:$src1, GR16:$src2)]>, OpSize;
+def CMP32rr : I<0x39, MRMDestReg,
+                (ops GR32:$src1, GR32:$src2),
+                "cmp{l} {$src2, $src1|$src1, $src2}",
+                [(X86cmp GR32:$src1, GR32:$src2)]>;
+def CMP8mr  : I<0x38, MRMDestMem,
+                (ops i8mem :$src1, GR8 :$src2),
+                "cmp{b} {$src2, $src1|$src1, $src2}",
+                [(X86cmp (loadi8 addr:$src1), GR8:$src2)]>;
+def CMP16mr : I<0x39, MRMDestMem,
+                (ops i16mem:$src1, GR16:$src2),
+                "cmp{w} {$src2, $src1|$src1, $src2}",
+                [(X86cmp (loadi16 addr:$src1), GR16:$src2)]>, OpSize;
+def CMP32mr : I<0x39, MRMDestMem,
+                (ops i32mem:$src1, GR32:$src2),
+                "cmp{l} {$src2, $src1|$src1, $src2}",
+                [(X86cmp (loadi32 addr:$src1), GR32:$src2)]>;
+def CMP8rm  : I<0x3A, MRMSrcMem,
+                (ops GR8 :$src1, i8mem :$src2),
+                "cmp{b} {$src2, $src1|$src1, $src2}",
+                [(X86cmp GR8:$src1, (loadi8 addr:$src2))]>;
+def CMP16rm : I<0x3B, MRMSrcMem,
+                (ops GR16:$src1, i16mem:$src2),
+                "cmp{w} {$src2, $src1|$src1, $src2}",
+                [(X86cmp GR16:$src1, (loadi16 addr:$src2))]>, OpSize;
+def CMP32rm : I<0x3B, MRMSrcMem,
+                (ops GR32:$src1, i32mem:$src2),
+                "cmp{l} {$src2, $src1|$src1, $src2}",
+                [(X86cmp GR32:$src1, (loadi32 addr:$src2))]>;
+def CMP8ri  : Ii8<0x80, MRM7r,
+                  (ops GR8:$src1, i8imm:$src2),
+                  "cmp{b} {$src2, $src1|$src1, $src2}",
+                  [(X86cmp GR8:$src1, imm:$src2)]>;
+def CMP16ri : Ii16<0x81, MRM7r,
+                   (ops GR16:$src1, i16imm:$src2),
+                   "cmp{w} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp GR16:$src1, imm:$src2)]>, OpSize;
+def CMP32ri : Ii32<0x81, MRM7r,
+                   (ops GR32:$src1, i32imm:$src2),
+                   "cmp{l} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp GR32:$src1, imm:$src2)]>;
+def CMP8mi  : Ii8 <0x80, MRM7m,
+                   (ops i8mem :$src1, i8imm :$src2),
+                   "cmp{b} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp (loadi8 addr:$src1), imm:$src2)]>;
+def CMP16mi : Ii16<0x81, MRM7m,
+                   (ops i16mem:$src1, i16imm:$src2),
+                   "cmp{w} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp (loadi16 addr:$src1), imm:$src2)]>, OpSize;
+def CMP32mi : Ii32<0x81, MRM7m,
+                   (ops i32mem:$src1, i32imm:$src2),
+                   "cmp{l} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp (loadi32 addr:$src1), imm:$src2)]>;
+def CMP16ri8 : Ii8<0x83, MRM7r,
+                   (ops GR16:$src1, i16i8imm:$src2),
+                   "cmp{w} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp GR16:$src1, i16immSExt8:$src2)]>, OpSize;
+def CMP16mi8 : Ii8<0x83, MRM7m,
+                   (ops i16mem:$src1, i16i8imm:$src2),
+                   "cmp{w} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp (loadi16 addr:$src1), i16immSExt8:$src2)]>, OpSize;
+def CMP32mi8 : Ii8<0x83, MRM7m,
+                   (ops i32mem:$src1, i32i8imm:$src2),
+                   "cmp{l} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp (loadi32 addr:$src1), i32immSExt8:$src2)]>;
+def CMP32ri8 : Ii8<0x83, MRM7r,
+                   (ops GR32:$src1, i32i8imm:$src2),
+                   "cmp{l} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp GR32:$src1, i32immSExt8:$src2)]>;
+
+// Sign/Zero extenders
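+// In the asm strings below, the {att|intel} braces pick the spelling for each
+// assembler dialect, e.g. "movsbl" (AT&T) vs. "movsx" (Intel).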
+def MOVSX16rr8 : I<0xBE, MRMSrcReg, (ops GR16:$dst, GR8 :$src),
+                   "movs{bw|x} {$src, $dst|$dst, $src}",
+                   [(set GR16:$dst, (sext GR8:$src))]>, TB, OpSize;
+def MOVSX16rm8 : I<0xBE, MRMSrcMem, (ops GR16:$dst, i8mem :$src),
+                   "movs{bw|x} {$src, $dst|$dst, $src}",
+                   [(set GR16:$dst, (sextloadi16i8 addr:$src))]>, TB, OpSize;
+def MOVSX32rr8 : I<0xBE, MRMSrcReg, (ops GR32:$dst, GR8 :$src),
+                   "movs{bl|x} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sext GR8:$src))]>, TB;
+def MOVSX32rm8 : I<0xBE, MRMSrcMem, (ops GR32:$dst, i8mem :$src),
+                   "movs{bl|x} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sextloadi32i8 addr:$src))]>, TB;
+def MOVSX32rr16: I<0xBF, MRMSrcReg, (ops GR32:$dst, GR16:$src),
+                   "movs{wl|x} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sext GR16:$src))]>, TB;
+def MOVSX32rm16: I<0xBF, MRMSrcMem, (ops GR32:$dst, i16mem:$src),
+                   "movs{wl|x} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, TB;
+
+def MOVZX16rr8 : I<0xB6, MRMSrcReg, (ops GR16:$dst, GR8 :$src),
+                   "movz{bw|x} {$src, $dst|$dst, $src}",
+                   [(set GR16:$dst, (zext GR8:$src))]>, TB, OpSize;
+def MOVZX16rm8 : I<0xB6, MRMSrcMem, (ops GR16:$dst, i8mem :$src),
+                   "movz{bw|x} {$src, $dst|$dst, $src}",
+                   [(set GR16:$dst, (zextloadi16i8 addr:$src))]>, TB, OpSize;
+def MOVZX32rr8 : I<0xB6, MRMSrcReg, (ops GR32:$dst, GR8 :$src),
+                   "movz{bl|x} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zext GR8:$src))]>, TB;
+def MOVZX32rm8 : I<0xB6, MRMSrcMem, (ops GR32:$dst, i8mem :$src),
+                   "movz{bl|x} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zextloadi32i8 addr:$src))]>, TB;
+def MOVZX32rr16: I<0xB7, MRMSrcReg, (ops GR32:$dst, GR16:$src),
+                   "movz{wl|x} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zext GR16:$src))]>, TB;
+def MOVZX32rm16: I<0xB7, MRMSrcMem, (ops GR32:$dst, i16mem:$src),
+                   "movz{wl|x} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zextloadi32i16 addr:$src))]>, TB;
+
+def CBW : I<0x98, RawFrm, (ops),
+            "{cbtw|cbw}", []>, Imp<[AL],[AX]>, OpSize;   // AX = signext(AL)
+def CWDE : I<0x98, RawFrm, (ops),
+            "{cwtl|cwde}", []>, Imp<[AX],[EAX]>;   // EAX = signext(AX)
+
+def CWD : I<0x99, RawFrm, (ops),
+            "{cwtd|cwd}", []>, Imp<[AX],[AX,DX]>, OpSize; // DX:AX = signext(AX)
+def CDQ : I<0x99, RawFrm, (ops),
+            "{cltd|cdq}", []>, Imp<[EAX],[EAX,EDX]>; // EDX:EAX = signext(EAX)
+          
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
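+// e.g. "xorl %eax, %eax" is 2 bytes vs. 5 for "movl $0, %eax", but it also
+// clobbers EFLAGS, which is why the register allocator can't use it blindly.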
+def MOV8r0   : I<0x30, MRMInitReg, (ops GR8 :$dst),
+                 "xor{b} $dst, $dst",
+                 [(set GR8:$dst, 0)]>;
+def MOV16r0  : I<0x31, MRMInitReg,  (ops GR16:$dst), 
+                 "xor{w} $dst, $dst",
+                 [(set GR16:$dst, 0)]>, OpSize;
+def MOV32r0  : I<0x31, MRMInitReg,  (ops GR32:$dst), 
+                 "xor{l} $dst, $dst",
+                 [(set GR32:$dst, 0)]>;
+
+// Basic operations on GR16 / GR32 subclasses GR16_ and GR32_, which contain
+// only those registers that have GR8 sub-registers (i.e. AX - DX, EAX - EDX).
+def MOV16to16_ : I<0x89, MRMDestReg, (ops GR16_:$dst, GR16:$src),
+                "mov{w} {$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32to32_ : I<0x89, MRMDestReg, (ops GR32_:$dst, GR32:$src),
+                "mov{l} {$src, $dst|$dst, $src}", []>;
+
+def MOV16_rr : I<0x89, MRMDestReg, (ops GR16_:$dst, GR16_:$src),
+                "mov{w} {$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32_rr : I<0x89, MRMDestReg, (ops GR32_:$dst, GR32_:$src),
+                "mov{l} {$src, $dst|$dst, $src}", []>;
+def MOV16_rm : I<0x8B, MRMSrcMem, (ops GR16_:$dst, i16mem:$src),
+                "mov{w} {$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32_rm : I<0x8B, MRMSrcMem, (ops GR32_:$dst, i32mem:$src),
+                "mov{l} {$src, $dst|$dst, $src}", []>;
+def MOV16_mr : I<0x89, MRMDestMem, (ops i16mem:$dst, GR16_:$src),
+                "mov{w} {$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32_mr : I<0x89, MRMDestMem, (ops i32mem:$dst, GR32_:$src),
+                "mov{l} {$src, $dst|$dst, $src}", []>;
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//
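+// On x86 ELF targets the thread pointer is loaded from %gs:0, so a TLS access
+// is a %gs-relative load plus an offset into the module's TLS block.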
+
+def TLS_addr : I<0, Pseudo, (ops GR32:$dst, i32imm:$sym),
+               "leal ${sym:mem}(,%ebx,1), $dst",
+               [(set GR32:$dst, (X86tlsaddr tglobaltlsaddr:$sym))]>,
+               Imp<[EBX],[]>;
+
+let AddedComplexity = 10 in
+def TLS_gs_rr : I<0, Pseudo, (ops GR32:$dst, GR32:$src),
+                  "movl %gs:($src), $dst",
+                  [(set GR32:$dst, (load (add X86TLStp, GR32:$src)))]>;
+
+let AddedComplexity = 15 in
+def TLS_gs_ri : I<0, Pseudo, (ops GR32:$dst, i32imm:$src),
+                  "movl %gs:${src:mem}, $dst",
+                  [(set GR32:$dst,
+                    (load (add X86TLStp, (X86Wrapper tglobaltlsaddr:$src))))]>;
+
+def TLS_tp : I<0, Pseudo, (ops GR32:$dst),
+               "movl %gs:0, $dst",
+               [(set GR32:$dst, X86TLStp)]>;
+
+//===----------------------------------------------------------------------===//
+// DWARF Pseudo Instructions
+//
+
+def DWARF_LOC   : I<0, Pseudo, (ops i32imm:$line, i32imm:$col, i32imm:$file),
+                    "; .loc $file, $line, $col",
+                    [(dwarf_loc (i32 imm:$line), (i32 imm:$col),
+                      (i32 imm:$file))]>;
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, noResults = 1 in {
+def EH_RETURN   : I<0xC3, RawFrm, (ops GR32:$addr),
+                    "ret #eh_return, addr: $addr",
+                    [(X86ehret GR32:$addr)]>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
+def : Pat<(i32 (X86Wrapper tconstpool  :$dst)), (MOV32ri tconstpool  :$dst)>;
+def : Pat<(i32 (X86Wrapper tjumptable  :$dst)), (MOV32ri tjumptable  :$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)), (MOV32ri tglobaltlsaddr:$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
+def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
+
+def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
+          (ADD32ri GR32:$src1, tconstpool:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
+          (ADD32ri GR32:$src1, tjumptable:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
+          (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
+          (ADD32ri GR32:$src1, texternalsym:$src2)>;
+
+def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+          (MOV32mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
+          (MOV32mi addr:$dst, texternalsym:$src)>;
+
+// Calls
+def : Pat<(X86tailcall GR32:$dst),
+          (CALL32r     GR32:$dst)>;
+
+def : Pat<(X86tailcall (i32 tglobaladdr:$dst)),
+          (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86tailcall (i32 texternalsym:$dst)),
+          (CALLpcrel32 texternalsym:$dst)>;
+
+def : Pat<(X86call (i32 tglobaladdr:$dst)),
+          (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i32 texternalsym:$dst)),
+          (CALLpcrel32 texternalsym:$dst)>;
+
+// X86 specific add which produces a flag.
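+// addc/subc produce a carry in EFLAGS and adde/sube consume it, so ADD/SUB
+// followed by ADC/SBB implement multi-word arithmetic.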
+def : Pat<(addc GR32:$src1, GR32:$src2),
+          (ADD32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(addc GR32:$src1, (load addr:$src2)),
+          (ADD32rm GR32:$src1, addr:$src2)>;
+def : Pat<(addc GR32:$src1, imm:$src2),
+          (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(addc GR32:$src1, i32immSExt8:$src2),
+          (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+def : Pat<(subc GR32:$src1, GR32:$src2),
+          (SUB32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(subc GR32:$src1, (load addr:$src2)),
+          (SUB32rm GR32:$src1, addr:$src2)>;
+def : Pat<(subc GR32:$src1, imm:$src2),
+          (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(subc GR32:$src1, i32immSExt8:$src2),
+          (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+def : Pat<(truncstorei1 (i8 imm:$src), addr:$dst), 
+          (MOV8mi addr:$dst, imm:$src)>;
+def : Pat<(truncstorei1 GR8:$src, addr:$dst), 
+          (MOV8mr addr:$dst, GR8:$src)>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
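+// e.g. "testl %eax, %eax" is 2 bytes, while "cmpl $0, %eax" needs at least 3.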
+def : Pat<(X86cmp GR8:$src1, 0),
+          (TEST8rr GR8:$src1, GR8:$src1)>;
+def : Pat<(X86cmp GR16:$src1, 0),
+          (TEST16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(X86cmp GR32:$src1, 0),
+          (TEST32rr GR32:$src1, GR32:$src1)>;
+
+// {s|z}extload bool -> {s|z}extload byte
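+// i1 values are stored as whole bytes, so a bool load is just a byte load;
+// only the extension of that byte to the wider type differs.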
+def : Pat<(sextloadi16i1 addr:$src), (MOVSX16rm8 addr:$src)>;
+def : Pat<(sextloadi32i1 addr:$src), (MOVSX32rm8 addr:$src)>;
+def : Pat<(zextloadi8i1  addr:$src), (MOV8rm     addr:$src)>;
+def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+
+// extload bool -> extload byte
+def : Pat<(extloadi8i1 addr:$src),   (MOV8rm      addr:$src)>;
+def : Pat<(extloadi16i1 addr:$src),  (MOVZX16rm8  addr:$src)>;
+def : Pat<(extloadi32i1 addr:$src),  (MOVZX32rm8  addr:$src)>;
+def : Pat<(extloadi16i8 addr:$src),  (MOVZX16rm8  addr:$src)>;
+def : Pat<(extloadi32i8 addr:$src),  (MOVZX32rm8  addr:$src)>;
+def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
+
+// anyext -> zext
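+// An anyext leaves the high bits unspecified, so zero-extending with movzx is
+// a valid choice (and avoids partial-register stalls).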
+def : Pat<(i16 (anyext GR8 :$src)), (MOVZX16rr8  GR8 :$src)>;
+def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8  GR8 :$src)>;
+def : Pat<(i32 (anyext GR16:$src)), (MOVZX32rr16 GR16:$src)>;
+def : Pat<(i16 (anyext (loadi8  addr:$src))), (MOVZX16rm8  addr:$src)>;
+def : Pat<(i32 (anyext (loadi8  addr:$src))), (MOVZX32rm8  addr:$src)>;
+def : Pat<(i32 (anyext (loadi16 addr:$src))), (MOVZX32rm16 addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// (shl x, 1) ==> (add x, x)
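+// add reg,reg is no larger than shl reg,1 and is commutable, which gives the
+// register allocator a little more freedom.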
+def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr  GR8 :$src1, GR8 :$src1)>;
+def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
+
+// (or (x >> c) | (y << (32 - c))) ==> (shrd32 x, y, c)
+def : Pat<(or (srl GR32:$src1, CL:$amt),
+              (shl GR32:$src2, (sub 32, CL:$amt))),
+          (SHRD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (srl (loadi32 addr:$dst), CL:$amt),
+                     (shl GR32:$src2, (sub 32, CL:$amt))), addr:$dst),
+          (SHRD32mrCL addr:$dst, GR32:$src2)>;
+
+// (or (x << c) | (y >> (32 - c))) ==> (shld32 x, y, c)
+def : Pat<(or (shl GR32:$src1, CL:$amt),
+              (srl GR32:$src2, (sub 32, CL:$amt))),
+          (SHLD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (shl (loadi32 addr:$dst), CL:$amt),
+                     (srl GR32:$src2, (sub 32, CL:$amt))), addr:$dst),
+          (SHLD32mrCL addr:$dst, GR32:$src2)>;
+
+// (or (x >> c) | (y << (16 - c))) ==> (shrd16 x, y, c)
+def : Pat<(or (srl GR16:$src1, CL:$amt),
+              (shl GR16:$src2, (sub 16, CL:$amt))),
+          (SHRD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (srl (loadi16 addr:$dst), CL:$amt),
+                     (shl GR16:$src2, (sub 16, CL:$amt))), addr:$dst),
+          (SHRD16mrCL addr:$dst, GR16:$src2)>;
+
+// (or (x << c) | (y >> (16 - c))) ==> (shld16 x, y, c)
+def : Pat<(or (shl GR16:$src1, CL:$amt),
+              (srl GR16:$src2, (sub 16, CL:$amt))),
+          (SHLD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (shl (loadi16 addr:$dst), CL:$amt),
+                     (srl GR16:$src2, (sub 16, CL:$amt))), addr:$dst),
+          (SHLD16mrCL addr:$dst, GR16:$src2)>;
+
+
+//===----------------------------------------------------------------------===//
+// Floating Point Stack Support
+//===----------------------------------------------------------------------===//
+
+include "X86InstrFPStack.td"
+
+//===----------------------------------------------------------------------===//
+// MMX and XMM Packed Integer support (requires MMX, SSE, and SSE2)
+//===----------------------------------------------------------------------===//
+
+include "X86InstrMMX.td"
+
+//===----------------------------------------------------------------------===//
+// XMM Floating point support (requires SSE / SSE2)
+//===----------------------------------------------------------------------===//
+
+include "X86InstrSSE.td"
+
+//===----------------------------------------------------------------------===//
+// X86-64 Support
+//===----------------------------------------------------------------------===//
+
+include "X86InstrX86-64.td"
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
new file mode 100644
index 0000000..c774460
--- /dev/null
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -0,0 +1,645 @@
+//====- X86InstrMMX.td - Describe the X86 Instruction Set --*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MMX instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction templates
+//===----------------------------------------------------------------------===//
+
+// MMXI   - MMX instructions with TB prefix.
+// MMXRI  - MMX instructions with TB prefix and REX.W (64-bit GPR operands).
+// MMX2I  - MMX / SSE2 instructions with TB and OpSize prefixes.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
+// MMXID  - MMX instructions with XD prefix.
+// MMXIS  - MMX instructions with XS prefix.
+class MMXI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, TB, Requires<[HasMMX]>;
+class MMXRI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, TB, REX_W, Requires<[HasMMX]>;
+class MMX2I<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasMMX]>;
+class MMXIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : Ii8<o, F, ops, asm, pattern>, TB, Requires<[HasMMX]>;
+class MMXID<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : Ii8<o, F, ops, asm, pattern>, XD, Requires<[HasMMX]>;
+class MMXIS<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : Ii8<o, F, ops, asm, pattern>, XS, Requires<[HasMMX]>;
+
+// Some 'special' instructions
+def IMPLICIT_DEF_VR64 : I<0, Pseudo, (ops VR64:$dst),
+                          "#IMPLICIT_DEF $dst",
+                          [(set VR64:$dst, (v8i8 (undef)))]>,
+                        Requires<[HasMMX]>;
+
+// 64-bit vector undefs.
+def : Pat<(v8i8  (undef)), (IMPLICIT_DEF_VR64)>;
+def : Pat<(v4i16 (undef)), (IMPLICIT_DEF_VR64)>;
+def : Pat<(v2i32 (undef)), (IMPLICIT_DEF_VR64)>;
+def : Pat<(v1i64 (undef)), (IMPLICIT_DEF_VR64)>;
+
+//===----------------------------------------------------------------------===//
+// MMX Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def load_mmx : PatFrag<(ops node:$ptr), (v1i64 (load node:$ptr))>;
+
+def bc_v8i8  : PatFrag<(ops node:$in), (v8i8  (bitconvert node:$in))>;
+def bc_v4i16 : PatFrag<(ops node:$in), (v4i16 (bitconvert node:$in))>;
+def bc_v2i32 : PatFrag<(ops node:$in), (v2i32 (bitconvert node:$in))>;
+def bc_v1i64 : PatFrag<(ops node:$in), (v1i64 (bitconvert node:$in))>;
+
+//===----------------------------------------------------------------------===//
+// MMX Masks
+//===----------------------------------------------------------------------===//
+
+// MMX_SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to
+// PSHUFW imm.
+def MMX_SHUFFLE_get_shuf_imm : SDNodeXForm<build_vector, [{
+  return getI8Imm(X86::getShuffleSHUFImmediate(N));
+}]>;
+
+// Patterns for: vector_shuffle v1, v2, <2, 6, 3, 7, ...>
+def MMX_UNPCKH_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKHMask(N);
+}]>;
+
+// Patterns for: vector_shuffle v1, v2, <0, 4, 2, 5, ...>
+def MMX_UNPCKL_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKLMask(N);
+}]>;
+
+// Patterns for: vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
+def MMX_UNPCKH_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKH_v_undef_Mask(N);
+}]>;
+
+// Patterns for: vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
+def MMX_UNPCKL_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKL_v_undef_Mask(N);
+}]>;
+
+// Patterns for shuffling.
+def MMX_PSHUFW_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isPSHUFDMask(N);
+}], MMX_SHUFFLE_get_shuf_imm>;
+
+// Patterns for: vector_shuffle v1, v2, <4, 5, 2, 3>; etc.
+def MMX_MOVL_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isMOVLMask(N);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// MMX Multiclasses
+//===----------------------------------------------------------------------===//
+
+let isTwoAddress = 1 in {
+  // MMXI_binop_rm - Simple MMX binary operator.
+  multiclass MMXI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           ValueType OpVT, bit Commutable = 0> {
+    def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                  !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (OpVT (OpNode VR64:$src1, VR64:$src2)))]> {
+      let isCommutable = Commutable;
+    }
+    def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                  !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (OpVT (OpNode VR64:$src1,
+                                         (bitconvert
+                                          (load_mmx addr:$src2)))))]>;
+  }
+
+  multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               bit Commutable = 0> {
+    def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                 !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                 [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]> {
+      let isCommutable = Commutable;
+    }
+    def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                 !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                 [(set VR64:$dst, (IntId VR64:$src1,
+                                   (bitconvert (load_mmx addr:$src2))))]>;
+  }
+
+  // MMXI_binop_rm_v1i64 - Simple MMX binary operator whose type is v1i64.
+  //
+  // FIXME: we could eliminate this and use MMXI_binop_rm instead if tblgen knew
+  // to collapse (bitconvert VT to VT) into its operand.
+  //
+  multiclass MMXI_binop_rm_v1i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                 bit Commutable = 0> {
+    def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                  !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (v1i64 (OpNode VR64:$src1, VR64:$src2)))]> {
+      let isCommutable = Commutable;
+    }
+    def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                  !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst,
+                    (OpNode VR64:$src1,(load_mmx addr:$src2)))]>;
+  }
+
+  multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+                                string OpcodeStr, Intrinsic IntId> {
+    def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                  !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>;
+    def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                  !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (IntId VR64:$src1,
+                                    (bitconvert (load_mmx addr:$src2))))]>;
+    def ri : MMXIi8<opc2, ImmForm, (ops VR64:$dst, VR64:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                    [(set VR64:$dst, (IntId VR64:$src1,
+                                      (scalar_to_vector (i32 imm:$src2))))]>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// MMX EMMS & FEMMS Instructions
+//===----------------------------------------------------------------------===//
+
+def MMX_EMMS  : MMXI<0x77, RawFrm, (ops), "emms",  [(int_x86_mmx_emms)]>;
+def MMX_FEMMS : MMXI<0x0E, RawFrm, (ops), "femms", [(int_x86_mmx_femms)]>;
+
+//===----------------------------------------------------------------------===//
+// MMX Scalar Instructions
+//===----------------------------------------------------------------------===//
+
+// Data Transfer Instructions
+def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (ops VR64:$dst, GR32:$src),
+                        "movd {$src, $dst|$dst, $src}", []>;
+def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (ops VR64:$dst, i32mem:$src),
+                        "movd {$src, $dst|$dst, $src}", []>;
+def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (ops i32mem:$dst, VR64:$src),
+                        "movd {$src, $dst|$dst, $src}", []>;
+
+def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (ops VR64:$dst, GR64:$src),
+                             "movd {$src, $dst|$dst, $src}", []>;
+
+def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (ops VR64:$dst, VR64:$src),
+                        "movq {$src, $dst|$dst, $src}", []>;
+def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (ops VR64:$dst, i64mem:$src),
+                        "movq {$src, $dst|$dst, $src}",
+                        [(set VR64:$dst, (load_mmx addr:$src))]>;
+def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (ops i64mem:$dst, VR64:$src),
+                        "movq {$src, $dst|$dst, $src}",
+                        [(store (v1i64 VR64:$src), addr:$dst)]>;
+
+def MMX_MOVDQ2Qrr : MMXID<0xD6, MRMDestMem, (ops VR64:$dst, VR128:$src),
+                          "movdq2q {$src, $dst|$dst, $src}",
+                          [(set VR64:$dst,
+                            (v1i64 (vector_extract (v2i64 VR128:$src),
+                                  (iPTR 0))))]>;
+
+def MMX_MOVQ2DQrr : MMXIS<0xD6, MRMDestMem, (ops VR128:$dst, VR64:$src),
+                          "movq2dq {$src, $dst|$dst, $src}",
+                          [(set VR128:$dst,
+                            (bitconvert (v1i64 VR64:$src)))]>;
+
+def MMX_MOVNTQmr  : MMXI<0xE7, MRMDestMem, (ops i64mem:$dst, VR64:$src),
+                         "movntq {$src, $dst|$dst, $src}",
+                         [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>;
+
+let AddedComplexity = 15 in
+// movd to MMX register zero-extends
+def MMX_MOVZDI2PDIrr : MMX2I<0x6E, MRMSrcReg, (ops VR64:$dst, GR32:$src),
+                             "movd {$src, $dst|$dst, $src}",
+                             [(set VR64:$dst,
+                               (v2i32 (vector_shuffle immAllZerosV,
+                                       (v2i32 (scalar_to_vector GR32:$src)),
+                                       MMX_MOVL_shuffle_mask)))]>;
+let AddedComplexity = 20 in
+def MMX_MOVZDI2PDIrm : MMX2I<0x6E, MRMSrcMem, (ops VR64:$dst, i32mem:$src),
+                             "movd {$src, $dst|$dst, $src}",
+                             [(set VR64:$dst,
+                               (v2i32 (vector_shuffle immAllZerosV,
+                                       (v2i32 (scalar_to_vector
+                                               (loadi32 addr:$src))),
+                                       MMX_MOVL_shuffle_mask)))]>;
+
+// Arithmetic Instructions
+
+// -- Addition
+defm MMX_PADDB : MMXI_binop_rm<0xFC, "paddb", add, v8i8,  1>;
+defm MMX_PADDW : MMXI_binop_rm<0xFD, "paddw", add, v4i16, 1>;
+defm MMX_PADDD : MMXI_binop_rm<0xFE, "paddd", add, v2i32, 1>;
+defm MMX_PADDQ : MMXI_binop_rm<0xD4, "paddq", add, v1i64, 1>;
+
+defm MMX_PADDSB  : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, 1>;
+defm MMX_PADDSW  : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, 1>;
+
+// -- Subtraction
+defm MMX_PSUBB : MMXI_binop_rm<0xF8, "psubb", sub, v8i8>;
+defm MMX_PSUBW : MMXI_binop_rm<0xF9, "psubw", sub, v4i16>;
+defm MMX_PSUBD : MMXI_binop_rm<0xFA, "psubd", sub, v2i32>;
+defm MMX_PSUBQ : MMXI_binop_rm<0xFB, "psubq", sub, v1i64>;
+
+defm MMX_PSUBSB  : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b>;
+defm MMX_PSUBSW  : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w>;
+
+// -- Multiplication
+defm MMX_PMULLW  : MMXI_binop_rm<0xD5, "pmullw", mul, v4i16, 1>;
+
+defm MMX_PMULHW  : MMXI_binop_rm_int<0xE5, "pmulhw",  int_x86_mmx_pmulh_w,  1>;
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, 1>;
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, 1>;
+
+// -- Miscellanea
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, 1>;
+
+defm MMX_PAVGB   : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, 1>;
+defm MMX_PAVGW   : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, 1>;
+
+defm MMX_PMINUB  : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, 1>;
+defm MMX_PMINSW  : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, 1>;
+
+defm MMX_PMAXUB  : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, 1>;
+defm MMX_PMAXSW  : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, 1>;
+
+defm MMX_PSADBW  : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, 1>;
+
+// Logical Instructions
+defm MMX_PAND : MMXI_binop_rm_v1i64<0xDB, "pand", and, 1>;
+defm MMX_POR  : MMXI_binop_rm_v1i64<0xEB, "por" , or,  1>;
+defm MMX_PXOR : MMXI_binop_rm_v1i64<0xEF, "pxor", xor, 1>;
+
+let isTwoAddress = 1 in {
+  def MMX_PANDNrr : MMXI<0xDF, MRMSrcReg,
+                         (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                         "pandn {$src2, $dst|$dst, $src2}",
+                         [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1),
+                                                  VR64:$src2)))]>;
+  def MMX_PANDNrm : MMXI<0xDF, MRMSrcMem,
+                         (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                         "pandn {$src2, $dst|$dst, $src2}",
+                         [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1),
+                                                  (load addr:$src2))))]>;
+}
+
+// Shift Instructions
+defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+                                    int_x86_mmx_psrl_w>;
+defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+                                    int_x86_mmx_psrl_d>;
+defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+                                    int_x86_mmx_psrl_q>;
+
+defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+                                    int_x86_mmx_psll_w>;
+defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+                                    int_x86_mmx_psll_d>;
+defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+                                    int_x86_mmx_psll_q>;
+
+defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+                                    int_x86_mmx_psra_w>;
+defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+                                    int_x86_mmx_psra_d>;
+
+// Comparison Instructions
+defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>;
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>;
+defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d>;
+
+defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b>;
+defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w>;
+defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d>;
+
+// Conversion Instructions
+
+// -- Unpack Instructions
+let isTwoAddress = 1 in {
+  // Unpack High Packed Data Instructions
+  def MMX_PUNPCKHBWrr : MMXI<0x68, MRMSrcReg, 
+                             (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                             "punpckhbw {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v8i8 (vector_shuffle VR64:$src1, VR64:$src2,
+                                      MMX_UNPCKH_shuffle_mask)))]>;
+  def MMX_PUNPCKHBWrm : MMXI<0x68, MRMSrcMem, 
+                             (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                             "punpckhbw {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v8i8 (vector_shuffle VR64:$src1,
+                                      (bc_v8i8 (load_mmx addr:$src2)),
+                                      MMX_UNPCKH_shuffle_mask)))]>;
+
+  def MMX_PUNPCKHWDrr : MMXI<0x69, MRMSrcReg, 
+                             (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                             "punpckhwd {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v4i16 (vector_shuffle VR64:$src1, VR64:$src2,
+                                       MMX_UNPCKH_shuffle_mask)))]>;
+  def MMX_PUNPCKHWDrm : MMXI<0x69, MRMSrcMem, 
+                             (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                             "punpckhwd {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v4i16 (vector_shuffle VR64:$src1,
+                                       (bc_v4i16 (load_mmx addr:$src2)),
+                                       MMX_UNPCKH_shuffle_mask)))]>;
+
+  def MMX_PUNPCKHDQrr : MMXI<0x6A, MRMSrcReg, 
+                             (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                             "punpckhdq {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v2i32 (vector_shuffle VR64:$src1, VR64:$src2,
+                                       MMX_UNPCKH_shuffle_mask)))]>;
+  def MMX_PUNPCKHDQrm : MMXI<0x6A, MRMSrcMem,
+                             (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                             "punpckhdq {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v2i32 (vector_shuffle VR64:$src1,
+                                       (bc_v2i32 (load_mmx addr:$src2)),
+                                       MMX_UNPCKH_shuffle_mask)))]>;
+
+  // Unpack Low Packed Data Instructions
+  def MMX_PUNPCKLBWrr : MMXI<0x60, MRMSrcReg,
+                             (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                             "punpcklbw {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v8i8 (vector_shuffle VR64:$src1, VR64:$src2,
+                                      MMX_UNPCKL_shuffle_mask)))]>;
+  def MMX_PUNPCKLBWrm : MMXI<0x60, MRMSrcMem,
+                             (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                             "punpcklbw {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v8i8 (vector_shuffle VR64:$src1,
+                                      (bc_v8i8 (load_mmx addr:$src2)),
+                                      MMX_UNPCKL_shuffle_mask)))]>;
+
+  def MMX_PUNPCKLWDrr : MMXI<0x61, MRMSrcReg,
+                             (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                             "punpcklwd {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v4i16 (vector_shuffle VR64:$src1, VR64:$src2,
+                                       MMX_UNPCKL_shuffle_mask)))]>;
+  def MMX_PUNPCKLWDrm : MMXI<0x61, MRMSrcMem,
+                             (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                             "punpcklwd {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v4i16 (vector_shuffle VR64:$src1,
+                                       (bc_v4i16 (load_mmx addr:$src2)),
+                                       MMX_UNPCKL_shuffle_mask)))]>;
+
+  def MMX_PUNPCKLDQrr : MMXI<0x62, MRMSrcReg, 
+                             (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                             "punpckldq {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v2i32 (vector_shuffle VR64:$src1, VR64:$src2,
+                                       MMX_UNPCKL_shuffle_mask)))]>;
+  def MMX_PUNPCKLDQrm : MMXI<0x62, MRMSrcMem, 
+                             (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                             "punpckldq {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v2i32 (vector_shuffle VR64:$src1,
+                                       (bc_v2i32 (load_mmx addr:$src2)),
+                                       MMX_UNPCKL_shuffle_mask)))]>;
+}
+
+// -- Pack Instructions
+defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb>;
+defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw>;
+defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb>;
+
+// -- Shuffle Instructions
+def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
+                          (ops VR64:$dst, VR64:$src1, i8imm:$src2),
+                          "pshufw {$src2, $src1, $dst|$dst, $src1, $src2}",
+                          [(set VR64:$dst,
+                            (v4i16 (vector_shuffle
+                                    VR64:$src1, (undef),
+                                    MMX_PSHUFW_shuffle_mask:$src2)))]>;
+def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
+                          (ops VR64:$dst, i64mem:$src1, i8imm:$src2),
+                          "pshufw {$src2, $src1, $dst|$dst, $src1, $src2}",
+                          [(set VR64:$dst,
+                            (v4i16 (vector_shuffle
+                                    (bc_v4i16 (load_mmx addr:$src1)),
+                                    (undef),
+                                    MMX_PSHUFW_shuffle_mask:$src2)))]>;
+
+// -- Conversion Instructions
+def MMX_CVTPD2PIrr  : MMX2I<0x2D, MRMSrcReg, (ops VR64:$dst, VR128:$src),
+                            "cvtpd2pi {$src, $dst|$dst, $src}", []>;
+def MMX_CVTPD2PIrm  : MMX2I<0x2D, MRMSrcMem, (ops VR64:$dst, f128mem:$src),
+                            "cvtpd2pi {$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPI2PDrr  : MMX2I<0x2A, MRMSrcReg, (ops VR128:$dst, VR64:$src),
+                            "cvtpi2pd {$src, $dst|$dst, $src}", []>;
+def MMX_CVTPI2PDrm  : MMX2I<0x2A, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+                            "cvtpi2pd {$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPI2PSrr  : MMXI<0x2A, MRMSrcReg, (ops VR128:$dst, VR64:$src),
+                           "cvtpi2ps {$src, $dst|$dst, $src}", []>;
+def MMX_CVTPI2PSrm  : MMXI<0x2A, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+                           "cvtpi2ps {$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPS2PIrr  : MMXI<0x2D, MRMSrcReg, (ops VR64:$dst, VR128:$src),
+                           "cvtps2pi {$src, $dst|$dst, $src}", []>;
+def MMX_CVTPS2PIrm  : MMXI<0x2D, MRMSrcMem, (ops VR64:$dst, f64mem:$src),
+                           "cvtps2pi {$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTTPD2PIrr : MMX2I<0x2C, MRMSrcReg, (ops VR64:$dst, VR128:$src),
+                            "cvttpd2pi {$src, $dst|$dst, $src}", []>;
+def MMX_CVTTPD2PIrm : MMX2I<0x2C, MRMSrcMem, (ops VR64:$dst, f128mem:$src),
+                            "cvttpd2pi {$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTTPS2PIrr : MMXI<0x2C, MRMSrcReg, (ops VR64:$dst, VR128:$src),
+                           "cvttps2pi {$src, $dst|$dst, $src}", []>;
+def MMX_CVTTPS2PIrm : MMXI<0x2C, MRMSrcMem, (ops VR64:$dst, f64mem:$src),
+                           "cvttps2pi {$src, $dst|$dst, $src}", []>;
+
+// Extract / Insert
+def MMX_X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>;
+def MMX_X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>;
+
+def MMX_PEXTRWri  : MMXIi8<0xC5, MRMSrcReg,
+                           (ops GR32:$dst, VR64:$src1, i16i8imm:$src2),
+                           "pextrw {$src2, $src1, $dst|$dst, $src1, $src2}",
+                           [(set GR32:$dst, (MMX_X86pextrw (v4i16 VR64:$src1),
+                                             (iPTR imm:$src2)))]>;
+let isTwoAddress = 1 in {
+  def MMX_PINSRWrri : MMXIi8<0xC4, MRMSrcReg,
+                      (ops VR64:$dst, VR64:$src1, GR32:$src2, i16i8imm:$src3),
+                      "pinsrw {$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(set VR64:$dst, (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
+                                               GR32:$src2, (iPTR imm:$src3))))]>;
+  def MMX_PINSRWrmi : MMXIi8<0xC4, MRMSrcMem,
+                     (ops VR64:$dst, VR64:$src1, i16mem:$src2, i16i8imm:$src3),
+                     "pinsrw {$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set VR64:$dst,
+                       (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
+                               (i32 (anyext (loadi16 addr:$src2))),
+                               (iPTR imm:$src3))))]>;
+}
+
+// Mask creation
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (ops GR32:$dst, VR64:$src),
+                          "pmovmskb {$src, $dst|$dst, $src}",
+                          [(set GR32:$dst, (int_x86_mmx_pmovmskb VR64:$src))]>;
+
+// Misc.
+def MMX_MASKMOVQ : MMXI<0xF7, MRMDestMem, (ops VR64:$src, VR64:$mask),
+                        "maskmovq {$mask, $src|$src, $mask}",
+                        [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>,
+                        Imp<[EDI],[]>;
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map zero vector to pxor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+let isReMaterializable = 1 in {
+  def MMX_V_SET0       : MMXI<0xEF, MRMInitReg, (ops VR64:$dst),
+                              "pxor $dst, $dst",
+                              [(set VR64:$dst, (v1i64 immAllZerosV))]>;
+  def MMX_V_SETALLONES : MMXI<0x76, MRMInitReg, (ops VR64:$dst),
+                              "pcmpeqd $dst, $dst",
+                              [(set VR64:$dst, (v1i64 immAllOnesV))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Store 64-bit integer vector values.
+def : Pat<(store (v8i8  VR64:$src), addr:$dst),
+          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v4i16 VR64:$src), addr:$dst),
+          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v2i32 VR64:$src), addr:$dst),
+          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v1i64 VR64:$src), addr:$dst),
+          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+
+// 64-bit vector of all zeros.
+def : Pat<(v8i8  immAllZerosV), (MMX_V_SET0)>;
+def : Pat<(v4i16 immAllZerosV), (MMX_V_SET0)>;
+def : Pat<(v2i32 immAllZerosV), (MMX_V_SET0)>;
+def : Pat<(v1i64 immAllZerosV), (MMX_V_SET0)>;
+
+// 64-bit vector of all ones.
+def : Pat<(v8i8  immAllOnesV), (MMX_V_SETALLONES)>;
+def : Pat<(v4i16 immAllOnesV), (MMX_V_SETALLONES)>;
+def : Pat<(v2i32 immAllOnesV), (MMX_V_SETALLONES)>;
+def : Pat<(v1i64 immAllOnesV), (MMX_V_SETALLONES)>;
+
+// Bit convert.
+def : Pat<(v8i8  (bitconvert (v1i64 VR64:$src))), (v8i8  VR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v2i32 VR64:$src))), (v8i8  VR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v4i16 VR64:$src))), (v8i8  VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1i64 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8  VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1i64 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8  VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2i32 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4i16 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v8i8  VR64:$src))), (v1i64 VR64:$src)>;
+
+// 64-bit bit convert.
+def : Pat<(v1i64 (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v2i32 (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v4i16 (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v8i8  (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
+
+def MMX_X86s2vec : SDNode<"X86ISD::S2VEC",  SDTypeProfile<1, 1, []>, []>;
+
+// Move scalar to MMX register, zero-extended
+// (movd to an MMX register zero-extends).
+let AddedComplexity = 15 in {
+  def : Pat<(v8i8 (vector_shuffle immAllZerosV,
+                    (v8i8 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
+            (MMX_MOVZDI2PDIrr GR32:$src)>;
+  def : Pat<(v4i16 (vector_shuffle immAllZerosV,
+                    (v4i16 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
+            (MMX_MOVZDI2PDIrr GR32:$src)>;
+  def : Pat<(v2i32 (vector_shuffle immAllZerosV,
+                    (v2i32 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
+            (MMX_MOVZDI2PDIrr GR32:$src)>;
+}
+
+// Scalar to v2i32 / v4i16 / v8i8. The source may be a GR32, but only the lower
+// 8 or 16 bits matter.
+def : Pat<(v8i8  (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>;
+def : Pat<(v4i16 (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>;
+def : Pat<(v2i32 (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>;
+
+// Patterns to perform canonical versions of vector shuffling.
+let AddedComplexity = 10 in {
+  def : Pat<(v8i8  (vector_shuffle VR64:$src, (undef),
+                    MMX_UNPCKL_v_undef_shuffle_mask)),
+            (MMX_PUNPCKLBWrr VR64:$src, VR64:$src)>;
+  def : Pat<(v4i16 (vector_shuffle VR64:$src, (undef),
+                    MMX_UNPCKL_v_undef_shuffle_mask)),
+            (MMX_PUNPCKLWDrr VR64:$src, VR64:$src)>;
+  def : Pat<(v2i32 (vector_shuffle VR64:$src, (undef),
+                    MMX_UNPCKL_v_undef_shuffle_mask)),
+            (MMX_PUNPCKLDQrr VR64:$src, VR64:$src)>;
+}
+
+let AddedComplexity = 10 in {
+  def : Pat<(v8i8  (vector_shuffle VR64:$src, (undef),
+                    MMX_UNPCKH_v_undef_shuffle_mask)),
+            (MMX_PUNPCKHBWrr VR64:$src, VR64:$src)>;
+  def : Pat<(v4i16 (vector_shuffle VR64:$src, (undef),
+                    MMX_UNPCKH_v_undef_shuffle_mask)),
+            (MMX_PUNPCKHWDrr VR64:$src, VR64:$src)>;
+  def : Pat<(v2i32 (vector_shuffle VR64:$src, (undef),
+                    MMX_UNPCKH_v_undef_shuffle_mask)),
+            (MMX_PUNPCKHDQrr VR64:$src, VR64:$src)>;
+}
+
+// Patterns to perform vector shuffling with a zeroed out vector.
+let AddedComplexity = 20 in {
+  def : Pat<(bc_v2i32 (vector_shuffle immAllZerosV,
+                       (v2i32 (scalar_to_vector (load_mmx addr:$src))),
+                       MMX_UNPCKL_shuffle_mask)),
+            (MMX_PUNPCKLDQrm VR64:$src, VR64:$src)>;
+}
+
+// Some special case PANDN patterns.
+// FIXME: Get rid of these.
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
+                  VR64:$src2)),
+          (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV))),
+                  VR64:$src2)),
+          (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8  immAllOnesV))),
+                  VR64:$src2)),
+          (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
+                  (load addr:$src2))),
+          (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV))),
+                  (load addr:$src2))),
+          (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8  immAllOnesV))),
+                  (load addr:$src2))),
+          (MMX_PANDNrm VR64:$src1, addr:$src2)>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
new file mode 100644
index 0000000..5fc7a65
--- /dev/null
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -0,0 +1,2572 @@
+//====- X86InstrSSE.td - Describe the X86 Instruction Set -------*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under the University
+// of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 SSE instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// SSE specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
+                                            SDTCisFP<0>, SDTCisInt<2> ]>;
+
+def X86loadp   : SDNode<"X86ISD::LOAD_PACK", SDTLoad, [SDNPHasChain]>;
+def X86loadu   : SDNode<"X86ISD::LOAD_UA",   SDTLoad, [SDNPHasChain]>;
+def X86fmin    : SDNode<"X86ISD::FMIN",      SDTFPBinOp>;
+def X86fmax    : SDNode<"X86ISD::FMAX",      SDTFPBinOp>;
+def X86fand    : SDNode<"X86ISD::FAND",      SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86for     : SDNode<"X86ISD::FOR",       SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86fxor    : SDNode<"X86ISD::FXOR",      SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86frsqrt  : SDNode<"X86ISD::FRSQRT",    SDTFPUnaryOp>;
+def X86frcp    : SDNode<"X86ISD::FRCP",      SDTFPUnaryOp>;
+def X86fsrl    : SDNode<"X86ISD::FSRL",      SDTX86FPShiftOp>;
+def X86comi    : SDNode<"X86ISD::COMI",      SDTX86CmpTest,
+                        [SDNPHasChain, SDNPOutFlag]>;
+def X86ucomi   : SDNode<"X86ISD::UCOMI",     SDTX86CmpTest,
+                        [SDNPHasChain, SDNPOutFlag]>;
+def X86s2vec   : SDNode<"X86ISD::S2VEC",  SDTypeProfile<1, 1, []>, []>;
+def X86pextrw  : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>;
+def X86pinsrw  : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>;
+
+//===----------------------------------------------------------------------===//
+// SSE 'Special' Instructions
+//===----------------------------------------------------------------------===//
+
+def IMPLICIT_DEF_VR128 : I<0, Pseudo, (ops VR128:$dst),
+                           "#IMPLICIT_DEF $dst",
+                           [(set VR128:$dst, (v4f32 (undef)))]>,
+                         Requires<[HasSSE1]>;
+def IMPLICIT_DEF_FR32  : I<0, Pseudo, (ops FR32:$dst),
+                           "#IMPLICIT_DEF $dst",
+                           [(set FR32:$dst, (undef))]>, Requires<[HasSSE2]>;
+def IMPLICIT_DEF_FR64  : I<0, Pseudo, (ops FR64:$dst),
+                           "#IMPLICIT_DEF $dst",
+                           [(set FR64:$dst, (undef))]>, Requires<[HasSSE2]>;
+
+//===----------------------------------------------------------------------===//
+// SSE Complex Patterns
+//===----------------------------------------------------------------------===//
+
+// These are 'extloads' from a scalar to the low element of a vector, zeroing
+// the top elements.  These are used for the SSE 'ss' and 'sd' instruction
+// forms.
+def sse_load_f32 : ComplexPattern<v4f32, 4, "SelectScalarSSELoad", [],
+                                  [SDNPHasChain]>;
+def sse_load_f64 : ComplexPattern<v2f64, 4, "SelectScalarSSELoad", [],
+                                  [SDNPHasChain]>;
+
+def ssmem : Operand<v4f32> {
+  let PrintMethod = "printf32mem";
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm);
+}
+def sdmem : Operand<v2f64> {
+  let PrintMethod = "printf64mem";
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm);
+}
+
+//===----------------------------------------------------------------------===//
+// SSE pattern fragments
+//===----------------------------------------------------------------------===//
+
+def X86loadpf32  : PatFrag<(ops node:$ptr), (f32   (X86loadp node:$ptr))>;
+def X86loadpf64  : PatFrag<(ops node:$ptr), (f64   (X86loadp node:$ptr))>;
+
+def loadv4f32    : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64    : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv4i32    : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
+def loadv2i64    : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+
+def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
+def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
+def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
+def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
+def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
+def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
+
+def fp32imm0 : PatLeaf<(f32 fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+def PSxLDQ_imm  : SDNodeXForm<imm, [{
+  // Transformation function: imm >> 3
+  return getI32Imm(N->getValue() >> 3);
+}]>;
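+// (The >> 3 above presumably converts a bit count into the byte count that
+// PSLLDQ / PSRLDQ expect.)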
+
+// SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to PSHUF*,
+// SHUFP* etc. imm.
+def SHUFFLE_get_shuf_imm : SDNodeXForm<build_vector, [{
+  return getI8Imm(X86::getShuffleSHUFImmediate(N));
+}]>;
+
+// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to 
+// PSHUFHW imm.
+def SHUFFLE_get_pshufhw_imm : SDNodeXForm<build_vector, [{
+  return getI8Imm(X86::getShufflePSHUFHWImmediate(N));
+}]>;
+
+// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to 
+// PSHUFLW imm.
+def SHUFFLE_get_pshuflw_imm : SDNodeXForm<build_vector, [{
+  return getI8Imm(X86::getShufflePSHUFLWImmediate(N));
+}]>;
+
+def SSE_splat_mask : PatLeaf<(build_vector), [{
+  return X86::isSplatMask(N);
+}], SHUFFLE_get_shuf_imm>;
+
+def SSE_splat_lo_mask : PatLeaf<(build_vector), [{
+  return X86::isSplatLoMask(N);
+}]>;
+
+def MOVHLPS_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isMOVHLPSMask(N);
+}]>;
+
+def MOVHLPS_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isMOVHLPS_v_undef_Mask(N);
+}]>;
+
+def MOVHP_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isMOVHPMask(N);
+}]>;
+
+def MOVLP_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isMOVLPMask(N);
+}]>;
+
+def MOVL_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isMOVLMask(N);
+}]>;
+
+def MOVSHDUP_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isMOVSHDUPMask(N);
+}]>;
+
+def MOVSLDUP_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isMOVSLDUPMask(N);
+}]>;
+
+def UNPCKL_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKLMask(N);
+}]>;
+
+def UNPCKH_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKHMask(N);
+}]>;
+
+def UNPCKL_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKL_v_undef_Mask(N);
+}]>;
+
+def UNPCKH_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKH_v_undef_Mask(N);
+}]>;
+
+def PSHUFD_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isPSHUFDMask(N);
+}], SHUFFLE_get_shuf_imm>;
+
+def PSHUFHW_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isPSHUFHWMask(N);
+}], SHUFFLE_get_pshufhw_imm>;
+
+def PSHUFLW_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isPSHUFLWMask(N);
+}], SHUFFLE_get_pshuflw_imm>;
+
+def SHUFP_unary_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isPSHUFDMask(N);
+}], SHUFFLE_get_shuf_imm>;
+
+def SHUFP_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isSHUFPMask(N);
+}], SHUFFLE_get_shuf_imm>;
+
+def PSHUFD_binary_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isSHUFPMask(N);
+}], SHUFFLE_get_shuf_imm>;
+
+//===----------------------------------------------------------------------===//
+// SSE scalar FP Instructions
+//===----------------------------------------------------------------------===//
+
+// CMOV* - Used to implement the SSE SELECT DAG operation.  Expanded by the
+// scheduler into a branch sequence.
+let usesCustomDAGSchedInserter = 1 in {  // Expanded by the scheduler.
+  def CMOV_FR32 : I<0, Pseudo,
+                    (ops FR32:$dst, FR32:$t, FR32:$f, i8imm:$cond),
+                    "#CMOV_FR32 PSEUDO!",
+                    [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond))]>;
+  def CMOV_FR64 : I<0, Pseudo,
+                    (ops FR64:$dst, FR64:$t, FR64:$f, i8imm:$cond),
+                    "#CMOV_FR64 PSEUDO!",
+                    [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond))]>;
+  def CMOV_V4F32 : I<0, Pseudo,
+                    (ops VR128:$dst, VR128:$t, VR128:$f, i8imm:$cond),
+                    "#CMOV_V4F32 PSEUDO!",
+                    [(set VR128:$dst,
+                      (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond)))]>;
+  def CMOV_V2F64 : I<0, Pseudo,
+                    (ops VR128:$dst, VR128:$t, VR128:$f, i8imm:$cond),
+                    "#CMOV_V2F64 PSEUDO!",
+                    [(set VR128:$dst,
+                      (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond)))]>;
+  def CMOV_V2I64 : I<0, Pseudo,
+                    (ops VR128:$dst, VR128:$t, VR128:$f, i8imm:$cond),
+                    "#CMOV_V2I64 PSEUDO!",
+                    [(set VR128:$dst,
+                      (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond)))]>;
+}
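+
+// Roughly, the custom inserter rewrites each of these pseudos into a small
+// diamond of basic blocks: test EFLAGS for $cond, conditionally branch around
+// a block that picks the false value, and merge $t / $f with a PHI node.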
+
+//===----------------------------------------------------------------------===//
+// SSE1 Instructions
+//===----------------------------------------------------------------------===//
+
+// SSE1 Instruction Templates:
+// 
+//   SSI   - SSE1 instructions with XS prefix.
+//   PSI   - SSE1 instructions with TB prefix.
+//   PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix.
+
+class SSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, XS, Requires<[HasSSE1]>;
+class PSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>;
+class PSIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : Ii8<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>;
+
+// Move Instructions
+def MOVSSrr : SSI<0x10, MRMSrcReg, (ops FR32:$dst, FR32:$src),
+                  "movss {$src, $dst|$dst, $src}", []>;
+def MOVSSrm : SSI<0x10, MRMSrcMem, (ops FR32:$dst, f32mem:$src),
+                  "movss {$src, $dst|$dst, $src}",
+                  [(set FR32:$dst, (loadf32 addr:$src))]>;
+def MOVSSmr : SSI<0x11, MRMDestMem, (ops f32mem:$dst, FR32:$src),
+                  "movss {$src, $dst|$dst, $src}",
+                  [(store FR32:$src, addr:$dst)]>;
+
+// Conversion instructions
+def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (ops GR32:$dst, FR32:$src),
+                      "cvttss2si {$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (fp_to_sint FR32:$src))]>;
+def CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (ops GR32:$dst, f32mem:$src),
+                      "cvttss2si {$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
+def CVTSI2SSrr  : SSI<0x2A, MRMSrcReg, (ops FR32:$dst, GR32:$src),
+                      "cvtsi2ss {$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (sint_to_fp GR32:$src))]>;
+def CVTSI2SSrm  : SSI<0x2A, MRMSrcMem, (ops FR32:$dst, i32mem:$src),
+                      "cvtsi2ss {$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
+
+// Match intrinsics which expect XMM operand(s).
+def Int_CVTSS2SIrr : SSI<0x2D, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+                         "cvtss2si {$src, $dst|$dst, $src}",
+                         [(set GR32:$dst, (int_x86_sse_cvtss2si VR128:$src))]>;
+def Int_CVTSS2SIrm : SSI<0x2D, MRMSrcMem, (ops GR32:$dst, f32mem:$src),
+                         "cvtss2si {$src, $dst|$dst, $src}",
+                         [(set GR32:$dst, (int_x86_sse_cvtss2si
+                                           (load addr:$src)))]>;
+
+// Aliases for intrinsics
+def Int_CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+                          "cvttss2si {$src, $dst|$dst, $src}",
+                          [(set GR32:$dst,
+                            (int_x86_sse_cvttss2si VR128:$src))]>;
+def Int_CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (ops GR32:$dst, f32mem:$src),
+                          "cvttss2si {$src, $dst|$dst, $src}",
+                          [(set GR32:$dst,
+                            (int_x86_sse_cvttss2si (load addr:$src)))]>;
+
+let isTwoAddress = 1 in {
+  def Int_CVTSI2SSrr : SSI<0x2A, MRMSrcReg,
+                           (ops VR128:$dst, VR128:$src1, GR32:$src2),
+                           "cvtsi2ss {$src2, $dst|$dst, $src2}",
+                           [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
+                                              GR32:$src2))]>;
+  def Int_CVTSI2SSrm : SSI<0x2A, MRMSrcMem,
+                           (ops VR128:$dst, VR128:$src1, i32mem:$src2),
+                           "cvtsi2ss {$src2, $dst|$dst, $src2}",
+                           [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
+                                              (loadi32 addr:$src2)))]>;
+}
+
+// Comparison instructions
+let isTwoAddress = 1 in {
+  def CMPSSrr : SSI<0xC2, MRMSrcReg, 
+                    (ops FR32:$dst, FR32:$src1, FR32:$src, SSECC:$cc),
+                    "cmp${cc}ss {$src, $dst|$dst, $src}",
+                    []>;
+  def CMPSSrm : SSI<0xC2, MRMSrcMem, 
+                    (ops FR32:$dst, FR32:$src1, f32mem:$src, SSECC:$cc),
+                    "cmp${cc}ss {$src, $dst|$dst, $src}", []>;
+}
+
+def UCOMISSrr: PSI<0x2E, MRMSrcReg, (ops FR32:$src1, FR32:$src2),
+                   "ucomiss {$src2, $src1|$src1, $src2}",
+                   [(X86cmp FR32:$src1, FR32:$src2)]>;
+def UCOMISSrm: PSI<0x2E, MRMSrcMem, (ops FR32:$src1, f32mem:$src2),
+                   "ucomiss {$src2, $src1|$src1, $src2}",
+                   [(X86cmp FR32:$src1, (loadf32 addr:$src2))]>;
+
+// Aliases to match intrinsics which expect XMM operand(s).
+let isTwoAddress = 1 in {
+  def Int_CMPSSrr : SSI<0xC2, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc),
+                        "cmp${cc}ss {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
+                                           VR128:$src, imm:$cc))]>;
+  def Int_CMPSSrm : SSI<0xC2, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, f32mem:$src, SSECC:$cc),
+                        "cmp${cc}ss {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
+                                           (load addr:$src), imm:$cc))]>;
+}
+
+def Int_UCOMISSrr: PSI<0x2E, MRMSrcReg, (ops VR128:$src1, VR128:$src2),
+                       "ucomiss {$src2, $src1|$src1, $src2}",
+                       [(X86ucomi (v4f32 VR128:$src1), VR128:$src2)]>;
+def Int_UCOMISSrm: PSI<0x2E, MRMSrcMem, (ops VR128:$src1, f128mem:$src2),
+                       "ucomiss {$src2, $src1|$src1, $src2}",
+                       [(X86ucomi (v4f32 VR128:$src1), (load addr:$src2))]>;
+
+def Int_COMISSrr: PSI<0x2F, MRMSrcReg, (ops VR128:$src1, VR128:$src2),
+                      "comiss {$src2, $src1|$src1, $src2}",
+                      [(X86comi (v4f32 VR128:$src1), VR128:$src2)]>;
+def Int_COMISSrm: PSI<0x2F, MRMSrcMem, (ops VR128:$src1, f128mem:$src2),
+                      "comiss {$src2, $src1|$src1, $src2}",
+                      [(X86comi (v4f32 VR128:$src1), (load addr:$src2))]>;
+
+// Aliases of packed SSE1 instructions for scalar use. These all have names that
+// start with 'Fs'.
+
+// Alias instructions that map fld0 to pxor for sse.
+def FsFLD0SS : I<0xEF, MRMInitReg, (ops FR32:$dst),
+                 "pxor $dst, $dst", [(set FR32:$dst, fp32imm0)]>,
+               Requires<[HasSSE1]>, TB, OpSize;
+
+// Alias instruction to do FR32 reg-to-reg copy using movaps. Upper bits are
+// disregarded.
+def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (ops FR32:$dst, FR32:$src),
+                     "movaps {$src, $dst|$dst, $src}", []>;
+
+// Alias instruction to load FR32 from f128mem using movaps. Upper bits are
+// disregarded.
+def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (ops FR32:$dst, f128mem:$src),
+                     "movaps {$src, $dst|$dst, $src}",
+                     [(set FR32:$dst, (X86loadpf32 addr:$src))]>;
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+let isTwoAddress = 1 in {
+let isCommutable = 1 in {
+  def FsANDPSrr : PSI<0x54, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+                      "andps {$src2, $dst|$dst, $src2}",
+                      [(set FR32:$dst, (X86fand FR32:$src1, FR32:$src2))]>;
+  def FsORPSrr  : PSI<0x56, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+                      "orps {$src2, $dst|$dst, $src2}",
+                      [(set FR32:$dst, (X86for FR32:$src1, FR32:$src2))]>;
+  def FsXORPSrr : PSI<0x57, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+                      "xorps {$src2, $dst|$dst, $src2}",
+                      [(set FR32:$dst, (X86fxor FR32:$src1, FR32:$src2))]>;
+}
+
+def FsANDPSrm : PSI<0x54, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2),
+                    "andps {$src2, $dst|$dst, $src2}",
+                    [(set FR32:$dst, (X86fand FR32:$src1,
+                                      (X86loadpf32 addr:$src2)))]>;
+def FsORPSrm  : PSI<0x56, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2),
+                    "orps {$src2, $dst|$dst, $src2}",
+                    [(set FR32:$dst, (X86for FR32:$src1,
+                                      (X86loadpf32 addr:$src2)))]>;
+def FsXORPSrm : PSI<0x57, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2),
+                    "xorps {$src2, $dst|$dst, $src2}",
+                    [(set FR32:$dst, (X86fxor FR32:$src1,
+                                      (X86loadpf32 addr:$src2)))]>;
+
+def FsANDNPSrr : PSI<0x55, MRMSrcReg,
+                     (ops FR32:$dst, FR32:$src1, FR32:$src2),
+                     "andnps {$src2, $dst|$dst, $src2}", []>;
+def FsANDNPSrm : PSI<0x55, MRMSrcMem,
+                     (ops FR32:$dst, FR32:$src1, f128mem:$src2),
+                     "andnps {$src2, $dst|$dst, $src2}", []>;
+}
+
+/// basic_sse1_fp_binop_rm - SSE1 binops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements undefined.
+///
+/// These three forms can each be reg+reg or reg+mem, so there are a total of
+/// six "instructions".
+///
+let isTwoAddress = 1 in {
+multiclass basic_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, Intrinsic F32Int,
+                                  bit Commutable = 0> {
+  // Scalar operation, reg+reg.
+  def SSrr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+                 !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                 [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, reg+mem.
+  def SSrm : SSI<opc, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f32mem:$src2),
+                 !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                 [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
+                 
+  // Vector operation, reg+reg.
+  def PSrr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, reg+mem.
+  def PSrm : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+                 [(set VR128:$dst, (OpNode VR128:$src1, (loadv4f32 addr:$src2)))]>;
+
+  // Intrinsic operation, reg+reg.
+  def SSrr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, reg+mem.
+  def SSrm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, ssmem:$src2),
+                     !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F32Int VR128:$src1,
+                                               sse_load_f32:$src2))]>;
+}
+}
+
+// Arithmetic instructions
+defm ADD : basic_sse1_fp_binop_rm<0x58, "add", fadd, int_x86_sse_add_ss, 1>;
+defm MUL : basic_sse1_fp_binop_rm<0x59, "mul", fmul, int_x86_sse_mul_ss, 1>;
+defm SUB : basic_sse1_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse_sub_ss>;
+defm DIV : basic_sse1_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse_div_ss>;
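+
+// For example, the ADD defm above expands into the six instructions ADDSSrr,
+// ADDSSrm, ADDPSrr, ADDPSrm, ADDSSrr_Int and ADDSSrm_Int (each def name in the
+// multiclass is prefixed with the defm name).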
+
+/// sse1_fp_binop_rm - Other SSE1 binops
+///
+/// This multiclass is like basic_sse1_fp_binop_rm, with the addition of
+/// instructions for a full-vector intrinsic form.  Operations that map
+/// onto C operators don't use this form since they just use the plain
+/// vector form instead of having a separate vector intrinsic form.
+///
+/// This provides a total of eight "instructions".
+///
+let isTwoAddress = 1 in {
+multiclass sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                            SDNode OpNode,
+                            Intrinsic F32Int,
+                            Intrinsic V4F32Int,
+                            bit Commutable = 0> {
+
+  // Scalar operation, reg+reg.
+  def SSrr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+                 !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                 [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, reg+mem.
+  def SSrm : SSI<opc, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f32mem:$src2),
+                 !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                 [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
+                 
+  // Vector operation, reg+reg.
+  def PSrr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, reg+mem.
+  def PSrm : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+                 [(set VR128:$dst, (OpNode VR128:$src1, (loadv4f32 addr:$src2)))]>;
+
+  // Intrinsic operation, reg+reg.
+  def SSrr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, reg+mem.
+  def SSrm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, ssmem:$src2),
+                     !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F32Int VR128:$src1,
+                                               sse_load_f32:$src2))]>;
+
+  // Vector intrinsic operation, reg+reg.
+  def PSrr_Int : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (V4F32Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector intrinsic operation, reg+mem.
+  def PSrm_Int : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f32mem:$src2),
+                     !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (V4F32Int VR128:$src1, (load addr:$src2)))]>;
+}
+}
+
+defm MAX : sse1_fp_binop_rm<0x5F, "max", X86fmax,
+                            int_x86_sse_max_ss, int_x86_sse_max_ps>;
+defm MIN : sse1_fp_binop_rm<0x5D, "min", X86fmin,
+                            int_x86_sse_min_ss, int_x86_sse_min_ps>;
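+
+// These defms likewise yield eight defs apiece, e.g. MAXSSrr, MAXSSrm,
+// MAXPSrr, MAXPSrm, MAXSSrr_Int, MAXSSrm_Int, MAXPSrr_Int and MAXPSrm_Int
+// for MAX.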
+
+//===----------------------------------------------------------------------===//
+// SSE packed FP Instructions
+
+// Move Instructions
+def MOVAPSrr : PSI<0x28, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                   "movaps {$src, $dst|$dst, $src}", []>;
+def MOVAPSrm : PSI<0x28, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                   "movaps {$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (loadv4f32 addr:$src))]>;
+
+def MOVAPSmr : PSI<0x29, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+                   "movaps {$src, $dst|$dst, $src}",
+                   [(store (v4f32 VR128:$src), addr:$dst)]>;
+
+def MOVUPSrr : PSI<0x10, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                   "movups {$src, $dst|$dst, $src}", []>;
+def MOVUPSrm : PSI<0x10, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                   "movups {$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>;
+def MOVUPSmr : PSI<0x11, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+                   "movups {$src, $dst|$dst, $src}",
+                   [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>;
+
+let isTwoAddress = 1 in {
+  let AddedComplexity = 20 in {
+    def MOVLPSrm : PSI<0x12, MRMSrcMem,
+                       (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+                       "movlps {$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst, 
+                         (v4f32 (vector_shuffle VR128:$src1,
+                         (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))),
+                                 MOVLP_shuffle_mask)))]>;
+    def MOVHPSrm : PSI<0x16, MRMSrcMem,
+                       (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+                       "movhps {$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst, 
+                         (v4f32 (vector_shuffle VR128:$src1,
+                         (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))),
+                                 MOVHP_shuffle_mask)))]>;
+  } // AddedComplexity
+} // isTwoAddress
+
+def MOVLPSmr : PSI<0x13, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+                   "movlps {$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+                                 (iPTR 0))), addr:$dst)]>;
+
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0 so the non-store version isn't too horrible.
+def MOVHPSmr : PSI<0x17, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+                   "movhps {$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract
+                                 (v2f64 (vector_shuffle
+                                         (bc_v2f64 (v4f32 VR128:$src)), (undef),
+                                         UNPCKH_shuffle_mask)), (iPTR 0))),
+                     addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let AddedComplexity = 15 in {
+def MOVLHPSrr : PSI<0x16, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                    "movlhps {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
+                              MOVHP_shuffle_mask)))]>;
+
+def MOVHLPSrr : PSI<0x12, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                    "movhlps {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
+                              MOVHLPS_shuffle_mask)))]>;
+} // AddedComplexity
+} // isTwoAddress
+
+
+
+// Arithmetic
+
+/// sse1_fp_unop_rm - SSE1 unops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// In addition, there is a variant for the full-vector intrinsic form.
+///
+/// These four forms can each have a reg or a mem operand, so there are a
+/// total of eight "instructions".
+///
+multiclass sse1_fp_unop_rm<bits<8> opc, string OpcodeStr,
+                           SDNode OpNode,
+                           Intrinsic F32Int,
+                           Intrinsic V4F32Int,
+                           bit Commutable = 0> {
+  // Scalar operation, reg.
+  def SSr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src),
+                !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode FR32:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, mem.
+  def SSm : SSI<opc, MRMSrcMem, (ops FR32:$dst, f32mem:$src),
+                !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode (load addr:$src)))]>;
+                 
+  // Vector operation, reg.
+  def PSr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+              !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"),
+              [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, mem.
+  def PSm : PSI<opc, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>;
+
+  // Intrinsic operation, reg.
+  def SSr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                    !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F32Int VR128:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, mem.
+  def SSm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, ssmem:$src),
+                    !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F32Int sse_load_f32:$src))]>;
+
+  // Vector intrinsic operation, reg
+  def PSr_Int : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                    !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V4F32Int VR128:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector intrinsic operation, mem
+  def PSm_Int : PSI<opc, MRMSrcMem, (ops VR128:$dst, f32mem:$src),
+                    !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V4F32Int (load addr:$src)))]>;
+}
+
+// Square root.
+defm SQRT  : sse1_fp_unop_rm<0x51, "sqrt",  fsqrt,
+                             int_x86_sse_sqrt_ss, int_x86_sse_sqrt_ps>;
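+
+// As with the binop multiclasses, this expands into SQRTSSr, SQRTSSm, SQRTPSr,
+// SQRTPSm and the corresponding four *_Int intrinsic forms.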
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+defm RSQRT : sse1_fp_unop_rm<0x52, "rsqrt", X86frsqrt,
+                             int_x86_sse_rsqrt_ss, int_x86_sse_rsqrt_ps>;
+defm RCP   : sse1_fp_unop_rm<0x53, "rcp",   X86frcp,
+                             int_x86_sse_rcp_ss, int_x86_sse_rcp_ps>;
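+
+// The usual refinement is a Newton-Raphson step on the approximation x0, e.g.
+// x1 = x0 * (2 - a * x0) for rcp and x1 = x0 * (1.5 - 0.5 * a * x0 * x0) for
+// rsqrt, where a is the original operand.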
+
+// Logical
+let isTwoAddress = 1 in {
+  let isCommutable = 1 in {
+    def ANDPSrr : PSI<0x54, MRMSrcReg,
+                      (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                      "andps {$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst, (v2i64
+                                         (and VR128:$src1, VR128:$src2)))]>;
+    def ORPSrr  : PSI<0x56, MRMSrcReg,
+                      (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                      "orps {$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst, (v2i64
+                                         (or VR128:$src1, VR128:$src2)))]>;
+    def XORPSrr : PSI<0x57, MRMSrcReg,
+                      (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                      "xorps {$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst, (v2i64
+                                         (xor VR128:$src1, VR128:$src2)))]>;
+  }
+
+  def ANDPSrm : PSI<0x54, MRMSrcMem,
+                    (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                    "andps {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (and VR128:$src1,
+                                       (bc_v2i64 (loadv4f32 addr:$src2))))]>;
+  def ORPSrm  : PSI<0x56, MRMSrcMem,
+                    (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                    "orps {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (or VR128:$src1,
+                                       (bc_v2i64 (loadv4f32 addr:$src2))))]>;
+  def XORPSrm : PSI<0x57, MRMSrcMem,
+                    (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                    "xorps {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (xor VR128:$src1,
+                                       (bc_v2i64 (loadv4f32 addr:$src2))))]>;
+  def ANDNPSrr : PSI<0x55, MRMSrcReg,
+                     (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     "andnps {$src2, $dst|$dst, $src2}",
+                     [(set VR128:$dst,
+                       (v2i64 (and (xor VR128:$src1,
+                                    (bc_v2i64 (v4i32 immAllOnesV))),
+                               VR128:$src2)))]>;
+  def ANDNPSrm : PSI<0x55, MRMSrcMem,
+                     (ops VR128:$dst, VR128:$src1,f128mem:$src2),
+                     "andnps {$src2, $dst|$dst, $src2}",
+                     [(set VR128:$dst,
+                       (v2i64 (and (xor VR128:$src1,
+                                    (bc_v2i64 (v4i32 immAllOnesV))),
+                               (bc_v2i64 (loadv4f32 addr:$src2)))))]>;
+}
+
+let isTwoAddress = 1 in {
+  def CMPPSrri : PSIi8<0xC2, MRMSrcReg, 
+                      (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc),
+                      "cmp${cc}ps {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+                                         VR128:$src, imm:$cc))]>;
+  def CMPPSrmi : PSIi8<0xC2, MRMSrcMem, 
+                      (ops VR128:$dst, VR128:$src1, f128mem:$src, SSECC:$cc),
+                      "cmp${cc}ps {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+                                         (load addr:$src), imm:$cc))]>;
+}
+
+// Shuffle and unpack instructions
+let isTwoAddress = 1 in {
+  let isConvertibleToThreeAddress = 1 in // Convert to pshufd
+    def SHUFPSrri : PSIi8<0xC6, MRMSrcReg, 
+                          (ops VR128:$dst, VR128:$src1,
+                           VR128:$src2, i32i8imm:$src3),
+                          "shufps {$src3, $src2, $dst|$dst, $src2, $src3}",
+                          [(set VR128:$dst,
+                            (v4f32 (vector_shuffle
+                                    VR128:$src1, VR128:$src2,
+                                    SHUFP_shuffle_mask:$src3)))]>;
+  def SHUFPSrmi : PSIi8<0xC6, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1,
+                         f128mem:$src2, i32i8imm:$src3),
+                        "shufps {$src3, $src2, $dst|$dst, $src2, $src3}",
+                        [(set VR128:$dst,
+                          (v4f32 (vector_shuffle
+                                  VR128:$src1, (load addr:$src2),
+                                  SHUFP_shuffle_mask:$src3)))]>;
+
+  let AddedComplexity = 10 in {
+    def UNPCKHPSrr : PSI<0x15, MRMSrcReg, 
+                         (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                         "unpckhps {$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v4f32 (vector_shuffle
+                                   VR128:$src1, VR128:$src2,
+                                   UNPCKH_shuffle_mask)))]>;
+    def UNPCKHPSrm : PSI<0x15, MRMSrcMem, 
+                         (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                         "unpckhps {$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v4f32 (vector_shuffle
+                                   VR128:$src1, (load addr:$src2),
+                                   UNPCKH_shuffle_mask)))]>;
+
+    def UNPCKLPSrr : PSI<0x14, MRMSrcReg, 
+                         (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                         "unpcklps {$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v4f32 (vector_shuffle
+                                   VR128:$src1, VR128:$src2,
+                                   UNPCKL_shuffle_mask)))]>;
+    def UNPCKLPSrm : PSI<0x14, MRMSrcMem, 
+                         (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                         "unpcklps {$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v4f32 (vector_shuffle
+                                   VR128:$src1, (load addr:$src2),
+                                   UNPCKL_shuffle_mask)))]>;
+  } // AddedComplexity
+} // isTwoAddress
+
+// Mask creation
+def MOVMSKPSrr : PSI<0x50, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+                     "movmskps {$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (int_x86_sse_movmsk_ps VR128:$src))]>;
+def MOVMSKPDrr : PSI<0x50, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+                     "movmskpd {$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (int_x86_sse2_movmsk_pd VR128:$src))]>;
+
+// Prefetching loads.
+// TODO: no intrinsics for these?
+def PREFETCHT0   : PSI<0x18, MRM1m, (ops i8mem:$src), "prefetcht0 $src", []>;
+def PREFETCHT1   : PSI<0x18, MRM2m, (ops i8mem:$src), "prefetcht1 $src", []>;
+def PREFETCHT2   : PSI<0x18, MRM3m, (ops i8mem:$src), "prefetcht2 $src", []>;
+def PREFETCHNTA  : PSI<0x18, MRM0m, (ops i8mem:$src), "prefetchnta $src", []>;
+
+// Non-temporal stores
+def MOVNTPSmr : PSI<0x2B, MRMDestMem, (ops i128mem:$dst, VR128:$src),
+                    "movntps {$src, $dst|$dst, $src}",
+                    [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>;
+
+// Load, store, and memory fence
+def SFENCE : PSI<0xAE, MRM7m, (ops), "sfence", [(int_x86_sse_sfence)]>;
+
+// MXCSR register
+def LDMXCSR : PSI<0xAE, MRM2m, (ops i32mem:$src),
+                  "ldmxcsr $src", [(int_x86_sse_ldmxcsr addr:$src)]>;
+def STMXCSR : PSI<0xAE, MRM3m, (ops i32mem:$dst),
+                  "stmxcsr $dst", [(int_x86_sse_stmxcsr addr:$dst)]>;
+
+// Alias instructions that map zero vector to pxor / xorp* for sse.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+let isReMaterializable = 1 in
+def V_SET0 : PSI<0x57, MRMInitReg, (ops VR128:$dst),
+                 "xorps $dst, $dst",
+                 [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+
+// FR32 to 128-bit vector conversion.
+def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (ops VR128:$dst, FR32:$src),
+                      "movss {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4f32 (scalar_to_vector FR32:$src)))]>;
+def MOVSS2PSrm : SSI<0x10, MRMSrcMem, (ops VR128:$dst, f32mem:$src),
+                     "movss {$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (v4f32 (scalar_to_vector (loadf32 addr:$src))))]>;
+
+// FIXME: may not be able to eliminate this movss with coalescing since the src
+// and dest register classes are different. We really want to write this pattern
+// like this:
+// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+//           (f32 FR32:$src)>;
+def MOVPS2SSrr : SSI<0x10, MRMSrcReg, (ops FR32:$dst, VR128:$src),
+                     "movss {$src, $dst|$dst, $src}",
+                     [(set FR32:$dst, (vector_extract (v4f32 VR128:$src),
+                                       (iPTR 0)))]>;
+def MOVPS2SSmr : SSI<0x11, MRMDestMem, (ops f32mem:$dst, VR128:$src),
+                     "movss {$src, $dst|$dst, $src}",
+                     [(store (f32 (vector_extract (v4f32 VR128:$src),
+                                   (iPTR 0))), addr:$dst)]>;
+
+
+// Move to lower bits of a VR128, leaving upper bits alone.
+// Three operand (but two address) aliases.
+let isTwoAddress = 1 in {
+  def MOVLSS2PSrr : SSI<0x10, MRMSrcReg,
+                        (ops VR128:$dst, VR128:$src1, FR32:$src2),
+                        "movss {$src2, $dst|$dst, $src2}", []>;
+
+  let AddedComplexity = 15 in
+    def MOVLPSrr : SSI<0x10, MRMSrcReg,
+                       (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                       "movss {$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst,
+                         (v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
+                                 MOVL_shuffle_mask)))]>;
+}
+
+// Move to the lower bits of a VR128, zeroing the upper bits.
+// Loading from memory automatically zeroes the upper bits.
+let AddedComplexity = 20 in
+def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (ops VR128:$dst, f32mem:$src),
+                      "movss {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV,
+                                 (v4f32 (scalar_to_vector (loadf32 addr:$src))),
+                                                MOVL_shuffle_mask)))]>;
+
+
+//===----------------------------------------------------------------------===//
+// SSE2 Instructions
+//===----------------------------------------------------------------------===//
+
+// SSE2 Instruction Templates:
+// 
+//   SDI   - SSE2 instructions with XD prefix.
+//   PDI   - SSE2 instructions with TB and OpSize prefixes.
+//   PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
+
+class SDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, XD, Requires<[HasSSE2]>;
+class PDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>;
+class PDIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : Ii8<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>;
+
+// Move Instructions
+def MOVSDrr : SDI<0x10, MRMSrcReg, (ops FR64:$dst, FR64:$src),
+                  "movsd {$src, $dst|$dst, $src}", []>;
+def MOVSDrm : SDI<0x10, MRMSrcMem, (ops FR64:$dst, f64mem:$src),
+                  "movsd {$src, $dst|$dst, $src}",
+                  [(set FR64:$dst, (loadf64 addr:$src))]>;
+def MOVSDmr : SDI<0x11, MRMDestMem, (ops f64mem:$dst, FR64:$src),
+                  "movsd {$src, $dst|$dst, $src}",
+                  [(store FR64:$src, addr:$dst)]>;
+
+// Conversion instructions
+def CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (ops GR32:$dst, FR64:$src),
+                      "cvttsd2si {$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (fp_to_sint FR64:$src))]>;
+def CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (ops GR32:$dst, f64mem:$src),
+                      "cvttsd2si {$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
+def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (ops FR32:$dst, FR64:$src),
+                      "cvtsd2ss {$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (fround FR64:$src))]>;
+def CVTSD2SSrm  : SDI<0x5A, MRMSrcMem, (ops FR32:$dst, f64mem:$src), 
+                      "cvtsd2ss {$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (fround (loadf64 addr:$src)))]>;
+def CVTSI2SDrr  : SDI<0x2A, MRMSrcReg, (ops FR64:$dst, GR32:$src),
+                      "cvtsi2sd {$src, $dst|$dst, $src}",
+                      [(set FR64:$dst, (sint_to_fp GR32:$src))]>;
+def CVTSI2SDrm  : SDI<0x2A, MRMSrcMem, (ops FR64:$dst, i32mem:$src),
+                      "cvtsi2sd {$src, $dst|$dst, $src}",
+                      [(set FR64:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
+
+// SSE2 instructions with XS prefix
+def CVTSS2SDrr : I<0x5A, MRMSrcReg, (ops FR64:$dst, FR32:$src),
+                   "cvtss2sd {$src, $dst|$dst, $src}",
+                   [(set FR64:$dst, (fextend FR32:$src))]>, XS,
+                 Requires<[HasSSE2]>;
+def CVTSS2SDrm : I<0x5A, MRMSrcMem, (ops FR64:$dst, f32mem:$src),
+                   "cvtss2sd {$src, $dst|$dst, $src}",
+                   [(set FR64:$dst, (extloadf32 addr:$src))]>, XS,
+                 Requires<[HasSSE2]>;
+
+// Match intrinsics which expect XMM operand(s).
+def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+                         "cvtsd2si {$src, $dst|$dst, $src}",
+                         [(set GR32:$dst, (int_x86_sse2_cvtsd2si VR128:$src))]>;
+def Int_CVTSD2SIrm : SDI<0x2D, MRMSrcMem, (ops GR32:$dst, f128mem:$src),
+                         "cvtsd2si {$src, $dst|$dst, $src}",
+                         [(set GR32:$dst, (int_x86_sse2_cvtsd2si
+                                           (load addr:$src)))]>;
+
+// Aliases for intrinsics
+def Int_CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+                          "cvttsd2si {$src, $dst|$dst, $src}",
+                          [(set GR32:$dst,
+                            (int_x86_sse2_cvttsd2si VR128:$src))]>;
+def Int_CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (ops GR32:$dst, f128mem:$src),
+                          "cvttsd2si {$src, $dst|$dst, $src}",
+                          [(set GR32:$dst, (int_x86_sse2_cvttsd2si
+                                            (load addr:$src)))]>;
+
+// Comparison instructions
+let isTwoAddress = 1 in {
+  def CMPSDrr : SDI<0xC2, MRMSrcReg, 
+                    (ops FR64:$dst, FR64:$src1, FR64:$src, SSECC:$cc),
+                    "cmp${cc}sd {$src, $dst|$dst, $src}", []>;
+  def CMPSDrm : SDI<0xC2, MRMSrcMem, 
+                    (ops FR64:$dst, FR64:$src1, f64mem:$src, SSECC:$cc),
+                    "cmp${cc}sd {$src, $dst|$dst, $src}", []>;
+}
+
+def UCOMISDrr: PDI<0x2E, MRMSrcReg, (ops FR64:$src1, FR64:$src2),
+                   "ucomisd {$src2, $src1|$src1, $src2}",
+                   [(X86cmp FR64:$src1, FR64:$src2)]>;
+def UCOMISDrm: PDI<0x2E, MRMSrcMem, (ops FR64:$src1, f64mem:$src2),
+                   "ucomisd {$src2, $src1|$src1, $src2}",
+                   [(X86cmp FR64:$src1, (loadf64 addr:$src2))]>;
+
+// Aliases to match intrinsics which expect XMM operand(s).
+let isTwoAddress = 1 in {
+  def Int_CMPSDrr : SDI<0xC2, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc),
+                        "cmp${cc}sd {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
+                                           VR128:$src, imm:$cc))]>;
+  def Int_CMPSDrm : SDI<0xC2, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, f64mem:$src, SSECC:$cc),
+                        "cmp${cc}sd {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
+                                           (load addr:$src), imm:$cc))]>;
+}
+
+def Int_UCOMISDrr: PDI<0x2E, MRMSrcReg, (ops VR128:$src1, VR128:$src2),
+                       "ucomisd {$src2, $src1|$src1, $src2}",
+                       [(X86ucomi (v2f64 VR128:$src1), (v2f64 VR128:$src2))]>;
+def Int_UCOMISDrm: PDI<0x2E, MRMSrcMem, (ops VR128:$src1, f128mem:$src2),
+                       "ucomisd {$src2, $src1|$src1, $src2}",
+                       [(X86ucomi (v2f64 VR128:$src1), (load addr:$src2))]>;
+
+def Int_COMISDrr: PDI<0x2F, MRMSrcReg, (ops VR128:$src1, VR128:$src2),
+                      "comisd {$src2, $src1|$src1, $src2}",
+                      [(X86comi (v2f64 VR128:$src1), (v2f64 VR128:$src2))]>;
+def Int_COMISDrm: PDI<0x2F, MRMSrcMem, (ops VR128:$src1, f128mem:$src2),
+                      "comisd {$src2, $src1|$src1, $src2}",
+                      [(X86comi (v2f64 VR128:$src1), (load addr:$src2))]>;
+
+// Aliases of packed SSE2 instructions for scalar use. These all have names that
+// start with 'Fs'.
+
+// Alias instructions that map fld0 to pxor for sse.
+def FsFLD0SD : I<0xEF, MRMInitReg, (ops FR64:$dst),
+                 "pxor $dst, $dst", [(set FR64:$dst, fpimm0)]>,
+               Requires<[HasSSE2]>, TB, OpSize;
+
+// Alias instruction to do FR64 reg-to-reg copy using movapd. Upper bits are
+// disregarded.
+def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (ops FR64:$dst, FR64:$src),
+                     "movapd {$src, $dst|$dst, $src}", []>;
+
+// Alias instruction to load FR64 from f128mem using movapd. Upper bits are
+// disregarded.
+def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (ops FR64:$dst, f128mem:$src),
+                     "movapd {$src, $dst|$dst, $src}",
+                     [(set FR64:$dst, (X86loadpf64 addr:$src))]>;
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+let isTwoAddress = 1 in {
+let isCommutable = 1 in {
+  def FsANDPDrr : PDI<0x54, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+                      "andpd {$src2, $dst|$dst, $src2}",
+                      [(set FR64:$dst, (X86fand FR64:$src1, FR64:$src2))]>;
+  def FsORPDrr  : PDI<0x56, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+                      "orpd {$src2, $dst|$dst, $src2}",
+                      [(set FR64:$dst, (X86for FR64:$src1, FR64:$src2))]>;
+  def FsXORPDrr : PDI<0x57, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+                      "xorpd {$src2, $dst|$dst, $src2}",
+                      [(set FR64:$dst, (X86fxor FR64:$src1, FR64:$src2))]>;
+}
+
+def FsANDPDrm : PDI<0x54, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f128mem:$src2),
+                    "andpd {$src2, $dst|$dst, $src2}",
+                    [(set FR64:$dst, (X86fand FR64:$src1,
+                                      (X86loadpf64 addr:$src2)))]>;
+def FsORPDrm  : PDI<0x56, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f128mem:$src2),
+                    "orpd {$src2, $dst|$dst, $src2}",
+                    [(set FR64:$dst, (X86for FR64:$src1,
+                                      (X86loadpf64 addr:$src2)))]>;
+def FsXORPDrm : PDI<0x57, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f128mem:$src2),
+                    "xorpd {$src2, $dst|$dst, $src2}",
+                    [(set FR64:$dst, (X86fxor FR64:$src1,
+                                      (X86loadpf64 addr:$src2)))]>;
+
+def FsANDNPDrr : PDI<0x55, MRMSrcReg,
+                     (ops FR64:$dst, FR64:$src1, FR64:$src2),
+                     "andnpd {$src2, $dst|$dst, $src2}", []>;
+def FsANDNPDrm : PDI<0x55, MRMSrcMem,
+                     (ops FR64:$dst, FR64:$src1, f128mem:$src2),
+                     "andnpd {$src2, $dst|$dst, $src2}", []>;
+}
+
+/// basic_sse2_fp_binop_rm - SSE2 binops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements undefined.
+///
+/// These three forms can each be reg+reg or reg+mem, so there are a total of
+/// six "instructions".
+///
+let isTwoAddress = 1 in {
+multiclass basic_sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, Intrinsic F64Int,
+                                  bit Commutable = 0> {
+  // Scalar operation, reg+reg.
+  def SDrr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+                 !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                 [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, reg+mem.
+  def SDrm : SDI<opc, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f64mem:$src2),
+                 !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                 [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
+                 
+  // Vector operation, reg+reg.
+  def PDrr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, reg+mem.
+  def PDrm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+                 [(set VR128:$dst, (OpNode VR128:$src1, (loadv2f64 addr:$src2)))]>;
+
+  // Intrinsic operation, reg+reg.
+  def SDrr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, reg+mem.
+  def SDrm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, sdmem:$src2),
+                     !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F64Int VR128:$src1,
+                                               sse_load_f64:$src2))]>;
+}
+}
+
+// Arithmetic instructions
+defm ADD : basic_sse2_fp_binop_rm<0x58, "add", fadd, int_x86_sse2_add_sd, 1>;
+defm MUL : basic_sse2_fp_binop_rm<0x59, "mul", fmul, int_x86_sse2_mul_sd, 1>;
+defm SUB : basic_sse2_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse2_sub_sd>;
+defm DIV : basic_sse2_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse2_div_sd>;
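+
+// Illustrative note (not part of the original patch): with TableGen's defm
+// name prefixing, "defm ADD" above should expand to ADDSDrr, ADDSDrm,
+// ADDPDrr, ADDPDrm, ADDSDrr_Int and ADDSDrm_Int -- the six forms described
+// in the multiclass comment. The scalar reg+reg form, for example, behaves
+// as if written out by hand as:
+//   def ADDSDrr : SDI<0x58, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+//                     "addsd {$src2, $dst|$dst, $src2}",
+//                     [(set FR64:$dst, (fadd FR64:$src1, FR64:$src2))]>;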
+
+/// sse2_fp_binop_rm - Other SSE2 binops
+///
+/// This multiclass is like basic_sse2_fp_binop_rm, with the addition of
+/// instructions for a full-vector intrinsic form.  Operations that map
+/// onto C operators don't use this form since they just use the plain
+/// vector form instead of having a separate vector intrinsic form.
+///
+/// This provides a total of eight "instructions".
+///
+let isTwoAddress = 1 in {
+multiclass sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                            SDNode OpNode,
+                            Intrinsic F64Int,
+                            Intrinsic V2F64Int,
+                            bit Commutable = 0> {
+
+  // Scalar operation, reg+reg.
+  def SDrr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+                 !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                 [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, reg+mem.
+  def SDrm : SDI<opc, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f64mem:$src2),
+                 !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                 [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
+                 
+  // Vector operation, reg+reg.
+  def PDrr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, reg+mem.
+  def PDrm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+                 [(set VR128:$dst, (OpNode VR128:$src1, (loadv2f64 addr:$src2)))]>;
+
+  // Intrinsic operation, reg+reg.
+  def SDrr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, reg+mem.
+  def SDrm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, sdmem:$src2),
+                     !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F64Int VR128:$src1,
+                                               sse_load_f64:$src2))]>;
+
+  // Vector intrinsic operation, reg+reg.
+  def PDrr_Int : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (V2F64Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector intrinsic operation, reg+mem.
+  def PDrm_Int : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+                     !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (V2F64Int VR128:$src1, (load addr:$src2)))]>;
+}
+}
+
+defm MAX : sse2_fp_binop_rm<0x5F, "max", X86fmax,
+                            int_x86_sse2_max_sd, int_x86_sse2_max_pd>;
+defm MIN : sse2_fp_binop_rm<0x5D, "min", X86fmin,
+                            int_x86_sse2_min_sd, int_x86_sse2_min_pd>;
+
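+// Illustrative note (not part of the original patch): each defm above should
+// expand to eight defs, e.g. MAXSDrr, MAXSDrm, MAXPDrr, MAXPDrm, MAXSDrr_Int,
+// MAXSDrm_Int, MAXPDrr_Int and MAXPDrm_Int for "defm MAX", with the *_Int
+// forms matching the int_x86_sse2_max_sd / int_x86_sse2_max_pd intrinsics.
+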
+//===----------------------------------------------------------------------===//
+// SSE packed FP Instructions
+
+// Move Instructions
+def MOVAPDrr : PDI<0x28, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                   "movapd {$src, $dst|$dst, $src}", []>;
+def MOVAPDrm : PDI<0x28, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                   "movapd {$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (loadv2f64 addr:$src))]>;
+
+def MOVAPDmr : PDI<0x29, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+                   "movapd {$src, $dst|$dst, $src}",
+                   [(store (v2f64 VR128:$src), addr:$dst)]>;
+
+def MOVUPDrr : PDI<0x10, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                   "movupd {$src, $dst|$dst, $src}", []>;
+def MOVUPDrm : PDI<0x10, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                   "movupd {$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>;
+def MOVUPDmr : PDI<0x11, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+                   "movupd {$src, $dst|$dst, $src}",
+                   [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>;
+
+let isTwoAddress = 1 in {
+  let AddedComplexity = 20 in {
+    def MOVLPDrm : PDI<0x12, MRMSrcMem,
+                       (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+                       "movlpd {$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst, 
+                         (v2f64 (vector_shuffle VR128:$src1,
+                                 (scalar_to_vector (loadf64 addr:$src2)),
+                                 MOVLP_shuffle_mask)))]>;
+    def MOVHPDrm : PDI<0x16, MRMSrcMem,
+                       (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+                       "movhpd {$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst, 
+                         (v2f64 (vector_shuffle VR128:$src1,
+                                 (scalar_to_vector (loadf64 addr:$src2)),
+                                 MOVHP_shuffle_mask)))]>;
+  } // AddedComplexity
+} // isTwoAddress
+
+def MOVLPDmr : PDI<0x13, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+                   "movlpd {$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (v2f64 VR128:$src),
+                                 (iPTR 0))), addr:$dst)]>;
+
+// Extracting element 1 of a v2f64 is always custom lowered to an unpack of
+// high to low followed by an extract of element 0, so the non-store version
+// isn't too horrible.
+def MOVHPDmr : PDI<0x17, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+                   "movhpd {$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract
+                                 (v2f64 (vector_shuffle VR128:$src, (undef),
+                                         UNPCKH_shuffle_mask)), (iPTR 0))),
+                     addr:$dst)]>;
+
+// SSE2 instructions without OpSize prefix
+def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                       "cvtdq2ps {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>,
+                     TB, Requires<[HasSSE2]>;
+def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (ops VR128:$dst, i128mem:$src),
+                       "cvtdq2ps {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtdq2ps
+                                         (bitconvert (loadv2i64 addr:$src))))]>,
+                     TB, Requires<[HasSSE2]>;
+
+// SSE2 instructions with XS prefix
+def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                       "cvtdq2pd {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
+                     XS, Requires<[HasSSE2]>;
+def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+                       "cvtdq2pd {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
+                                          (bitconvert (loadv2i64 addr:$src))))]>,
+                     XS, Requires<[HasSSE2]>;
+
+def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                         "cvtps2dq {$src, $dst|$dst, $src}",
+                         [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>;
+def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                         "cvtps2dq {$src, $dst|$dst, $src}",
+                         [(set VR128:$dst, (int_x86_sse2_cvtps2dq
+                                            (load addr:$src)))]>;
+// SSE2 packed instructions with XS prefix
+def Int_CVTTPS2DQrr : I<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                        "cvttps2dq {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))]>,
+                      XS, Requires<[HasSSE2]>;
+def Int_CVTTPS2DQrm : I<0x5B, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                        "cvttps2dq {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+                                           (load addr:$src)))]>,
+                      XS, Requires<[HasSSE2]>;
+
+// SSE2 packed instructions with XD prefix
+def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                       "cvtpd2dq {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
+                     XD, Requires<[HasSSE2]>;
+def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                       "cvtpd2dq {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
+                                          (load addr:$src)))]>,
+                     XD, Requires<[HasSSE2]>;
+
+def Int_CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                          "cvttpd2dq {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>;
+def Int_CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                          "cvttpd2dq {$src, $dst|$dst, $src}",
+                          [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+                                             (load addr:$src)))]>;
+
+// SSE2 instructions without OpSize prefix
+def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                       "cvtps2pd {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
+                     TB, Requires<[HasSSE2]>;
+def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+                       "cvtps2pd {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd
+                                          (load addr:$src)))]>,
+                     TB, Requires<[HasSSE2]>;
+
+def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                         "cvtpd2ps {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
+def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                         "cvtpd2ps {$src, $dst|$dst, $src}",
+                         [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
+                                            (load addr:$src)))]>;
+
+// Aliases to match intrinsics which expect XMM operand(s).
+let isTwoAddress = 1 in {
+def Int_CVTSI2SDrr: SDI<0x2A, MRMSrcReg,
+                        (ops VR128:$dst, VR128:$src1, GR32:$src2),
+                        "cvtsi2sd {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
+                                           GR32:$src2))]>;
+def Int_CVTSI2SDrm: SDI<0x2A, MRMSrcMem,
+                        (ops VR128:$dst, VR128:$src1, i32mem:$src2),
+                        "cvtsi2sd {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
+                                           (loadi32 addr:$src2)))]>;
+def Int_CVTSD2SSrr: SDI<0x5A, MRMSrcReg,
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                   "cvtsd2ss {$src2, $dst|$dst, $src2}",
+                   [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
+                                      VR128:$src2))]>;
+def Int_CVTSD2SSrm: SDI<0x5A, MRMSrcMem,
+                        (ops VR128:$dst, VR128:$src1, f64mem:$src2), 
+                   "cvtsd2ss {$src2, $dst|$dst, $src2}",
+                   [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
+                                      (load addr:$src2)))]>;
+def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
+                      (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                    "cvtss2sd {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
+                                       VR128:$src2))]>, XS,
+                    Requires<[HasSSE2]>;
+def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
+                      (ops VR128:$dst, VR128:$src1, f32mem:$src2),
+                    "cvtss2sd {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
+                                       (load addr:$src2)))]>, XS,
+                    Requires<[HasSSE2]>;
+}
+
+// Arithmetic
+
+/// sse2_fp_unop_rm - SSE2 unops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// And, we have a special variant form for a full-vector intrinsic form.
+///
+/// These four forms can each have a reg or a mem operand, so there are a
+/// total of eight "instructions".
+///
+multiclass sse2_fp_unop_rm<bits<8> opc, string OpcodeStr,
+                           SDNode OpNode,
+                           Intrinsic F64Int,
+                           Intrinsic V2F64Int,
+                           bit Commutable = 0> {
+  // Scalar operation, reg.
+  def SDr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src),
+                !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"),
+                [(set FR64:$dst, (OpNode FR64:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, mem.
+  def SDm : SDI<opc, MRMSrcMem, (ops FR64:$dst, f64mem:$src),
+                !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"),
+                [(set FR64:$dst, (OpNode (load addr:$src)))]>;
+                 
+  // Vector operation, reg.
+  def PDr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+              !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"),
+              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, mem.
+  def PDm : PDI<opc, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>;
+
+  // Intrinsic operation, reg.
+  def SDr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                    !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F64Int VR128:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, mem.
+  def SDm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, sdmem:$src),
+                    !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F64Int sse_load_f64:$src))]>;
+
+  // Vector intrinsic operation, reg
+  def PDr_Int : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                    !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V2F64Int VR128:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector intrinsic operation, mem
+  def PDm_Int : PDI<opc, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+                    !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V2F64Int (load addr:$src)))]>;
+}
+
+// Square root.
+defm SQRT  : sse2_fp_unop_rm<0x51, "sqrt",  fsqrt,
+                             int_x86_sse2_sqrt_sd, int_x86_sse2_sqrt_pd>;
+
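+// Illustrative note (not part of the original patch): "defm SQRT" should
+// expand to SQRTSDr, SQRTSDm, SQRTPDr, SQRTPDm, SQRTSDr_Int, SQRTSDm_Int,
+// SQRTPDr_Int and SQRTPDm_Int. The plain scalar reg form, for example,
+// behaves as if written out by hand as:
+//   def SQRTSDr : SDI<0x51, MRMSrcReg, (ops FR64:$dst, FR64:$src),
+//                     "sqrtsd {$src, $dst|$dst, $src}",
+//                     [(set FR64:$dst, (fsqrt FR64:$src))]>;
+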
+// There are no f64 versions of the reciprocal approximation instructions.
+
+// Logical
+let isTwoAddress = 1 in {
+  let isCommutable = 1 in {
+    def ANDPDrr : PDI<0x54, MRMSrcReg,
+                      (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                      "andpd {$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst,
+                        (and (bc_v2i64 (v2f64 VR128:$src1)),
+                         (bc_v2i64 (v2f64 VR128:$src2))))]>;
+    def ORPDrr  : PDI<0x56, MRMSrcReg,
+                      (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                      "orpd {$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst,
+                        (or (bc_v2i64 (v2f64 VR128:$src1)),
+                         (bc_v2i64 (v2f64 VR128:$src2))))]>;
+    def XORPDrr : PDI<0x57, MRMSrcReg,
+                      (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                      "xorpd {$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst,
+                        (xor (bc_v2i64 (v2f64 VR128:$src1)),
+                         (bc_v2i64 (v2f64 VR128:$src2))))]>;
+  }
+
+  def ANDPDrm : PDI<0x54, MRMSrcMem,
+                    (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                    "andpd {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (and (bc_v2i64 (v2f64 VR128:$src1)),
+                       (bc_v2i64 (loadv2f64 addr:$src2))))]>;
+  def ORPDrm  : PDI<0x56, MRMSrcMem,
+                    (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                    "orpd {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (or (bc_v2i64 (v2f64 VR128:$src1)),
+                       (bc_v2i64 (loadv2f64 addr:$src2))))]>;
+  def XORPDrm : PDI<0x57, MRMSrcMem,
+                    (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                    "xorpd {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (xor (bc_v2i64 (v2f64 VR128:$src1)),
+                       (bc_v2i64 (loadv2f64 addr:$src2))))]>;
+  def ANDNPDrr : PDI<0x55, MRMSrcReg,
+                     (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     "andnpd {$src2, $dst|$dst, $src2}",
+                     [(set VR128:$dst,
+                       (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
+                        (bc_v2i64 (v2f64 VR128:$src2))))]>;
+  def ANDNPDrm : PDI<0x55, MRMSrcMem,
+                     (ops VR128:$dst, VR128:$src1,f128mem:$src2),
+                     "andnpd {$src2, $dst|$dst, $src2}",
+                     [(set VR128:$dst,
+                       (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
+                        (bc_v2i64 (loadv2f64 addr:$src2))))]>;
+}
+
+let isTwoAddress = 1 in {
+  def CMPPDrri : PDIi8<0xC2, MRMSrcReg, 
+                      (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc),
+                      "cmp${cc}pd {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
+                                         VR128:$src, imm:$cc))]>;
+  def CMPPDrmi : PDIi8<0xC2, MRMSrcMem, 
+                      (ops VR128:$dst, VR128:$src1, f128mem:$src, SSECC:$cc),
+                      "cmp${cc}pd {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
+                                         (load addr:$src), imm:$cc))]>;
+}
+
+// Shuffle and unpack instructions
+let isTwoAddress = 1 in {
+  def SHUFPDrri : PDIi8<0xC6, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2, i8imm:$src3),
+                        "shufpd {$src3, $src2, $dst|$dst, $src2, $src3}",
+                        [(set VR128:$dst, (v2f64 (vector_shuffle
+                                                  VR128:$src1, VR128:$src2,
+                                                  SHUFP_shuffle_mask:$src3)))]>;
+  def SHUFPDrmi : PDIi8<0xC6, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1,
+                         f128mem:$src2, i8imm:$src3),
+                        "shufpd {$src3, $src2, $dst|$dst, $src2, $src3}",
+                        [(set VR128:$dst,
+                          (v2f64 (vector_shuffle
+                                  VR128:$src1, (load addr:$src2),
+                                  SHUFP_shuffle_mask:$src3)))]>;
+
+  let AddedComplexity = 10 in {
+    def UNPCKHPDrr : PDI<0x15, MRMSrcReg, 
+                         (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                         "unpckhpd {$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v2f64 (vector_shuffle
+                                   VR128:$src1, VR128:$src2,
+                                   UNPCKH_shuffle_mask)))]>;
+    def UNPCKHPDrm : PDI<0x15, MRMSrcMem, 
+                         (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                         "unpckhpd {$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v2f64 (vector_shuffle
+                                   VR128:$src1, (load addr:$src2),
+                                   UNPCKH_shuffle_mask)))]>;
+
+    def UNPCKLPDrr : PDI<0x14, MRMSrcReg, 
+                         (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                         "unpcklpd {$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v2f64 (vector_shuffle
+                                   VR128:$src1, VR128:$src2,
+                                   UNPCKL_shuffle_mask)))]>;
+    def UNPCKLPDrm : PDI<0x14, MRMSrcMem, 
+                         (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                         "unpcklpd {$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v2f64 (vector_shuffle
+                                   VR128:$src1, (load addr:$src2),
+                                   UNPCKL_shuffle_mask)))]>;
+  } // AddedComplexity
+} // isTwoAddress
+
+
+//===----------------------------------------------------------------------===//
+// SSE integer instructions
+
+// Move Instructions
+def MOVDQArr : PDI<0x6F, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                   "movdqa {$src, $dst|$dst, $src}", []>;
+def MOVDQArm : PDI<0x6F, MRMSrcMem, (ops VR128:$dst, i128mem:$src),
+                   "movdqa {$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (loadv2i64 addr:$src))]>;
+def MOVDQAmr : PDI<0x7F, MRMDestMem, (ops i128mem:$dst, VR128:$src),
+                   "movdqa {$src, $dst|$dst, $src}",
+                   [(store (v2i64 VR128:$src), addr:$dst)]>;
+def MOVDQUrm :   I<0x6F, MRMSrcMem, (ops VR128:$dst, i128mem:$src),
+                   "movdqu {$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>,
+                 XS, Requires<[HasSSE2]>;
+def MOVDQUmr :   I<0x7F, MRMDestMem, (ops i128mem:$dst, VR128:$src),
+                   "movdqu {$src, $dst|$dst, $src}",
+                   [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>,
+                 XS, Requires<[HasSSE2]>;
+
+
+let isTwoAddress = 1 in {
+
+multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                            bit Commutable = 0> {
+  def rr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+  def rm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId VR128:$src1,
+                                        (bitconvert (loadv2i64 addr:$src2))))]>;
+}
+
+multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+                             string OpcodeStr, Intrinsic IntId> {
+  def rr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
+  def rm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId VR128:$src1,
+                                        (bitconvert (loadv2i64 addr:$src2))))]>;
+  def ri : PDIi8<opc2, ImmForm, (ops VR128:$dst, VR128:$src1, i32i8imm:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId VR128:$src1,
+                                        (scalar_to_vector (i32 imm:$src2))))]>;
+}
+
+
+/// PDI_binop_rm - Simple SSE2 binary operator.
+multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                        ValueType OpVT, bit Commutable = 0> {
+  def rr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+  def rm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (OpVT (OpNode VR128:$src1,
+                                       (bitconvert (loadv2i64 addr:$src2)))))]>;
+}
+
+/// PDI_binop_rm_v2i64 - Simple SSE2 binary operator whose type is v2i64.
+///
+/// FIXME: we could eliminate this and use PDI_binop_rm instead if tblgen knew
+/// to collapse (bitconvert VT to VT) into its operand.
+///
+multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                              bit Commutable = 0> {
+  def rr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+  def rm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (OpNode VR128:$src1,(loadv2i64 addr:$src2)))]>;
+}
+
+} // isTwoAddress
+
+// 128-bit Integer Arithmetic
+
+defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, 1>;
+defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, 1>;
+defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, 1>;
+defm PADDQ : PDI_binop_rm_v2i64<0xD4, "paddq", add, 1>;
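+
+// Illustrative note (not part of the original patch): PDI_binop_rm gives each
+// defm a reg+reg and a reg+mem form, so "defm PADDB" above should expand to
+// PADDBrr and PADDBrm matching (add v16i8, v16i8); the v2i64 variant exists
+// only because tblgen does not yet collapse (bitconvert VT to VT), as the
+// FIXME above notes.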
+
+defm PADDSB  : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, 1>;
+defm PADDSW  : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, 1>;
+defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, 1>;
+defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, 1>;
+
+defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8>;
+defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16>;
+defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32>;
+defm PSUBQ : PDI_binop_rm_v2i64<0xFB, "psubq", sub>;
+
+defm PSUBSB  : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b>;
+defm PSUBSW  : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w>;
+defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b>;
+defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w>;
+
+defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, 1>;
+
+defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, 1>;
+defm PMULHW  : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w , 1>;
+defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq, 1>;
+
+defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, 1>;
+
+defm PAVGB  : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, 1>;
+defm PAVGW  : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, 1>;
+
+
+defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, 1>;
+defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, 1>;
+defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, 1>;
+defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>;
+defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>;
+
+
+defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", int_x86_sse2_psll_w>;
+defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", int_x86_sse2_psll_d>;
+defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", int_x86_sse2_psll_q>;
+
+defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", int_x86_sse2_psrl_w>;
+defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", int_x86_sse2_psrl_d>;
+defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", int_x86_sse2_psrl_q>;
+
+defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", int_x86_sse2_psra_w>;
+defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", int_x86_sse2_psra_d>;
+// PSRAQ doesn't exist in SSE[1-3].
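+
+// Illustrative note (not part of the original patch): PDI_binop_rmi_int gives
+// each shift three forms, e.g. PSLLWrr (count in an XMM register), PSLLWrm
+// (count loaded from memory) and PSLLWri (immediate count, encoded with the
+// second opcode 0x71 and the MRM6r form passed to the multiclass).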
+
+// 128-bit logical shifts.
+let isTwoAddress = 1 in {
+  def PSLLDQri : PDIi8<0x73, MRM7r,
+                       (ops VR128:$dst, VR128:$src1, i32i8imm:$src2),
+                       "pslldq {$src2, $dst|$dst, $src2}", []>;
+  def PSRLDQri : PDIi8<0x73, MRM3r,
+                       (ops VR128:$dst, VR128:$src1, i32i8imm:$src2),
+                       "psrldq {$src2, $dst|$dst, $src2}", []>;
+  // PSRADQri doesn't exist in SSE[1-3].
+}
+
+let Predicates = [HasSSE2] in {
+  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
+            (v2i64 (PSLLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
+            (v2i64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
+            (v2f64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+}
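+
+// Illustrative note (not part of the original patch): the patterns above rely
+// on a PSxLDQ_imm transform defined elsewhere in this file; it is presumably
+// what converts the intrinsic's bit-count immediate into the byte-count
+// immediate that pslldq / psrldq actually take.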
+
+// Logical
+defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>;
+defm POR  : PDI_binop_rm_v2i64<0xEB, "por" , or , 1>;
+defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>;
+
+let isTwoAddress = 1 in {
+  def PANDNrr : PDI<0xDF, MRMSrcReg,
+                    (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                    "pandn {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
+                                              VR128:$src2)))]>;
+
+  def PANDNrm : PDI<0xDF, MRMSrcMem,
+                    (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                    "pandn {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
+                                              (load addr:$src2))))]>;
+}
+
+// SSE2 Integer comparison
+defm PCMPEQB  : PDI_binop_rm_int<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b>;
+defm PCMPEQW  : PDI_binop_rm_int<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w>;
+defm PCMPEQD  : PDI_binop_rm_int<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d>;
+defm PCMPGTB  : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>;
+defm PCMPGTW  : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>;
+defm PCMPGTD  : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>;
+
+// Pack instructions
+defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>;
+defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>;
+defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>;
+
+// Shuffle and unpack instructions
+def PSHUFDri : PDIi8<0x70, MRMSrcReg,
+                     (ops VR128:$dst, VR128:$src1, i8imm:$src2),
+                     "pshufd {$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set VR128:$dst, (v4i32 (vector_shuffle
+                                               VR128:$src1, (undef),
+                                               PSHUFD_shuffle_mask:$src2)))]>;
+def PSHUFDmi : PDIi8<0x70, MRMSrcMem,
+                     (ops VR128:$dst, i128mem:$src1, i8imm:$src2),
+                     "pshufd {$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set VR128:$dst, (v4i32 (vector_shuffle
+                                               (bc_v4i32(loadv2i64 addr:$src1)),
+                                               (undef),
+                                               PSHUFD_shuffle_mask:$src2)))]>;
+
+// SSE2 with ImmT == Imm8 and XS prefix.
+def PSHUFHWri : Ii8<0x70, MRMSrcReg,
+                    (ops VR128:$dst, VR128:$src1, i8imm:$src2),
+                    "pshufhw {$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst, (v8i16 (vector_shuffle
+                                              VR128:$src1, (undef),
+                                              PSHUFHW_shuffle_mask:$src2)))]>,
+                XS, Requires<[HasSSE2]>;
+def PSHUFHWmi : Ii8<0x70, MRMSrcMem,
+                    (ops VR128:$dst, i128mem:$src1, i8imm:$src2),
+                    "pshufhw {$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst, (v8i16 (vector_shuffle
+                                              (bc_v8i16 (loadv2i64 addr:$src1)),
+                                              (undef),
+                                              PSHUFHW_shuffle_mask:$src2)))]>,
+                XS, Requires<[HasSSE2]>;
+
+// SSE2 with ImmT == Imm8 and XD prefix.
+def PSHUFLWri : Ii8<0x70, MRMSrcReg,
+                    (ops VR128:$dst, VR128:$src1, i32i8imm:$src2),
+                    "pshuflw {$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst, (v8i16 (vector_shuffle
+                                              VR128:$src1, (undef),
+                                              PSHUFLW_shuffle_mask:$src2)))]>,
+                XD, Requires<[HasSSE2]>;
+def PSHUFLWmi : Ii8<0x70, MRMSrcMem,
+                    (ops VR128:$dst, i128mem:$src1, i32i8imm:$src2),
+                    "pshuflw {$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst, (v8i16 (vector_shuffle
+                                              (bc_v8i16 (loadv2i64 addr:$src1)),
+                                              (undef),
+                                              PSHUFLW_shuffle_mask:$src2)))]>,
+                XD, Requires<[HasSSE2]>;
+
+
+let isTwoAddress = 1 in {
+  def PUNPCKLBWrr : PDI<0x60, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                        "punpcklbw {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v16i8 (vector_shuffle VR128:$src1, VR128:$src2,
+                                  UNPCKL_shuffle_mask)))]>;
+  def PUNPCKLBWrm : PDI<0x60, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                        "punpcklbw {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v16i8 (vector_shuffle VR128:$src1,
+                                  (bc_v16i8 (loadv2i64 addr:$src2)),
+                                  UNPCKL_shuffle_mask)))]>;
+  def PUNPCKLWDrr : PDI<0x61, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                        "punpcklwd {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v8i16 (vector_shuffle VR128:$src1, VR128:$src2,
+                                  UNPCKL_shuffle_mask)))]>;
+  def PUNPCKLWDrm : PDI<0x61, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                        "punpcklwd {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v8i16 (vector_shuffle VR128:$src1,
+                                  (bc_v8i16 (loadv2i64 addr:$src2)),
+                                  UNPCKL_shuffle_mask)))]>;
+  def PUNPCKLDQrr : PDI<0x62, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                        "punpckldq {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+                                  UNPCKL_shuffle_mask)))]>;
+  def PUNPCKLDQrm : PDI<0x62, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                        "punpckldq {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v4i32 (vector_shuffle VR128:$src1,
+                                  (bc_v4i32 (loadv2i64 addr:$src2)),
+                                  UNPCKL_shuffle_mask)))]>;
+  def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg, 
+                         (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                         "punpcklqdq {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v2i64 (vector_shuffle VR128:$src1, VR128:$src2,
+                                  UNPCKL_shuffle_mask)))]>;
+  def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem, 
+                         (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                         "punpcklqdq {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v2i64 (vector_shuffle VR128:$src1,
+                                  (loadv2i64 addr:$src2),
+                                  UNPCKL_shuffle_mask)))]>;
+  
+  def PUNPCKHBWrr : PDI<0x68, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                        "punpckhbw {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v16i8 (vector_shuffle VR128:$src1, VR128:$src2,
+                                  UNPCKH_shuffle_mask)))]>;
+  def PUNPCKHBWrm : PDI<0x68, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                        "punpckhbw {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v16i8 (vector_shuffle VR128:$src1,
+                                  (bc_v16i8 (loadv2i64 addr:$src2)),
+                                  UNPCKH_shuffle_mask)))]>;
+  def PUNPCKHWDrr : PDI<0x69, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                        "punpckhwd {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v8i16 (vector_shuffle VR128:$src1, VR128:$src2,
+                                  UNPCKH_shuffle_mask)))]>;
+  def PUNPCKHWDrm : PDI<0x69, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                        "punpckhwd {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v8i16 (vector_shuffle VR128:$src1,
+                                  (bc_v8i16 (loadv2i64 addr:$src2)),
+                                  UNPCKH_shuffle_mask)))]>;
+  def PUNPCKHDQrr : PDI<0x6A, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                        "punpckhdq {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+                                  UNPCKH_shuffle_mask)))]>;
+  def PUNPCKHDQrm : PDI<0x6A, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                        "punpckhdq {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v4i32 (vector_shuffle VR128:$src1,
+                                  (bc_v4i32 (loadv2i64 addr:$src2)),
+                                  UNPCKH_shuffle_mask)))]>;
+  def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg, 
+                         (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                         "punpckhqdq {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v2i64 (vector_shuffle VR128:$src1, VR128:$src2,
+                                  UNPCKH_shuffle_mask)))]>;
+  def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                        "punpckhqdq {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v2i64 (vector_shuffle VR128:$src1,
+                                  (loadv2i64 addr:$src2),
+                                  UNPCKH_shuffle_mask)))]>;
+}
+
+// Extract / Insert
+def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
+                    (ops GR32:$dst, VR128:$src1, i32i8imm:$src2),
+                    "pextrw {$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
+                                     (iPTR imm:$src2)))]>;
+let isTwoAddress = 1 in {
+  def PINSRWrri : PDIi8<0xC4, MRMSrcReg,
+                       (ops VR128:$dst, VR128:$src1,
+                        GR32:$src2, i32i8imm:$src3),
+                       "pinsrw {$src3, $src2, $dst|$dst, $src2, $src3}",
+                       [(set VR128:$dst,
+                         (v8i16 (X86pinsrw (v8i16 VR128:$src1),
+                                 GR32:$src2, (iPTR imm:$src3))))]>;
+  def PINSRWrmi : PDIi8<0xC4, MRMSrcMem,
+                       (ops VR128:$dst, VR128:$src1,
+                        i16mem:$src2, i32i8imm:$src3),
+                       "pinsrw {$src3, $src2, $dst|$dst, $src2, $src3}",
+                       [(set VR128:$dst,
+                         (v8i16 (X86pinsrw (v8i16 VR128:$src1),
+                                 (i32 (anyext (loadi16 addr:$src2))),
+                                 (iPTR imm:$src3))))]>;
+}
+
+// Mask creation
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+                     "pmovmskb {$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>;
+
+// Conditional store
+def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (ops VR128:$src, VR128:$mask),
+                     "maskmovdqu {$mask, $src|$src, $mask}",
+                     [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
+                 Imp<[EDI],[]>;
+
+// Non-temporal stores
+def MOVNTPDmr : PDI<0x2B, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+                    "movntpd {$src, $dst|$dst, $src}",
+                    [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
+def MOVNTDQmr : PDI<0xE7, MRMDestMem, (ops i128mem:$dst, VR128:$src),
+                    "movntdq {$src, $dst|$dst, $src}",
+                    [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
+def MOVNTImr  :   I<0xC3, MRMDestMem, (ops i32mem:$dst, GR32:$src),
+                    "movnti {$src, $dst|$dst, $src}",
+                    [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>, 
+                  TB, Requires<[HasSSE2]>;
+
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (ops i8mem:$src),
+               "clflush $src", [(int_x86_sse2_clflush addr:$src)]>,
+              TB, Requires<[HasSSE2]>;
+
+// Load, store, and memory fence
+def LFENCE : I<0xAE, MRM5m, (ops),
+               "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM6m, (ops),
+               "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
+
+
+// Alias instruction that maps the all-ones vector to pcmpeqd for sse.
+// FIXME: remove when we can teach regalloc that pcmpeqd reg, reg is ok.
+let isReMaterializable = 1 in
+  def V_SETALLONES : PDI<0x76, MRMInitReg, (ops VR128:$dst),
+                         "pcmpeqd $dst, $dst",
+                         [(set VR128:$dst, (v2f64 immAllOnesV))]>;
+
+// FR64 to 128-bit vector conversion.
+def MOVSD2PDrr : SDI<0x10, MRMSrcReg, (ops VR128:$dst, FR64:$src),
+                      "movsd {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v2f64 (scalar_to_vector FR64:$src)))]>;
+def MOVSD2PDrm : SDI<0x10, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+                     "movsd {$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, 
+                       (v2f64 (scalar_to_vector (loadf64 addr:$src))))]>;
+
+def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (ops VR128:$dst, GR32:$src),
+                      "movd {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector GR32:$src)))]>;
+def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (ops VR128:$dst, i32mem:$src),
+                      "movd {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>;
+
+def MOVDI2SSrr  : PDI<0x6E, MRMSrcReg, (ops FR32:$dst, GR32:$src),
+                      "movd {$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (bitconvert GR32:$src))]>;
+
+def MOVDI2SSrm  : PDI<0x6E, MRMSrcMem, (ops FR32:$dst, i32mem:$src),
+                      "movd {$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>;
+
+// SSE2 instructions with XS prefix
+def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+                    "movq {$src, $dst|$dst, $src}",
+                    [(set VR128:$dst,
+                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+                  Requires<[HasSSE2]>;
+def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (ops i64mem:$dst, VR128:$src),
+                      "movq {$src, $dst|$dst, $src}",
+                      [(store (i64 (vector_extract (v2i64 VR128:$src),
+                                    (iPTR 0))), addr:$dst)]>;
+
+// FIXME: may not be able to eliminate this movsd with coalescing because the
+// src and dest register classes are different. We really want to write this
+// pattern like this:
+// def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+//           (f64 FR64:$src)>;
+def MOVPD2SDrr : SDI<0x10, MRMSrcReg, (ops FR64:$dst, VR128:$src),
+                     "movsd {$src, $dst|$dst, $src}",
+                     [(set FR64:$dst, (vector_extract (v2f64 VR128:$src),
+                                       (iPTR 0)))]>;
+def MOVPD2SDmr : SDI<0x11, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+                     "movsd {$src, $dst|$dst, $src}",
+                     [(store (f64 (vector_extract (v2f64 VR128:$src),
+                                   (iPTR 0))), addr:$dst)]>;
+def MOVPDI2DIrr  : PDI<0x7E, MRMDestReg, (ops GR32:$dst, VR128:$src),
+                       "movd {$src, $dst|$dst, $src}",
+                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
+                                        (iPTR 0)))]>;
+def MOVPDI2DImr  : PDI<0x7E, MRMDestMem, (ops i32mem:$dst, VR128:$src),
+                       "movd {$src, $dst|$dst, $src}",
+                       [(store (i32 (vector_extract (v4i32 VR128:$src),
+                                     (iPTR 0))), addr:$dst)]>;
+
+def MOVSS2DIrr  : PDI<0x7E, MRMDestReg, (ops GR32:$dst, FR32:$src),
+                      "movd {$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (bitconvert FR32:$src))]>;
+def MOVSS2DImr  : PDI<0x7E, MRMDestMem, (ops i32mem:$dst, FR32:$src),
+                      "movd {$src, $dst|$dst, $src}",
+                      [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>;
+
+
+// Move to the lower bits of a VR128, leaving the upper bits alone.
+// Three-operand (but two-address) aliases.
+let isTwoAddress = 1 in {
+  def MOVLSD2PDrr : SDI<0x10, MRMSrcReg,
+                        (ops VR128:$dst, VR128:$src1, FR64:$src2),
+                        "movsd {$src2, $dst|$dst, $src2}", []>;
+
+  let AddedComplexity = 15 in
+    def MOVLPDrr : SDI<0x10, MRMSrcReg,
+                       (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                       "movsd {$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst,
+                         (v2f64 (vector_shuffle VR128:$src1, VR128:$src2,
+                                 MOVL_shuffle_mask)))]>;
+}
+
+// Store / copy lower 64-bits of a XMM register.
+def MOVLQ128mr : PDI<0xD6, MRMDestMem, (ops i64mem:$dst, VR128:$src),
+                     "movq {$src, $dst|$dst, $src}",
+                     [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>;
+
+// Move to the lower bits of a VR128, zeroing the upper bits.
+// Loading from memory automatically zeroes the upper bits.
+let AddedComplexity = 20 in
+  def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+                        "movsd {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2f64 (vector_shuffle immAllZerosV,
+                                  (v2f64 (scalar_to_vector
+                                          (loadf64 addr:$src))),
+                                  MOVL_shuffle_mask)))]>;
+
+// movd / movq to an XMM register zero-extends.
+let AddedComplexity = 15 in
+def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (ops VR128:$dst, GR32:$src),
+                       "movd {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (v4i32 (vector_shuffle immAllZerosV,
+                                 (v4i32 (scalar_to_vector GR32:$src)),
+                                 MOVL_shuffle_mask)))]>;
+let AddedComplexity = 20 in
+def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (ops VR128:$dst, i32mem:$src),
+                       "movd {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (v4i32 (vector_shuffle immAllZerosV,
+                                 (v4i32 (scalar_to_vector (loadi32 addr:$src))),
+                                 MOVL_shuffle_mask)))]>;
+
+// Move from XMM to XMM, but still clear the upper 64 bits.
+let AddedComplexity = 15 in
+def MOVZQI2PQIrr : I<0x7E, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                     "movq {$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, (int_x86_sse2_movl_dq VR128:$src))]>,
+                   XS, Requires<[HasSSE2]>;
+let AddedComplexity = 20 in
+def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+                     "movq {$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, (int_x86_sse2_movl_dq
+                                        (bitconvert (loadv2i64 addr:$src))))]>,
+                   XS, Requires<[HasSSE2]>;
+
+
+//===----------------------------------------------------------------------===//
+// SSE3 Instructions
+//===----------------------------------------------------------------------===//
+
+// SSE3 Instruction Templates:
+// 
+//   S3I   - SSE3 instructions with TB and OpSize prefixes.
+//   S3SI  - SSE3 instructions with XS prefix.
+//   S3DI  - SSE3 instructions with XD prefix.
+
+class S3SI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, XS, Requires<[HasSSE3]>;
+class S3DI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, XD, Requires<[HasSSE3]>;
+class S3I<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE3]>;
+
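+// Illustrative note (not part of the original patch): the MOVSHDUP / MOVSLDUP
+// defs below use S3SI, so they pick up the XS prefix and the HasSSE3
+// predicate, while MOVDDUP uses S3DI (XD prefix) and ADDSUBPD uses S3I
+// (TB + OpSize).
+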
+// Move Instructions
+def MOVSHDUPrr : S3SI<0x16, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                      "movshdup {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (v4f32 (vector_shuffle
+                                                VR128:$src, (undef),
+                                                MOVSHDUP_shuffle_mask)))]>;
+def MOVSHDUPrm : S3SI<0x16, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                      "movshdup {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (v4f32 (vector_shuffle
+                                                (loadv4f32 addr:$src), (undef),
+                                                MOVSHDUP_shuffle_mask)))]>;
+
+def MOVSLDUPrr : S3SI<0x12, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                      "movsldup {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (v4f32 (vector_shuffle
+                                                VR128:$src, (undef),
+                                                MOVSLDUP_shuffle_mask)))]>;
+def MOVSLDUPrm : S3SI<0x12, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                      "movsldup {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (v4f32 (vector_shuffle
+                                                (loadv4f32 addr:$src), (undef),
+                                                MOVSLDUP_shuffle_mask)))]>;
+
+def MOVDDUPrr  : S3DI<0x12, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                      "movddup {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (v2f64 (vector_shuffle
+                                                VR128:$src, (undef),
+                                                SSE_splat_lo_mask)))]>;
+def MOVDDUPrm  : S3DI<0x12, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+                      "movddup {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v2f64 (vector_shuffle
+                                (scalar_to_vector (loadf64 addr:$src)),
+                                (undef),
+                                SSE_splat_lo_mask)))]>;
+
+// Arithmetic
+let isTwoAddress = 1 in {
+  def ADDSUBPSrr : S3DI<0xD0, MRMSrcReg,
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                        "addsubps {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
+                                           VR128:$src2))]>;
+  def ADDSUBPSrm : S3DI<0xD0, MRMSrcMem,
+                        (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                        "addsubps {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
+                                           (load addr:$src2)))]>;
+  def ADDSUBPDrr : S3I<0xD0, MRMSrcReg,
+                       (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                       "addsubpd {$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
+                                          VR128:$src2))]>;
+  def ADDSUBPDrm : S3I<0xD0, MRMSrcMem,
+                       (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                       "addsubpd {$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
+                                          (load addr:$src2)))]>;
+}
+
+def LDDQUrm : S3DI<0xF0, MRMSrcMem, (ops VR128:$dst, i128mem:$src),
+                   "lddqu {$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>;
+
+// Horizontal ops
+class S3D_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
+  : S3DI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+         !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+         [(set VR128:$dst, (v4f32 (IntId VR128:$src1, VR128:$src2)))]>;
+class S3D_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
+  : S3DI<o, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+         !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+         [(set VR128:$dst, (v4f32 (IntId VR128:$src1, (load addr:$src2))))]>;
+class S3_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
+  : S3I<o, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+        !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+        [(set VR128:$dst, (v2f64 (IntId VR128:$src1, VR128:$src2)))]>;
+class S3_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
+  : S3I<o, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+        !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+        [(set VR128:$dst, (v2f64 (IntId VR128:$src1, (load addr:$src2))))]>;
+
+let isTwoAddress = 1 in {
+  def HADDPSrr : S3D_Intrr<0x7C, "haddps", int_x86_sse3_hadd_ps>;
+  def HADDPSrm : S3D_Intrm<0x7C, "haddps", int_x86_sse3_hadd_ps>;
+  def HADDPDrr : S3_Intrr <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
+  def HADDPDrm : S3_Intrm <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
+  def HSUBPSrr : S3D_Intrr<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
+  def HSUBPSrm : S3D_Intrm<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
+  def HSUBPDrr : S3_Intrr <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
+  def HSUBPDrm : S3_Intrm <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
+}
+
+// Thread synchronization
+def MONITOR : I<0xC8, RawFrm, (ops), "monitor",
+                [(int_x86_sse3_monitor EAX, ECX, EDX)]>,TB, Requires<[HasSSE3]>;
+def MWAIT   : I<0xC9, RawFrm, (ops), "mwait",
+                [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
+
+// vector_shuffle v1, <undef> <1, 1, 3, 3>
+let AddedComplexity = 15 in
+def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
+                  MOVSHDUP_shuffle_mask)),
+          (MOVSHDUPrr VR128:$src)>, Requires<[HasSSE3]>;
+let AddedComplexity = 20 in
+def : Pat<(v4i32 (vector_shuffle (bc_v4i32 (loadv2i64 addr:$src)), (undef),
+                  MOVSHDUP_shuffle_mask)),
+          (MOVSHDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+// vector_shuffle v1, <undef> <0, 0, 2, 2>
+let AddedComplexity = 15 in
+  def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
+                    MOVSLDUP_shuffle_mask)),
+            (MOVSLDUPrr VR128:$src)>, Requires<[HasSSE3]>;
+let AddedComplexity = 20 in
+  def : Pat<(v4i32 (vector_shuffle (bc_v4i32 (loadv2i64 addr:$src)), (undef),
+                    MOVSLDUP_shuffle_mask)),
+            (MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+//===----------------------------------------------------------------------===//
+// SSSE3 Instructions
+//===----------------------------------------------------------------------===//
+
+// SSSE3 Instruction Templates:
+// 
+//   SS38I - SSSE3 instructions with T8 and OpSize prefixes.
+//   SS3AI - SSSE3 instructions with TA and OpSize prefixes.
+
+class SS38I<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, T8, OpSize, Requires<[HasSSSE3]>;
+class SS3AI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, TA, OpSize, Requires<[HasSSSE3]>;
+
+/// SS3I_binop_rm_int - Simple SSSE3 binary operator whose type is v2i64.
+let isTwoAddress = 1 in {
+  multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               bit Commutable = 0> {
+    def rr : SS38I<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                   !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                   [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]> {
+      let isCommutable = Commutable;
+    }
+    def rm : SS38I<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                   !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                   [(set VR128:$dst,
+                     (IntId VR128:$src1,
+                      (bitconvert (loadv2i64 addr:$src2))))]>;
+  }
+}
+
+defm PMULHRSW128 : SS3I_binop_rm_int<0x0B, "pmulhrsw",
+                                     int_x86_ssse3_pmulhrsw_128, 1>;
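+
+// For illustration: the defm above expands the multiclass with the given name
+// prefix, so it should produce the instructions PMULHRSW128rr and
+// PMULHRSW128rm, e.g. "pmulhrsw %xmm1, %xmm0" for the register form.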
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// 128-bit vector undefs.
+def : Pat<(v2f64 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
+def : Pat<(v16i8 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
+
+// 128-bit vector all zeros.
+def : Pat<(v16i8 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
+def : Pat<(v2f64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
+
+// 128-bit vector all ones.
+def : Pat<(v16i8 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
+def : Pat<(v4f32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE1]>;
+
+// Store 128-bit integer vector values.
+def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+          (MOVDQAmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+          (MOVDQAmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+          (MOVDQAmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+
+// Scalar to v8i16 / v16i8. The source may be a GR32, but only the lower 8 or
+// 16 bits matter.
+def : Pat<(v8i16 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>,
+      Requires<[HasSSE2]>;
+def : Pat<(v16i8 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>,
+      Requires<[HasSSE2]>;
+
+// bit_convert
+let Predicates = [HasSSE2] in {
+  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+}
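+
+// These bitconvert patterns simply retype the source register: since every
+// 128-bit vector type lives in VR128, a cast such as
+//   (v2i64 (bitconvert (v4i32 VR128:$src)))
+// selects to the same virtual register, so no code is generated for the cast.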
+
+// Move scalar to XMM, zero-extending the upper elements (a movd to an XMM
+// register zero-extends).
+let AddedComplexity = 15 in {
+def : Pat<(v8i16 (vector_shuffle immAllZerosV,
+                  (v8i16 (X86s2vec GR32:$src)), MOVL_shuffle_mask)),
+          (MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v16i8 (vector_shuffle immAllZerosV,
+                  (v16i8 (X86s2vec GR32:$src)), MOVL_shuffle_mask)),
+          (MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>;
+// Zero a VR128, then do a MOVS{S|D} into the lower bits.
+def : Pat<(v2f64 (vector_shuffle immAllZerosV,
+                  (v2f64 (scalar_to_vector FR64:$src)), MOVL_shuffle_mask)),
+          (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4f32 (vector_shuffle immAllZerosV,
+                  (v4f32 (scalar_to_vector FR32:$src)), MOVL_shuffle_mask)),
+          (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE2]>;
+}
+
+// Splat v2f64 / v2i64
+let AddedComplexity = 10 in {
+def : Pat<(vector_shuffle (v2f64 VR128:$src), (undef), SSE_splat_lo_mask:$sm),
+          (UNPCKLPDrr VR128:$src, VR128:$src)>,   Requires<[HasSSE2]>;
+def : Pat<(vector_shuffle (v2f64 VR128:$src), (undef), UNPCKH_shuffle_mask:$sm),
+          (UNPCKHPDrr VR128:$src, VR128:$src)>,   Requires<[HasSSE2]>;
+def : Pat<(vector_shuffle (v2i64 VR128:$src), (undef), SSE_splat_lo_mask:$sm),
+          (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(vector_shuffle (v2i64 VR128:$src), (undef), UNPCKH_shuffle_mask:$sm),
+          (PUNPCKHQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+}
+
+// Splat v4f32
+def : Pat<(vector_shuffle (v4f32 VR128:$src), (undef), SSE_splat_mask:$sm),
+          (SHUFPSrri VR128:$src, VR128:$src, SSE_splat_mask:$sm)>,
+      Requires<[HasSSE1]>;
+
+// Special unary SHUFPSrri case.
+// FIXME: when we want non-two-address code, should we use PSHUFD instead?
+def : Pat<(vector_shuffle (v4f32 VR128:$src1), (undef),
+           SHUFP_unary_shuffle_mask:$sm),
+          (SHUFPSrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
+      Requires<[HasSSE1]>;
+// Unary v4f32 shuffle with PSHUF* in order to fold a load.
+def : Pat<(vector_shuffle (loadv4f32 addr:$src1), (undef),
+           SHUFP_unary_shuffle_mask:$sm),
+          (PSHUFDmi addr:$src1, SHUFP_unary_shuffle_mask:$sm)>,
+      Requires<[HasSSE2]>;
+// Special binary v4i32 shuffle cases with SHUFPS.
+def : Pat<(vector_shuffle (v4i32 VR128:$src1), (v4i32 VR128:$src2),
+           PSHUFD_binary_shuffle_mask:$sm),
+          (SHUFPSrri VR128:$src1, VR128:$src2, PSHUFD_binary_shuffle_mask:$sm)>,
+           Requires<[HasSSE2]>;
+def : Pat<(vector_shuffle (v4i32 VR128:$src1),
+           (bc_v4i32 (loadv2i64 addr:$src2)), PSHUFD_binary_shuffle_mask:$sm),
+          (SHUFPSrmi VR128:$src1, addr:$src2, PSHUFD_binary_shuffle_mask:$sm)>,
+           Requires<[HasSSE2]>;
+
+// vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
+let AddedComplexity = 10 in {
+def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef),
+                  UNPCKL_v_undef_shuffle_mask)),
+          (UNPCKLPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v16i8 (vector_shuffle VR128:$src, (undef),
+                  UNPCKL_v_undef_shuffle_mask)),
+          (PUNPCKLBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 (vector_shuffle VR128:$src, (undef),
+                  UNPCKL_v_undef_shuffle_mask)),
+          (PUNPCKLWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
+                  UNPCKL_v_undef_shuffle_mask)),
+          (PUNPCKLDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+}
+
+// vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
+let AddedComplexity = 10 in {
+def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef),
+                  UNPCKH_v_undef_shuffle_mask)),
+          (UNPCKHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v16i8 (vector_shuffle VR128:$src, (undef),
+                  UNPCKH_v_undef_shuffle_mask)),
+          (PUNPCKHBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 (vector_shuffle VR128:$src, (undef),
+                  UNPCKH_v_undef_shuffle_mask)),
+          (PUNPCKHWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
+                  UNPCKH_v_undef_shuffle_mask)),
+          (PUNPCKHDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+}
+
+let AddedComplexity = 15 in {
+// vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+                  MOVHP_shuffle_mask)),
+          (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+
+// vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+                  MOVHLPS_shuffle_mask)),
+          (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+
+// vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
+def : Pat<(v4f32 (vector_shuffle VR128:$src1, (undef),
+                  MOVHLPS_v_undef_shuffle_mask)),
+          (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, (undef),
+                  MOVHLPS_v_undef_shuffle_mask)),
+          (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+}
+
+let AddedComplexity = 20 in {
+// vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
+// vector_shuffle v1, (load v2) <0, 1, 4, 5> using MOVHPS
+def : Pat<(v4f32 (vector_shuffle VR128:$src1, (loadv4f32 addr:$src2),
+                  MOVLP_shuffle_mask)),
+          (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(v2f64 (vector_shuffle VR128:$src1, (loadv2f64 addr:$src2),
+                  MOVLP_shuffle_mask)),
+          (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4f32 (vector_shuffle VR128:$src1, (loadv4f32 addr:$src2),
+                  MOVHP_shuffle_mask)),
+          (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(v2f64 (vector_shuffle VR128:$src1, (loadv2f64 addr:$src2),
+                  MOVHP_shuffle_mask)),
+          (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)),
+                  MOVLP_shuffle_mask)),
+          (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (vector_shuffle VR128:$src1, (loadv2i64 addr:$src2),
+                  MOVLP_shuffle_mask)),
+          (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)),
+                  MOVHP_shuffle_mask)),
+          (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(v2i64 (vector_shuffle VR128:$src1, (loadv2i64 addr:$src2),
+                  MOVHP_shuffle_mask)),
+          (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+}
+
+let AddedComplexity = 15 in {
+// Setting the lowest element in the vector.
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+                  MOVL_shuffle_mask)),
+          (MOVLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (vector_shuffle VR128:$src1, VR128:$src2,
+                  MOVL_shuffle_mask)),
+          (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+// vector_shuffle v1, v2 <4, 5, 2, 3> using MOVLPDrr (movsd)
+def : Pat<(v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
+                  MOVLP_shuffle_mask)),
+          (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+                  MOVLP_shuffle_mask)),
+          (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+}
+
+// Set lowest element and zero upper elements.
+let AddedComplexity = 20 in
+def : Pat<(bc_v2i64 (vector_shuffle immAllZerosV,
+                     (v2f64 (scalar_to_vector (loadf64 addr:$src))),
+                     MOVL_shuffle_mask)),
+          (MOVZQI2PQIrm addr:$src)>, Requires<[HasSSE2]>;
+
+// FIXME: Temporary workaround since 2-wide shuffle is broken.
+def : Pat<(int_x86_sse2_movs_d  VR128:$src1, VR128:$src2),
+          (v2f64 (MOVLPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_loadh_pd VR128:$src1, addr:$src2),
+          (v2f64 (MOVHPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_loadl_pd VR128:$src1, addr:$src2),
+          (v2f64 (MOVLPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_shuf_pd VR128:$src1, VR128:$src2, imm:$src3),
+          (v2f64 (SHUFPDrri VR128:$src1, VR128:$src2, imm:$src3))>,
+      Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_shuf_pd VR128:$src1, (load addr:$src2), imm:$src3),
+          (v2f64 (SHUFPDrmi VR128:$src1, addr:$src2, imm:$src3))>,
+      Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_unpckh_pd VR128:$src1, VR128:$src2),
+          (v2f64 (UNPCKHPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_unpckh_pd VR128:$src1, (load addr:$src2)),
+          (v2f64 (UNPCKHPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_unpckl_pd VR128:$src1, VR128:$src2),
+          (v2f64 (UNPCKLPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_unpckl_pd VR128:$src1, (load addr:$src2)),
+          (v2f64 (UNPCKLPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_punpckh_qdq VR128:$src1, VR128:$src2),
+          (v2i64 (PUNPCKHQDQrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_punpckh_qdq VR128:$src1, (load addr:$src2)),
+          (v2i64 (PUNPCKHQDQrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_punpckl_qdq VR128:$src1, VR128:$src2),
+          (v2i64 (PUNPCKLQDQrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_punpckl_qdq VR128:$src1, (load addr:$src2)),
+          (PUNPCKLQDQrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+
+// Some special case pandn patterns.
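+// pandn computes (NOT $src1) AND $src2, so these match the generic
+// (and (xor X, <all ones>), Y) form; the three variants differ only in the
+// vector type of the all-ones immediate being bitconverted to v2i64.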
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
+                  VR128:$src2)),
+          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
+                  VR128:$src2)),
+          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
+                  VR128:$src2)),
+          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
+                  (load addr:$src2))),
+          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
+                  (load addr:$src2))),
+          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
+                  (load addr:$src2))),
+          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+
+// Unaligned load
+def : Pat<(v4f32 (X86loadu addr:$src)), (MOVUPSrm addr:$src)>,
+      Requires<[HasSSE1]>;
diff --git a/lib/Target/X86/X86InstrX86-64.td b/lib/Target/X86/X86InstrX86-64.td
new file mode 100644
index 0000000..ac43846
--- /dev/null
+++ b/lib/Target/X86/X86InstrX86-64.td
@@ -0,0 +1,1165 @@
+//====- X86InstrX86-64.td - Describe the X86 Instruction Set ----*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86-64 instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions...
+//
+
+// 64 bits, but only 32 bits are significant.
+def i64i32imm  : Operand<i64>;
+// 64 bits, but only 8 bits are significant.
+def i64i8imm   : Operand<i64>;
+
+def lea64mem : Operand<i64> {
+  let PrintMethod = "printi64mem";
+  let MIOperandInfo = (ops GR64, i8imm, GR64, i32imm);
+}
+
+def lea64_32mem : Operand<i32> {
+  let PrintMethod = "printlea64_32mem";
+  let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
+}
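+
+// The four MIOperandInfo fields above presumably follow the usual X86 memory
+// operand layout: base register, scale immediate, index register, and
+// displacement, i.e. the address base + scale*index + disp.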
+
+//===----------------------------------------------------------------------===//
+// Complex Pattern Definitions...
+//
+def lea64addr : ComplexPattern<i64, 4, "SelectLEAAddr",
+                               [add, mul, shl, or, frameindex, X86Wrapper],
+                               []>;
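+
+// SelectLEAAddr (in the DAG instruction selector) is expected to fold the node
+// kinds listed above into the four lea64mem operands, so e.g.
+//   (add GR64:$base, (shl GR64:$index, 3))
+// can be selected as a single lea with scale 8.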
+
+//===----------------------------------------------------------------------===//
+// Instruction templates...
+//
+
+class RI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, REX_W;
+class RIi8 <bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : Ii8<o, F, ops, asm, pattern>, REX_W;
+class RIi32 <bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : Ii32<o, F, ops, asm, pattern>, REX_W;
+
+class RIi64<bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+  : X86Inst<o, f, Imm64, ops, asm>, REX_W {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class RSSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : SSI<o, F, ops, asm, pattern>, REX_W;
+class RSDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : SDI<o, F, ops, asm, pattern>, REX_W;
+class RPDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : PDI<o, F, ops, asm, pattern>, REX_W;
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments...
+//
+
+def i64immSExt32  : PatLeaf<(i64 imm), [{
+  // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+  // sign extended field.
+  return (int64_t)N->getValue() == (int32_t)N->getValue();
+}]>;
+
+def i64immZExt32  : PatLeaf<(i64 imm), [{
+  // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+  // zero extended field.
+  return (uint64_t)N->getValue() == (uint32_t)N->getValue();
+}]>;
+
+def i64immSExt8  : PatLeaf<(i64 imm), [{
+  // i64immSExt8 predicate - True if the 64-bit immediate fits in an 8-bit
+  // sign extended field.
+  return (int64_t)N->getValue() == (int8_t)N->getValue();
+}]>;
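+
+// For example: -16 satisfies i64immSExt8 and i64immSExt32 but not i64immZExt32;
+// 0x7FFFFFFF satisfies i64immSExt32 and i64immZExt32; 0xFFFFFFFF satisfies only
+// i64immZExt32; and 0x100000000 satisfies none of them.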
+
+def sextloadi64i1  : PatFrag<(ops node:$ptr), (i64 (sextloadi1 node:$ptr))>;
+def sextloadi64i8  : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
+def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
+def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
+
+def zextloadi64i1  : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>;
+def zextloadi64i8  : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
+def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
+def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
+
+def extloadi64i1   : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
+def extloadi64i8   : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
+def extloadi64i16  : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
+def extloadi64i32  : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
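+
+// These fragments pin the result type of the generic extending loads to i64 so
+// that the 64-bit move patterns below (e.g. MOVSX64rm32 using sextloadi64i32)
+// can refer to a specific source width directly.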
+
+//===----------------------------------------------------------------------===//
+// Instruction list...
+//
+
+def IMPLICIT_DEF_GR64  : I<0, Pseudo, (ops GR64:$dst),
+                         "#IMPLICIT_DEF $dst",
+                         [(set GR64:$dst, (undef))]>;
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+let isCall = 1, noResults = 1 in
+  // All calls clobber the non-callee saved registers...
+  let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+              FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+              MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+              XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+              XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15] in {
+    def CALL64pcrel32 : I<0xE8, RawFrm, (ops i64imm:$dst, variable_ops),
+                          "call ${dst:call}", []>;
+    def CALL64r       : I<0xFF, MRM2r, (ops GR64:$dst, variable_ops),
+                          "call {*}$dst", [(X86call GR64:$dst)]>;
+    def CALL64m       : I<0xFF, MRM2m, (ops i64mem:$dst, variable_ops),
+                          "call {*}$dst", []>;
+  }
+
+// Branches
+let isBranch = 1, isTerminator = 1, noResults = 1, isBarrier = 1 in {
+  def JMP64r     : I<0xFF, MRM4r, (ops GR64:$dst), "jmp{q} {*}$dst",
+                     [(brind GR64:$dst)]>;
+  def JMP64m     : I<0xFF, MRM4m, (ops i64mem:$dst), "jmp{q} {*}$dst",
+                     [(brind (loadi64 addr:$dst))]>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions...
+//
+def LEAVE64  : I<0xC9, RawFrm,
+                 (ops), "leave", []>, Imp<[RBP,RSP],[RBP,RSP]>;
+def POP64r   : I<0x58, AddRegFrm,
+                 (ops GR64:$reg), "pop{q} $reg", []>, Imp<[RSP],[RSP]>;
+def PUSH64r  : I<0x50, AddRegFrm,
+                 (ops GR64:$reg), "push{q} $reg", []>, Imp<[RSP],[RSP]>;
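+
+// Imp<[uses],[defs]> presumably lists the implicit register uses first and the
+// implicit defs second; e.g. POP64r above implicitly reads and updates RSP,
+// and MUL64r later reads RAX and defines RAX and RDX.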
+
+def LEA64_32r : I<0x8D, MRMSrcMem,
+                  (ops GR32:$dst, lea64_32mem:$src),
+                  "lea{l} {$src|$dst}, {$dst|$src}",
+                  [(set GR32:$dst, lea32addr:$src)]>, Requires<[In64BitMode]>;
+
+def LEA64r   : RI<0x8D, MRMSrcMem, (ops GR64:$dst, lea64mem:$src),
+                  "lea{q} {$src|$dst}, {$dst|$src}",
+                  [(set GR64:$dst, lea64addr:$src)]>;
+
+let isTwoAddress = 1 in
+def BSWAP64r : RI<0xC8, AddRegFrm, (ops GR64:$dst, GR64:$src),
+                  "bswap{q} $dst", 
+                  [(set GR64:$dst, (bswap GR64:$src))]>, TB;
+// Exchange
+def XCHG64rr : RI<0x87, MRMDestReg, (ops GR64:$src1, GR64:$src2),
+                  "xchg{q} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG64mr : RI<0x87, MRMDestMem, (ops i64mem:$src1, GR64:$src2),
+                  "xchg{q} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG64rm : RI<0x87, MRMSrcMem, (ops GR64:$src1, i64mem:$src2),
+                  "xchg{q} {$src2|$src1}, {$src1|$src2}", []>;
+
+// Repeat string ops
+def REP_MOVSQ : RI<0xA5, RawFrm, (ops), "{rep;movsq|rep movsq}",
+                   [(X86rep_movs i64)]>,
+                Imp<[RCX,RDI,RSI], [RCX,RDI,RSI]>, REP;
+def REP_STOSQ : RI<0xAB, RawFrm, (ops), "{rep;stosq|rep stosq}",
+                   [(X86rep_stos i64)]>,
+                Imp<[RAX,RCX,RDI], [RCX,RDI]>, REP;
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions...
+//
+
+def MOV64rr : RI<0x89, MRMDestReg, (ops GR64:$dst, GR64:$src),
+                 "mov{q} {$src, $dst|$dst, $src}", []>;
+
+def MOV64ri : RIi64<0xB8, AddRegFrm, (ops GR64:$dst, i64imm:$src),
+                    "movabs{q} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, imm:$src)]>;
+def MOV64ri32 : RIi32<0xC7, MRM0r, (ops GR64:$dst, i64i32imm:$src),
+                      "mov{q} {$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, i64immSExt32:$src)]>;
+
+def MOV64rm : RI<0x8B, MRMSrcMem, (ops GR64:$dst, i64mem:$src),
+                 "mov{q} {$src, $dst|$dst, $src}",
+                 [(set GR64:$dst, (load addr:$src))]>;
+
+def MOV64mr : RI<0x89, MRMDestMem, (ops i64mem:$dst, GR64:$src),
+                 "mov{q} {$src, $dst|$dst, $src}",
+                 [(store GR64:$src, addr:$dst)]>;
+def MOV64mi32 : RIi32<0xC7, MRM0m, (ops i64mem:$dst, i64i32imm:$src),
+                      "mov{q} {$src, $dst|$dst, $src}",
+                      [(store i64immSExt32:$src, addr:$dst)]>;
+
+// Sign/Zero extenders
+
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (ops GR64:$dst, GR8 :$src),
+                    "movs{bq|x} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR8:$src))]>, TB;
+def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (ops GR64:$dst, i8mem :$src),
+                    "movs{bq|x} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i8 addr:$src))]>, TB;
+def MOVSX64rr16: RI<0xBF, MRMSrcReg, (ops GR64:$dst, GR16:$src),
+                    "movs{wq|x} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR16:$src))]>, TB;
+def MOVSX64rm16: RI<0xBF, MRMSrcMem, (ops GR64:$dst, i16mem:$src),
+                    "movs{wq|x} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i16 addr:$src))]>, TB;
+def MOVSX64rr32: RI<0x63, MRMSrcReg, (ops GR64:$dst, GR32:$src),
+                    "movs{lq|xd} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR32:$src))]>;
+def MOVSX64rm32: RI<0x63, MRMSrcMem, (ops GR64:$dst, i32mem:$src),
+                    "movs{lq|xd} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i32 addr:$src))]>;
+
+def MOVZX64rr8 : RI<0xB6, MRMSrcReg, (ops GR64:$dst, GR8 :$src),
+                    "movz{bq|x} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (zext GR8:$src))]>, TB;
+def MOVZX64rm8 : RI<0xB6, MRMSrcMem, (ops GR64:$dst, i8mem :$src),
+                    "movz{bq|x} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (zextloadi64i8 addr:$src))]>, TB;
+def MOVZX64rr16: RI<0xB7, MRMSrcReg, (ops GR64:$dst, GR16:$src),
+                    "movz{wq|x} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (zext GR16:$src))]>, TB;
+def MOVZX64rm16: RI<0xB7, MRMSrcMem, (ops GR64:$dst, i16mem:$src),
+                    "movz{wq|x} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (zextloadi64i16 addr:$src))]>, TB;
+
+def CDQE : RI<0x98, RawFrm, (ops),
+             "{cltq|cdqe}", []>, Imp<[EAX],[RAX]>;     // RAX = signext(EAX)
+
+def CQO  : RI<0x99, RawFrm, (ops),
+              "{cqto|cqo}", []>, Imp<[RAX],[RAX,RDX]>; // RDX:RAX = signext(RAX)
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions...
+//
+
+let isTwoAddress = 1 in {
+let isConvertibleToThreeAddress = 1 in {
+let isCommutable = 1 in
+def ADD64rr  : RI<0x01, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                  "add{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (add GR64:$src1, GR64:$src2))]>;
+
+def ADD64ri32 : RIi32<0x81, MRM0r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+                      "add{q} {$src2, $dst|$dst, $src2}",
+                      [(set GR64:$dst, (add GR64:$src1, i64immSExt32:$src2))]>;
+def ADD64ri8 : RIi8<0x83, MRM0r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                    "add{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (add GR64:$src1, i64immSExt8:$src2))]>;
+} // isConvertibleToThreeAddress
+
+def ADD64rm  : RI<0x03, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                  "add{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (add GR64:$src1, (load addr:$src2)))]>;
+} // isTwoAddress
+
+def ADD64mr  : RI<0x01, MRMDestMem, (ops i64mem:$dst, GR64:$src2),
+                  "add{q} {$src2, $dst|$dst, $src2}",
+                  [(store (add (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def ADD64mi32 : RIi32<0x81, MRM0m, (ops i64mem:$dst, i64i32imm :$src2),
+                      "add{q} {$src2, $dst|$dst, $src2}",
+               [(store (add (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+def ADD64mi8 : RIi8<0x83, MRM0m, (ops i64mem:$dst, i64i8imm :$src2),
+                    "add{q} {$src2, $dst|$dst, $src2}",
+                [(store (add (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def ADC64rr  : RI<0x11, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                  "adc{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (adde GR64:$src1, GR64:$src2))]>;
+
+def ADC64rm  : RI<0x13, MRMSrcMem , (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                  "adc{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (adde GR64:$src1, (load addr:$src2)))]>;
+
+def ADC64ri32 : RIi32<0x81, MRM2r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+                      "adc{q} {$src2, $dst|$dst, $src2}",
+                      [(set GR64:$dst, (adde GR64:$src1, i64immSExt32:$src2))]>;
+def ADC64ri8 : RIi8<0x83, MRM2r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                    "adc{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (adde GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def ADC64mr  : RI<0x11, MRMDestMem, (ops i64mem:$dst, GR64:$src2),
+                  "adc{q} {$src2, $dst|$dst, $src2}",
+                  [(store (adde (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def ADC64mi32 : RIi32<0x81, MRM2m, (ops i64mem:$dst, i64i32imm:$src2),
+                      "adc{q} {$src2, $dst|$dst, $src2}",
+              [(store (adde (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+def ADC64mi8 : RIi8<0x83, MRM2m, (ops i64mem:$dst, i64i8imm :$src2),
+                    "adc{q} {$src2, $dst|$dst, $src2}",
+               [(store (adde (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def SUB64rr  : RI<0x29, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                  "sub{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (sub GR64:$src1, GR64:$src2))]>;
+
+def SUB64rm  : RI<0x2B, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                  "sub{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (sub GR64:$src1, (load addr:$src2)))]>;
+
+def SUB64ri32 : RIi32<0x81, MRM5r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+                      "sub{q} {$src2, $dst|$dst, $src2}",
+                      [(set GR64:$dst, (sub GR64:$src1, i64immSExt32:$src2))]>;
+def SUB64ri8 : RIi8<0x83, MRM5r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                    "sub{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (sub GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def SUB64mr  : RI<0x29, MRMDestMem, (ops i64mem:$dst, GR64:$src2), 
+                  "sub{q} {$src2, $dst|$dst, $src2}",
+                  [(store (sub (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def SUB64mi32 : RIi32<0x81, MRM5m, (ops i64mem:$dst, i64i32imm:$src2), 
+                      "sub{q} {$src2, $dst|$dst, $src2}",
+               [(store (sub (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+def SUB64mi8 : RIi8<0x83, MRM5m, (ops i64mem:$dst, i64i8imm :$src2), 
+                    "sub{q} {$src2, $dst|$dst, $src2}",
+                [(store (sub (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def SBB64rr    : RI<0x19, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                    "sbb{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (sube GR64:$src1, GR64:$src2))]>;
+
+def SBB64rm  : RI<0x1B, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                  "sbb{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (sube GR64:$src1, (load addr:$src2)))]>;
+
+def SBB64ri32 : RIi32<0x81, MRM3r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+                      "sbb{q} {$src2, $dst|$dst, $src2}",
+                      [(set GR64:$dst, (sube GR64:$src1, i64immSExt32:$src2))]>;
+def SBB64ri8 : RIi8<0x83, MRM3r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                    "sbb{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (sube GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def SBB64mr  : RI<0x19, MRMDestMem, (ops i64mem:$dst, GR64:$src2), 
+                  "sbb{q} {$src2, $dst|$dst, $src2}",
+                  [(store (sube (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def SBB64mi32 : RIi32<0x81, MRM3m, (ops i64mem:$dst, i64i32imm:$src2), 
+                      "sbb{q} {$src2, $dst|$dst, $src2}",
+              [(store (sube (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+def SBB64mi8 : RIi8<0x83, MRM3m, (ops i64mem:$dst, i64i8imm :$src2), 
+                    "sbb{q} {$src2, $dst|$dst, $src2}",
+               [(store (sube (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+
+// Unsigned multiplication
+def MUL64r : RI<0xF7, MRM4r, (ops GR64:$src),
+                "mul{q} $src", []>,
+             Imp<[RAX],[RAX,RDX]>;         // RAX,RDX = RAX*GR64
+def MUL64m : RI<0xF7, MRM4m, (ops i64mem:$src),
+                "mul{q} $src", []>,
+             Imp<[RAX],[RAX,RDX]>;         // RAX,RDX = RAX*[mem64]
+
+// Signed multiplication
+def IMUL64r : RI<0xF7, MRM5r, (ops GR64:$src),
+                 "imul{q} $src", []>,
+              Imp<[RAX],[RAX,RDX]>;         // RAX,RDX = RAX*GR64
+def IMUL64m : RI<0xF7, MRM5m, (ops i64mem:$src),
+                 "imul{q} $src", []>,
+              Imp<[RAX],[RAX,RDX]>;         // RAX,RDX = RAX*[mem64]
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def IMUL64rr : RI<0xAF, MRMSrcReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                  "imul{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (mul GR64:$src1, GR64:$src2))]>, TB;
+
+def IMUL64rm : RI<0xAF, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                  "imul{q} {$src2, $dst|$dst, $src2}",
+                 [(set GR64:$dst, (mul GR64:$src1, (load addr:$src2)))]>, TB;
+} // isTwoAddress
+
+// Surprisingly enough, these are not two-address instructions!
+def IMUL64rri32 : RIi32<0x69, MRMSrcReg,                    // GR64 = GR64*I32
+                        (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+                        "imul{q} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set GR64:$dst, (mul GR64:$src1, i64immSExt32:$src2))]>;
+def IMUL64rri8 : RIi8<0x6B, MRMSrcReg,                      // GR64 = GR64*I8
+                      (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                      "imul{q} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR64:$dst, (mul GR64:$src1, i64immSExt8:$src2))]>;
+def IMUL64rmi32 : RIi32<0x69, MRMSrcMem,                   // GR64 = [mem64]*I32
+                        (ops GR64:$dst, i64mem:$src1, i64i32imm:$src2),
+                        "imul{q} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                [(set GR64:$dst, (mul (load addr:$src1), i64immSExt32:$src2))]>;
+def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem,                      // GR64 = [mem64]*I8
+                      (ops GR64:$dst, i64mem:$src1, i64i8imm: $src2),
+                      "imul{q} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                 [(set GR64:$dst, (mul (load addr:$src1), i64immSExt8:$src2))]>;
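+
+// For example, IMUL64rri32 takes a distinct destination, e.g. (AT&T syntax)
+//   imulq $1234, %rsi, %rdi    # rdi = rsi * 1234
+// which is why these forms are not marked isTwoAddress.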
+
+// Unsigned division / remainder
+def DIV64r : RI<0xF7, MRM6r, (ops GR64:$src),        // RDX:RAX/r64 = RAX,RDX
+                "div{q} $src", []>, Imp<[RAX,RDX],[RAX,RDX]>;
+def DIV64m : RI<0xF7, MRM6m, (ops i64mem:$src),      // RDX:RAX/[mem64] = RAX,RDX
+                "div{q} $src", []>, Imp<[RAX,RDX],[RAX,RDX]>;
+
+// Signed division / remainder
+def IDIV64r: RI<0xF7, MRM7r, (ops GR64:$src),        // RDX:RAX/r64 = RAX,RDX
+                "idiv{q} $src", []>, Imp<[RAX,RDX],[RAX,RDX]>;
+def IDIV64m: RI<0xF7, MRM7m, (ops i64mem:$src),      // RDX:RAX/[mem64] = RAX,RDX
+                "idiv{q} $src", []>, Imp<[RAX,RDX],[RAX,RDX]>;
+
+// Unary instructions
+let CodeSize = 2 in {
+let isTwoAddress = 1 in
+def NEG64r : RI<0xF7, MRM3r, (ops GR64:$dst, GR64:$src), "neg{q} $dst",
+                [(set GR64:$dst, (ineg GR64:$src))]>;
+def NEG64m : RI<0xF7, MRM3m, (ops i64mem:$dst), "neg{q} $dst",
+                [(store (ineg (loadi64 addr:$dst)), addr:$dst)]>;
+
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in
+def INC64r : RI<0xFF, MRM0r, (ops GR64:$dst, GR64:$src), "inc{q} $dst",
+                [(set GR64:$dst, (add GR64:$src, 1))]>;
+def INC64m : RI<0xFF, MRM0m, (ops i64mem:$dst), "inc{q} $dst",
+                [(store (add (loadi64 addr:$dst), 1), addr:$dst)]>;
+
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in
+def DEC64r : RI<0xFF, MRM1r, (ops GR64:$dst, GR64:$src), "dec{q} $dst",
+                [(set GR64:$dst, (add GR64:$src, -1))]>;
+def DEC64m : RI<0xFF, MRM1m, (ops i64mem:$dst), "dec{q} $dst",
+                [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
+
+// In 64-bit mode, the single-byte INC and DEC encodings cannot be used; those
+// opcode bytes are reinterpreted as REX prefixes.
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in {
+// Can transform into LEA.
+def INC64_16r : I<0xFF, MRM0r, (ops GR16:$dst, GR16:$src), "inc{w} $dst",
+                  [(set GR16:$dst, (add GR16:$src, 1))]>,
+                OpSize, Requires<[In64BitMode]>;
+def INC64_32r : I<0xFF, MRM0r, (ops GR32:$dst, GR32:$src), "inc{l} $dst",
+                  [(set GR32:$dst, (add GR32:$src, 1))]>,
+                Requires<[In64BitMode]>;
+def DEC64_16r : I<0xFF, MRM1r, (ops GR16:$dst, GR16:$src), "dec{w} $dst",
+                  [(set GR16:$dst, (add GR16:$src, -1))]>,
+                OpSize, Requires<[In64BitMode]>;
+def DEC64_32r : I<0xFF, MRM1r, (ops GR32:$dst, GR32:$src), "dec{l} $dst",
+                  [(set GR32:$dst, (add GR32:$src, -1))]>,
+                Requires<[In64BitMode]>;
+} // isConvertibleToThreeAddress
+} // CodeSize
+
+
+// Shift instructions
+let isTwoAddress = 1 in {
+def SHL64rCL : RI<0xD3, MRM4r, (ops GR64:$dst, GR64:$src),
+                  "shl{q} {%cl, $dst|$dst, %CL}",
+                  [(set GR64:$dst, (shl GR64:$src, CL))]>,
+               Imp<[CL],[]>;
+def SHL64ri  : RIi8<0xC1, MRM4r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+                    "shl{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))]>;
+def SHL64r1  : RI<0xD1, MRM4r, (ops GR64:$dst, GR64:$src1),
+                 "shl{q} $dst", []>;
+} // isTwoAddress
+
+def SHL64mCL : RI<0xD3, MRM4m, (ops i64mem:$dst),
+                  "shl{q} {%cl, $dst|$dst, %CL}",
+                  [(store (shl (loadi64 addr:$dst), CL), addr:$dst)]>,
+               Imp<[CL],[]>;
+def SHL64mi : RIi8<0xC1, MRM4m, (ops i64mem:$dst, i8imm:$src),
+                  "shl{q} {$src, $dst|$dst, $src}",
+                 [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SHL64m1 : RI<0xD1, MRM4m, (ops i64mem:$dst),
+                  "shl{q} $dst",
+                 [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def SHR64rCL : RI<0xD3, MRM5r, (ops GR64:$dst, GR64:$src),
+                  "shr{q} {%cl, $dst|$dst, %CL}",
+                  [(set GR64:$dst, (srl GR64:$src, CL))]>,
+               Imp<[CL],[]>;
+def SHR64ri : RIi8<0xC1, MRM5r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+                  "shr{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))]>;
+def SHR64r1  : RI<0xD1, MRM5r, (ops GR64:$dst, GR64:$src1),
+                 "shr{q} $dst",
+                 [(set GR64:$dst, (srl GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+def SHR64mCL : RI<0xD3, MRM5m, (ops i64mem:$dst),
+                  "shr{q} {%cl, $dst|$dst, %CL}",
+                  [(store (srl (loadi64 addr:$dst), CL), addr:$dst)]>,
+               Imp<[CL],[]>;
+def SHR64mi : RIi8<0xC1, MRM5m, (ops i64mem:$dst, i8imm:$src),
+                  "shr{q} {$src, $dst|$dst, $src}",
+                 [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SHR64m1 : RI<0xD1, MRM5m, (ops i64mem:$dst),
+                  "shr{q} $dst",
+                 [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def SAR64rCL : RI<0xD3, MRM7r, (ops GR64:$dst, GR64:$src),
+                 "sar{q} {%cl, $dst|$dst, %CL}",
+                 [(set GR64:$dst, (sra GR64:$src, CL))]>, Imp<[CL],[]>;
+def SAR64ri  : RIi8<0xC1, MRM7r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+                   "sar{q} {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))]>;
+def SAR64r1  : RI<0xD1, MRM7r, (ops GR64:$dst, GR64:$src1),
+                 "sar{q} $dst",
+                 [(set GR64:$dst, (sra GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+def SAR64mCL : RI<0xD3, MRM7m, (ops i64mem:$dst), 
+                 "sar{q} {%cl, $dst|$dst, %CL}",
+                 [(store (sra (loadi64 addr:$dst), CL), addr:$dst)]>,
+               Imp<[CL],[]>;
+def SAR64mi  : RIi8<0xC1, MRM7m, (ops i64mem:$dst, i8imm:$src),
+                    "sar{q} {$src, $dst|$dst, $src}",
+                 [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SAR64m1 : RI<0xD1, MRM7m, (ops i64mem:$dst),
+                  "sar{q} $dst",
+                 [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+// Rotate instructions
+let isTwoAddress = 1 in {
+def ROL64rCL : RI<0xD3, MRM0r, (ops GR64:$dst, GR64:$src),
+                  "rol{q} {%cl, $dst|$dst, %CL}",
+                  [(set GR64:$dst, (rotl GR64:$src, CL))]>, Imp<[CL],[]>;
+def ROL64ri  : RIi8<0xC1, MRM0r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+                    "rol{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
+def ROL64r1  : RI<0xD1, MRM0r, (ops GR64:$dst, GR64:$src1),
+                  "rol{q} $dst",
+                  [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+def ROL64mCL :  I<0xD3, MRM0m, (ops i64mem:$dst),
+                  "rol{q} {%cl, $dst|$dst, %CL}",
+                  [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)]>,
+               Imp<[CL],[]>;
+def ROL64mi  : RIi8<0xC1, MRM0m, (ops i64mem:$dst, i8imm:$src),
+                    "rol{q} {$src, $dst|$dst, $src}",
+                [(store (rotl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def ROL64m1  : RI<0xD1, MRM0m, (ops i64mem:$dst),
+                 "rol{q} $dst",
+               [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def ROR64rCL : RI<0xD3, MRM1r, (ops GR64:$dst, GR64:$src),
+                  "ror{q} {%cl, $dst|$dst, %CL}",
+                  [(set GR64:$dst, (rotr GR64:$src, CL))]>, Imp<[CL],[]>;
+def ROR64ri  : RIi8<0xC1, MRM1r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+                    "ror{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))]>;
+def ROR64r1  : RI<0xD1, MRM1r, (ops GR64:$dst, GR64:$src1),
+                  "ror{q} $dst",
+                  [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+def ROR64mCL : RI<0xD3, MRM1m, (ops i64mem:$dst), 
+                  "ror{q} {%cl, $dst|$dst, %CL}",
+                  [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)]>,
+               Imp<[CL],[]>;
+def ROR64mi  : RIi8<0xC1, MRM1m, (ops i64mem:$dst, i8imm:$src),
+                    "ror{q} {$src, $dst|$dst, $src}",
+                [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def ROR64m1  : RI<0xD1, MRM1m, (ops i64mem:$dst),
+                 "ror{q} $dst",
+               [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+// Double shift instructions (generalizations of rotate)
+let isTwoAddress = 1 in {
+def SHLD64rrCL : RI<0xA5, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                    "shld{q} {%cl, $src2, $dst|$dst, $src2, %CL}", []>,
+                 Imp<[CL],[]>, TB;
+def SHRD64rrCL : RI<0xAD, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                    "shrd{q} {%cl, $src2, $dst|$dst, $src2, %CL}", []>,
+                 Imp<[CL],[]>, TB;
+
+let isCommutable = 1 in {  // FIXME: Update X86InstrInfo::commuteInstruction
+def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
+                      (ops GR64:$dst, GR64:$src1, GR64:$src2, i8imm:$src3),
+                      "shld{q} {$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+                      TB;
+def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
+                      (ops GR64:$dst, GR64:$src1, GR64:$src2, i8imm:$src3),
+                      "shrd{q} {$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+                 TB;
+} // isCommutable
+} // isTwoAddress
+
+// Temporary hack: there are no patterns associated with these instructions,
+// so we have to tell tblgen that they do not produce results.
+let noResults = 1 in {
+def SHLD64mrCL : RI<0xA5, MRMDestMem, (ops i64mem:$dst, GR64:$src2),
+                    "shld{q} {%cl, $src2, $dst|$dst, $src2, %CL}", []>,
+                 Imp<[CL],[]>, TB;
+def SHRD64mrCL : RI<0xAD, MRMDestMem, (ops i64mem:$dst, GR64:$src2),
+                    "shrd{q} {%cl, $src2, $dst|$dst, $src2, %CL}", []>,
+                 Imp<[CL],[]>, TB;
+def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
+                      (ops i64mem:$dst, GR64:$src2, i8imm:$src3),
+                      "shld{q} {$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+                 TB;
+def SHRD64mri8 : RIi8<0xAC, MRMDestMem, 
+                      (ops i64mem:$dst, GR64:$src2, i8imm:$src3),
+                      "shrd{q} {$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+                 TB;
+} // noResults
+
+//===----------------------------------------------------------------------===//
+//  Logical Instructions...
+//
+
+let isTwoAddress = 1 in
+def NOT64r : RI<0xF7, MRM2r, (ops GR64:$dst, GR64:$src), "not{q} $dst",
+                [(set GR64:$dst, (not GR64:$src))]>;
+def NOT64m : RI<0xF7, MRM2m, (ops i64mem:$dst), "not{q} $dst",
+                [(store (not (loadi64 addr:$dst)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def AND64rr  : RI<0x21, MRMDestReg, 
+                  (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                  "and{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (and GR64:$src1, GR64:$src2))]>;
+def AND64rm  : RI<0x23, MRMSrcMem,
+                  (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                  "and{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (and GR64:$src1, (load addr:$src2)))]>;
+def AND64ri32  : RIi32<0x81, MRM4r, 
+                       (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+                       "and{q} {$src2, $dst|$dst, $src2}",
+                       [(set GR64:$dst, (and GR64:$src1, i64immSExt32:$src2))]>;
+def AND64ri8 : RIi8<0x83, MRM4r, 
+                    (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                    "and{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (and GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def AND64mr  : RI<0x21, MRMDestMem,
+                  (ops i64mem:$dst, GR64:$src),
+                  "and{q} {$src, $dst|$dst, $src}",
+                  [(store (and (load addr:$dst), GR64:$src), addr:$dst)]>;
+def AND64mi32  : RIi32<0x81, MRM4m,
+                       (ops i64mem:$dst, i64i32imm:$src),
+                       "and{q} {$src, $dst|$dst, $src}",
+             [(store (and (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst)]>;
+def AND64mi8 : RIi8<0x83, MRM4m,
+                    (ops i64mem:$dst, i64i8imm :$src),
+                    "and{q} {$src, $dst|$dst, $src}",
+                 [(store (and (load addr:$dst), i64immSExt8:$src), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def OR64rr   : RI<0x09, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                  "or{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (or GR64:$src1, GR64:$src2))]>;
+def OR64rm   : RI<0x0B, MRMSrcMem , (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                  "or{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (or GR64:$src1, (load addr:$src2)))]>;
+def OR64ri32 : RIi32<0x81, MRM1r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+                     "or{q} {$src2, $dst|$dst, $src2}",
+                     [(set GR64:$dst, (or GR64:$src1, i64immSExt32:$src2))]>;
+def OR64ri8  : RIi8<0x83, MRM1r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                    "or{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (or GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def OR64mr : RI<0x09, MRMDestMem, (ops i64mem:$dst, GR64:$src),
+                "or{q} {$src, $dst|$dst, $src}",
+                [(store (or (load addr:$dst), GR64:$src), addr:$dst)]>;
+def OR64mi32 : RIi32<0x81, MRM1m, (ops i64mem:$dst, i64i32imm:$src),
+                     "or{q} {$src, $dst|$dst, $src}",
+              [(store (or (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst)]>;
+def OR64mi8  : RIi8<0x83, MRM1m, (ops i64mem:$dst, i64i8imm:$src),
+                    "or{q} {$src, $dst|$dst, $src}",
+                  [(store (or (load addr:$dst), i64immSExt8:$src), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def XOR64rr  : RI<0x31, MRMDestReg,  (ops GR64:$dst, GR64:$src1, GR64:$src2), 
+                  "xor{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (xor GR64:$src1, GR64:$src2))]>;
+def XOR64rm  : RI<0x33, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2), 
+                  "xor{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (xor GR64:$src1, (load addr:$src2)))]>;
+def XOR64ri32 : RIi32<0x81, MRM6r, 
+                      (ops GR64:$dst, GR64:$src1, i64i32imm:$src2), 
+                      "xor{q} {$src2, $dst|$dst, $src2}",
+                      [(set GR64:$dst, (xor GR64:$src1, i64immSExt32:$src2))]>;
+def XOR64ri8 : RIi8<0x83, MRM6r,  (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                    "xor{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (xor GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def XOR64mr  : RI<0x31, MRMDestMem, (ops i64mem:$dst, GR64:$src),
+                  "xor{q} {$src, $dst|$dst, $src}",
+                  [(store (xor (load addr:$dst), GR64:$src), addr:$dst)]>;
+def XOR64mi32 : RIi32<0x81, MRM6m, (ops i64mem:$dst, i64i32imm:$src),
+                      "xor{q} {$src, $dst|$dst, $src}",
+             [(store (xor (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst)]>;
+def XOR64mi8 : RIi8<0x83, MRM6m, (ops i64mem:$dst, i64i8imm :$src),
+                    "xor{q} {$src, $dst|$dst, $src}",
+                 [(store (xor (load addr:$dst), i64immSExt8:$src), addr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+//  Comparison Instructions...
+//
+
+// Integer comparison
+let isCommutable = 1 in
+def TEST64rr : RI<0x85, MRMDestReg, (ops GR64:$src1, GR64:$src2),
+                  "test{q} {$src2, $src1|$src1, $src2}",
+                  [(X86cmp (and GR64:$src1, GR64:$src2), 0)]>;
+def TEST64rm : RI<0x85, MRMSrcMem, (ops GR64:$src1, i64mem:$src2),
+                  "test{q} {$src2, $src1|$src1, $src2}",
+                  [(X86cmp (and GR64:$src1, (loadi64 addr:$src2)), 0)]>;
+def TEST64ri32 : RIi32<0xF7, MRM0r, (ops GR64:$src1, i64i32imm:$src2),
+                       "test{q} {$src2, $src1|$src1, $src2}",
+                       [(X86cmp (and GR64:$src1, i64immSExt32:$src2), 0)]>;
+def TEST64mi32 : RIi32<0xF7, MRM0m, (ops i64mem:$src1, i64i32imm:$src2),
+                       "test{q} {$src2, $src1|$src1, $src2}",
+                  [(X86cmp (and (loadi64 addr:$src1), i64immSExt32:$src2), 0)]>;
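The TEST patterns above exist so that an AND whose only use is a compare against zero is selected as a single test instruction. A minimal C++ illustration of the construct they cover (not taken from the patch; the function name is invented):

    #include <cstdint>

    // An AND whose only consumer is a comparison against zero: this is the
    // shape the TEST64* patterns match, selecting one "test" instead of
    // "and" followed by "cmp".
    bool anyBitsSet(uint64_t x, uint64_t mask) {
      return (x & mask) != 0;
    }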
+
+def CMP64rr : RI<0x39, MRMDestReg, (ops GR64:$src1, GR64:$src2),
+                 "cmp{q} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp GR64:$src1, GR64:$src2)]>;
+def CMP64mr : RI<0x39, MRMDestMem, (ops i64mem:$src1, GR64:$src2),
+                 "cmp{q} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp (loadi64 addr:$src1), GR64:$src2)]>;
+def CMP64rm : RI<0x3B, MRMSrcMem, (ops GR64:$src1, i64mem:$src2),
+                 "cmp{q} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp GR64:$src1, (loadi64 addr:$src2))]>;
+def CMP64ri32 : RIi32<0x81, MRM7r, (ops GR64:$src1, i64i32imm:$src2),
+                      "cmp{q} {$src2, $src1|$src1, $src2}",
+                      [(X86cmp GR64:$src1, i64immSExt32:$src2)]>;
+def CMP64mi32 : RIi32<0x81, MRM7m, (ops i64mem:$src1, i64i32imm:$src2),
+                      "cmp{q} {$src2, $src1|$src1, $src2}",
+                      [(X86cmp (loadi64 addr:$src1), i64immSExt32:$src2)]>;
+def CMP64mi8 : RIi8<0x83, MRM7m, (ops i64mem:$src1, i64i8imm:$src2),
+                    "cmp{q} {$src2, $src1|$src1, $src2}",
+                    [(X86cmp (loadi64 addr:$src1), i64immSExt8:$src2)]>;
+def CMP64ri8 : RIi8<0x83, MRM7r, (ops GR64:$src1, i64i8imm:$src2),
+                    "cmp{q} {$src2, $src1|$src1, $src2}",
+                    [(X86cmp GR64:$src1, i64immSExt8:$src2)]>;
+
+// Conditional moves
+let isTwoAddress = 1 in {
+def CMOVB64rr : RI<0x42, MRMSrcReg,       // if <u, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovb {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                     X86_COND_B))]>, TB;
+def CMOVB64rm : RI<0x42, MRMSrcMem,       // if <u, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovb {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                     X86_COND_B))]>, TB;
+def CMOVAE64rr: RI<0x43, MRMSrcReg,       // if >=u, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovae {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                     X86_COND_AE))]>, TB;
+def CMOVAE64rm: RI<0x43, MRMSrcMem,       // if >=u, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovae {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                     X86_COND_AE))]>, TB;
+def CMOVE64rr : RI<0x44, MRMSrcReg,       // if ==, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmove {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                     X86_COND_E))]>, TB;
+def CMOVE64rm : RI<0x44, MRMSrcMem,       // if ==, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmove {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                     X86_COND_E))]>, TB;
+def CMOVNE64rr: RI<0x45, MRMSrcReg,       // if !=, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovne {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_NE))]>, TB;
+def CMOVNE64rm: RI<0x45, MRMSrcMem,       // if !=, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovne {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_NE))]>, TB;
+def CMOVBE64rr: RI<0x46, MRMSrcReg,       // if <=u, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovbe {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_BE))]>, TB;
+def CMOVBE64rm: RI<0x46, MRMSrcMem,       // if <=u, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovbe {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_BE))]>, TB;
+def CMOVA64rr : RI<0x47, MRMSrcReg,       // if >u, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmova {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_A))]>, TB;
+def CMOVA64rm : RI<0x47, MRMSrcMem,       // if >u, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmova {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_A))]>, TB;
+def CMOVL64rr : RI<0x4C, MRMSrcReg,       // if <s, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovl {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_L))]>, TB;
+def CMOVL64rm : RI<0x4C, MRMSrcMem,       // if <s, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovl {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_L))]>, TB;
+def CMOVGE64rr: RI<0x4D, MRMSrcReg,       // if >=s, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovge {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_GE))]>, TB;
+def CMOVGE64rm: RI<0x4D, MRMSrcMem,       // if >=s, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovge {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_GE))]>, TB;
+def CMOVLE64rr: RI<0x4E, MRMSrcReg,       // if <=s, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovle {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_LE))]>, TB;
+def CMOVLE64rm: RI<0x4E, MRMSrcMem,       // if <=s, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovle {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_LE))]>, TB;
+def CMOVG64rr : RI<0x4F, MRMSrcReg,       // if >s, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovg {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_G))]>, TB;
+def CMOVG64rm : RI<0x4F, MRMSrcMem,       // if >s, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovg {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_G))]>, TB;
+def CMOVS64rr : RI<0x48, MRMSrcReg,       // if signed, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovs {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_S))]>, TB;
+def CMOVS64rm : RI<0x48, MRMSrcMem,       // if signed, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovs {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_S))]>, TB;
+def CMOVNS64rr: RI<0x49, MRMSrcReg,       // if !signed, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovns {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_NS))]>, TB;
+def CMOVNS64rm: RI<0x49, MRMSrcMem,       // if !signed, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovns {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_NS))]>, TB;
+def CMOVP64rr : RI<0x4A, MRMSrcReg,       // if parity, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovp {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_P))]>, TB;
+def CMOVP64rm : RI<0x4A, MRMSrcMem,       // if parity, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovp {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_P))]>, TB;
+def CMOVNP64rr : RI<0x4B, MRMSrcReg,       // if !parity, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovnp {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                     X86_COND_NP))]>, TB;
+def CMOVNP64rm : RI<0x4B, MRMSrcMem,       // if !parity, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovnp {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                     X86_COND_NP))]>, TB;
+} // isTwoAddress
+
+//===----------------------------------------------------------------------===//
+//  Conversion Instructions...
+//
+
+// f64 -> signed i64
+def Int_CVTSD2SI64rr: RSDI<0x2D, MRMSrcReg, (ops GR64:$dst, VR128:$src),
+                           "cvtsd2si{q} {$src, $dst|$dst, $src}",
+                           []>; // TODO: add intrinsic
+def Int_CVTSD2SI64rm: RSDI<0x2D, MRMSrcMem, (ops GR64:$dst, f128mem:$src),
+                           "cvtsd2si{q} {$src, $dst|$dst, $src}",
+                           []>; // TODO: add intrinsic
+def CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (ops GR64:$dst, FR64:$src),
+                        "cvttsd2si{q} {$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (fp_to_sint FR64:$src))]>;
+def CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (ops GR64:$dst, f64mem:$src),
+                        "cvttsd2si{q} {$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
+def Int_CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (ops GR64:$dst, VR128:$src),
+                            "cvttsd2si{q} {$src, $dst|$dst, $src}",
+                            []>; // TODO: add intrinsic
+def Int_CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (ops GR64:$dst, f128mem:$src),
+                            "cvttsd2si{q} {$src, $dst|$dst, $src}",
+                            []>; // TODO: add intrinsic
+
+// Signed i64 -> f64
+def CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg, (ops FR64:$dst, GR64:$src),
+                       "cvtsi2sd{q} {$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (sint_to_fp GR64:$src))]>;
+def CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem, (ops FR64:$dst, i64mem:$src),
+                       "cvtsi2sd{q} {$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (sint_to_fp (loadi64 addr:$src)))]>;
+let isTwoAddress = 1 in {
+def Int_CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg,
+                           (ops VR128:$dst, VR128:$src1, GR64:$src2),
+                           "cvtsi2sd{q} {$src2, $dst|$dst, $src2}",
+                           []>; // TODO: add intrinsic
+def Int_CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem,
+                           (ops VR128:$dst, VR128:$src1, i64mem:$src2),
+                           "cvtsi2sd{q} {$src2, $dst|$dst, $src2}",
+                           []>; // TODO: add intrinsic
+} // isTwoAddress
+
+// Signed i64 -> f32
+def CVTSI2SS64rr: RSSI<0x2A, MRMSrcReg, (ops FR32:$dst, GR64:$src),
+                       "cvtsi2ss{q} {$src, $dst|$dst, $src}",
+                       [(set FR32:$dst, (sint_to_fp GR64:$src))]>;
+def CVTSI2SS64rm: RSSI<0x2A, MRMSrcMem, (ops FR32:$dst, i64mem:$src),
+                       "cvtsi2ss{q} {$src, $dst|$dst, $src}",
+                       [(set FR32:$dst, (sint_to_fp (loadi64 addr:$src)))]>;
+let isTwoAddress = 1 in {
+def Int_CVTSI2SS64rr: RSSI<0x2A, MRMSrcReg,
+                           (ops VR128:$dst, VR128:$src1, GR64:$src2),
+                           "cvtsi2ss{q} {$src2, $dst|$dst, $src2}",
+                           []>; // TODO: add intrinsic
+def Int_CVTSI2SS64rm: RSSI<0x2A, MRMSrcMem,
+                           (ops VR128:$dst, VR128:$src1, i64mem:$src2),
+                           "cvtsi2ss{q} {$src2, $dst|$dst, $src2}",
+                           []>; // TODO: add intrinsic
+} // isTwoAddress
+
+// f32 -> signed i64
+def Int_CVTSS2SI64rr: RSSI<0x2D, MRMSrcReg, (ops GR64:$dst, VR128:$src),
+                           "cvtss2si{q} {$src, $dst|$dst, $src}",
+                           []>; // TODO: add intrinsic
+def Int_CVTSS2SI64rm: RSSI<0x2D, MRMSrcMem, (ops GR64:$dst, f32mem:$src),
+                           "cvtss2si{q} {$src, $dst|$dst, $src}",
+                           []>; // TODO: add intrinsic
+def CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (ops GR64:$dst, FR32:$src),
+                        "cvttss2si{q} {$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (fp_to_sint FR32:$src))]>;
+def CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (ops GR64:$dst, f32mem:$src),
+                        "cvttss2si{q} {$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
+def Int_CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (ops GR64:$dst, VR128:$src),
+                            "cvttss2si{q} {$src, $dst|$dst, $src}",
+                            []>; // TODO: add intrinsic
+def Int_CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (ops GR64:$dst, f32mem:$src),
+                            "cvttss2si{q} {$src, $dst|$dst, $src}",
+                            []>; // TODO: add intrinsic
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Truncate
+// In 64-bit mode, each 64-bit and 32-bit register has a low 8-bit sub-register.
+def TRUNC_64to8  : I<0x88, MRMDestReg, (ops GR8:$dst, GR64:$src),
+                     "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}",
+                     [(set GR8:$dst, (trunc GR64:$src))]>;
+def TRUNC_32to8  : I<0x88, MRMDestReg, (ops GR8:$dst, GR32:$src),
+                     "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}",
+                     [(set GR8:$dst, (trunc GR32:$src))]>,
+                   Requires<[In64BitMode]>;
+def TRUNC_16to8  : I<0x88, MRMDestReg, (ops GR8:$dst, GR16:$src),
+                     "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}",
+                     [(set GR8:$dst, (trunc GR16:$src))]>,
+                   Requires<[In64BitMode]>;
+
+def TRUNC_64to16 : I<0x89, MRMDestReg, (ops GR16:$dst, GR64:$src),
+                     "mov{w} {${src:subreg16}, $dst|$dst, ${src:subreg16}}",
+                     [(set GR16:$dst, (trunc GR64:$src))]>;
+
+def TRUNC_64to32 : I<0x89, MRMDestReg, (ops GR32:$dst, GR64:$src),
+                     "mov{l} {${src:subreg32}, $dst|$dst, ${src:subreg32}}",
+                     [(set GR32:$dst, (trunc GR64:$src))]>;
+
+// Zero-extension
+// TODO: Remove this after proper i32 -> i64 zext support.
+def PsMOVZX64rr32: I<0x89, MRMDestReg, (ops GR64:$dst, GR32:$src),
+                     "mov{l} {$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+                     [(set GR64:$dst, (zext GR32:$src))]>;
+def PsMOVZX64rm32: I<0x8B, MRMSrcMem, (ops GR64:$dst, i32mem:$src),
+                     "mov{l} {$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+                     [(set GR64:$dst, (zextloadi64i32 addr:$src))]>;
+
+
+// Alias instructions that map movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+// FIXME: AddedComplexity gives MOV64r0 a higher priority than MOV64ri32. Remove
+// when we have a better way to specify isel priority.
+let AddedComplexity = 1 in
+def MOV64r0  : RI<0x31, MRMInitReg,  (ops GR64:$dst), 
+                 "xor{q} $dst, $dst",
+                 [(set GR64:$dst, 0)]>;
+
+// Materialize an i64 constant whose top 32 bits are zero.
+let AddedComplexity = 1 in
+def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (ops GR64:$dst, i64i32imm:$src),
+                        "mov{l} {$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+                        [(set GR64:$dst, i64immZExt32:$src)]>;
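Both MOV64r0 and MOV64ri64i32 lean on the x86-64 rule that writing a 32-bit register clears the upper 32 bits of the containing 64-bit register, so an xorl/movl is enough. A small C++ sketch of that value-level effect, purely illustrative (the function name is invented):

    #include <cstdint>

    // Models the architectural effect of a 32-bit write such as
    // "movl $imm, %eax" in 64-bit mode: the upper half of RAX becomes zero,
    // so a 32-bit move materializes any i64 constant whose top 32 bits are 0.
    uint64_t materializeZExt32(uint32_t imm) {
      return static_cast<uint64_t>(imm);  // upper 32 bits guaranteed zero
    }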
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// ConstantPool, GlobalAddress, ExternalSymbol, and JumpTable
+def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
+          (MOV64ri tconstpool  :$dst)>, Requires<[NotSmallCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
+          (MOV64ri tjumptable  :$dst)>, Requires<[NotSmallCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+          (MOV64ri tglobaladdr :$dst)>, Requires<[NotSmallCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+          (MOV64ri texternalsym:$dst)>, Requires<[NotSmallCode]>;
+
+def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tconstpool:$src)>,
+          Requires<[SmallCode, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tjumptable:$src)>,
+          Requires<[SmallCode, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
+          Requires<[SmallCode, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, texternalsym:$src)>,
+          Requires<[SmallCode, IsStatic]>;
+
+// Calls
+// Direct PC relative function call for small code model. 32-bit displacement
+// sign extended to 64-bit.
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+          (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+          (CALL64pcrel32 texternalsym:$dst)>;
+
+def : Pat<(X86tailcall (i64 tglobaladdr:$dst)),
+          (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86tailcall (i64 texternalsym:$dst)),
+          (CALL64pcrel32 texternalsym:$dst)>;
+
+def : Pat<(X86tailcall GR64:$dst),
+          (CALL64r GR64:$dst)>;
+
+// {s|z}extload bool -> {s|z}extload byte
+def : Pat<(sextloadi64i1 addr:$src), (MOVSX64rm8 addr:$src)>;
+def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+
+// extload
+def : Pat<(extloadi64i1 addr:$src),  (MOVZX64rm8  addr:$src)>;
+def : Pat<(extloadi64i8 addr:$src),  (MOVZX64rm8  addr:$src)>;
+def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>;
+def : Pat<(extloadi64i32 addr:$src), (PsMOVZX64rm32 addr:$src)>;
+
+// anyext -> zext
+def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8  GR8 :$src)>;
+def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16 GR16:$src)>;
+def : Pat<(i64 (anyext GR32:$src)), (PsMOVZX64rr32 GR32:$src)>;
+def : Pat<(i64 (anyext (loadi8  addr:$src))), (MOVZX64rm8  addr:$src)>;
+def : Pat<(i64 (anyext (loadi16 addr:$src))), (MOVZX64rm16 addr:$src)>;
+def : Pat<(i64 (anyext (loadi32 addr:$src))), (PsMOVZX64rm32 addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// (shl x, 1) ==> (add x, x)
+def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
+
+// (or (x >> c) | (y << (64 - c))) ==> (shrd64 x, y, c)
+def : Pat<(or (srl GR64:$src1, CL:$amt),
+              (shl GR64:$src2, (sub 64, CL:$amt))),
+          (SHRD64rrCL GR64:$src1, GR64:$src2)>;
+
+def : Pat<(store (or (srl (loadi64 addr:$dst), CL:$amt),
+                     (shl GR64:$src2, (sub 64, CL:$amt))), addr:$dst),
+          (SHRD64mrCL addr:$dst, GR64:$src2)>;
+
+// (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+def : Pat<(or (shl GR64:$src1, CL:$amt),
+              (srl GR64:$src2, (sub 64, CL:$amt))),
+          (SHLD64rrCL GR64:$src1, GR64:$src2)>;
+
+def : Pat<(store (or (shl (loadi64 addr:$dst), CL:$amt),
+                     (srl GR64:$src2, (sub 64, CL:$amt))), addr:$dst),
+          (SHLD64mrCL addr:$dst, GR64:$src2)>;
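The SHRD/SHLD patterns recognize the classic double-precision (funnel) shift idiom. The sketch below shows, in plain C++, the expression shape the first pattern matches; it assumes the shift amount stays in the range 1-63, since the TableGen pattern relies on CL semantics rather than C++ shift rules:

    #include <cstdint>

    // Funnel shift right: the result takes its low bits from x shifted down
    // and its top c bits from y.  This is the computation "shrd %cl, y, x"
    // performs in a single instruction.
    uint64_t funnelShiftRight(uint64_t x, uint64_t y, unsigned c) {
      // c is assumed to be in [1, 63]; c == 0 would make (64 - c) an invalid
      // shift amount in C++, while the hardware handles it via CL.
      return (x >> c) | (y << (64 - c));
    }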
+
+// X86 specific add which produces a flag.
+def : Pat<(addc GR64:$src1, GR64:$src2),
+          (ADD64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(addc GR64:$src1, (load addr:$src2)),
+          (ADD64rm GR64:$src1, addr:$src2)>;
+def : Pat<(addc GR64:$src1, i64immSExt32:$src2),
+          (ADD64ri32 GR64:$src1, imm:$src2)>;
+def : Pat<(addc GR64:$src1, i64immSExt8:$src2),
+          (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+
+def : Pat<(subc GR64:$src1, GR64:$src2),
+          (SUB64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(subc GR64:$src1, (load addr:$src2)),
+          (SUB64rm GR64:$src1, addr:$src2)>;
+def : Pat<(subc GR64:$src1, imm:$src2),
+          (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(subc GR64:$src1, i64immSExt8:$src2),
+          (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+
+
+//===----------------------------------------------------------------------===//
+// X86-64 SSE Instructions
+//===----------------------------------------------------------------------===//
+
+// Move instructions...
+
+def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (ops VR128:$dst, GR64:$src),
+                        "mov{d|q} {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2i64 (scalar_to_vector GR64:$src)))]>;
+def MOV64toPQIrm : RPDI<0x6E, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+                        "mov{d|q} {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>;
+
+def MOVPQIto64rr  : RPDI<0x7E, MRMDestReg, (ops GR64:$dst, VR128:$src),
+                         "mov{d|q} {$src, $dst|$dst, $src}",
+                         [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+                                           (iPTR 0)))]>;
+def MOVPQIto64mr  : RPDI<0x7E, MRMDestMem, (ops i64mem:$dst, VR128:$src),
+                         "mov{d|q} {$src, $dst|$dst, $src}",
+                         [(store (i64 (vector_extract (v2i64 VR128:$src),
+                                       (iPTR 0))), addr:$dst)]>;
+
+def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (ops FR64:$dst, GR64:$src),
+                       "mov{d|q} {$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert GR64:$src))]>;
+def MOV64toSDrm : RPDI<0x6E, MRMSrcMem, (ops FR64:$dst, i64mem:$src),
+                       "mov{d|q} {$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>;
+
+def MOVSDto64rr  : RPDI<0x7E, MRMDestReg, (ops GR64:$dst, FR64:$src),
+                        "mov{d|q} {$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (bitconvert FR64:$src))]>;
+def MOVSDto64mr  : RPDI<0x7E, MRMDestMem, (ops i64mem:$dst, FR64:$src),
+                        "mov{d|q} {$src, $dst|$dst, $src}",
+                        [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
diff --git a/lib/Target/X86/X86IntelAsmPrinter.cpp b/lib/Target/X86/X86IntelAsmPrinter.cpp
new file mode 100755
index 0000000..39b65ee
--- /dev/null
+++ b/lib/Target/X86/X86IntelAsmPrinter.cpp
@@ -0,0 +1,533 @@
+//===-- X86IntelAsmPrinter.cpp - Convert X86 LLVM code to Intel assembly --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to Intel format assembly language.
+// This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86IntelAsmPrinter.h"
+#include "X86TargetAsmInfo.h"
+#include "X86.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+std::string X86IntelAsmPrinter::getSectionForFunction(const Function &F) const {
+  // Intel asm always emits functions to _text.
+  return "_text";
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool X86IntelAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  SetupMachineFunction(MF);
+  O << "\n\n";
+
+  // Print out constants referenced by the function
+  EmitConstantPool(MF.getConstantPool());
+
+  // Print out labels for the function.
+  const Function *F = MF.getFunction();
+  unsigned CC = F->getCallingConv();
+
+  // Populate the function information map.  For now we only record
+  // information for stdcall and fastcall functions.
+  if (CC == CallingConv::X86_StdCall || CC == CallingConv::X86_FastCall)
+    FunctionInfoMap[F] = *MF.getInfo<X86MachineFunctionInfo>();
+
+  X86SharedAsmPrinter::decorateName(CurrentFnName, F);
+
+  SwitchToTextSection(getSectionForFunction(*F).c_str(), F);
+
+  switch (F->getLinkage()) {
+  default: assert(0 && "Unsupported linkage type!");
+  case Function::InternalLinkage:
+    EmitAlignment(4);
+    break;    
+  case Function::DLLExportLinkage:
+    DLLExportedFns.insert(CurrentFnName);
+    // FALL THROUGH
+  case Function::ExternalLinkage:
+    O << "\tpublic " << CurrentFnName << "\n";
+    EmitAlignment(4);
+    break;    
+  }
+  
+  O << CurrentFnName << "\tproc near\n";
+  
+  // Print out code for the function.
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    // Print a label for the basic block if there are any predecessors.
+    if (I->pred_begin() != I->pred_end()) {
+      printBasicBlockLabel(I, true);
+      O << '\n';
+    }
+    for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+         II != E; ++II) {
+      // Print the assembly for the instruction.
+      O << "\t";
+      printMachineInstruction(II);
+    }
+  }
+
+  // Print out jump tables referenced by the function.
+  EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+  O << CurrentFnName << "\tendp\n";
+
+  // We didn't modify anything.
+  return false;
+}
+
+void X86IntelAsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op) {
+  unsigned char value = MI->getOperand(Op).getImmedValue();
+  assert(value <= 7 && "Invalid ssecc argument!");
+  switch (value) {
+  case 0: O << "eq"; break;
+  case 1: O << "lt"; break;
+  case 2: O << "le"; break;
+  case 3: O << "unord"; break;
+  case 4: O << "neq"; break;
+  case 5: O << "nlt"; break;
+  case 6: O << "nle"; break;
+  case 7: O << "ord"; break;
+  }
+}
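The switch in printSSECC mirrors the 3-bit predicate field of cmpss/cmpsd. For reference, the same mapping written as a lookup table (an illustration only, not how the printer is implemented):

    // The eight SSE compare predicates, indexed by the immediate value
    // printSSECC switches on (0 = eq ... 7 = ord).
    static const char *const SSECCNames[8] = {
      "eq", "lt", "le", "unord", "neq", "nlt", "nle", "ord"
    };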
+
+void X86IntelAsmPrinter::printOp(const MachineOperand &MO, 
+                                 const char *Modifier) {
+  const MRegisterInfo &RI = *TM.getRegisterInfo();
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register: {      
+    if (MRegisterInfo::isPhysicalRegister(MO.getReg())) {
+      unsigned Reg = MO.getReg();
+      if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+        MVT::ValueType VT = (strcmp(Modifier,"subreg64") == 0) ?
+          MVT::i64 : ((strcmp(Modifier, "subreg32") == 0) ? MVT::i32 :
+                      ((strcmp(Modifier,"subreg16") == 0) ? MVT::i16 :MVT::i8));
+        Reg = getX86SubSuperRegister(Reg, VT);
+      }
+      O << RI.get(Reg).Name;
+    } else
+      O << "reg" << MO.getReg();
+    return;
+  }
+  case MachineOperand::MO_Immediate:
+    O << MO.getImmedValue();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    printBasicBlockLabel(MO.getMachineBasicBlock());
+    return;
+  case MachineOperand::MO_JumpTableIndex: {
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    if (!isMemOp) O << "OFFSET ";
+    O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+      << "_" << MO.getJumpTableIndex();
+    return;
+  }    
+  case MachineOperand::MO_ConstantPoolIndex: {
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    if (!isMemOp) O << "OFFSET ";
+    O << "[" << TAI->getPrivateGlobalPrefix() << "CPI"
+      << getFunctionNumber() << "_" << MO.getConstantPoolIndex();
+    int Offset = MO.getOffset();
+    if (Offset > 0)
+      O << " + " << Offset;
+    else if (Offset < 0)
+      O << Offset;
+    O << "]";
+    return;
+  }
+  case MachineOperand::MO_GlobalAddress: {
+    bool isCallOp = Modifier && !strcmp(Modifier, "call");
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    GlobalValue *GV = MO.getGlobal();    
+    std::string Name = Mang->getValueName(GV);
+
+    X86SharedAsmPrinter::decorateName(Name, GV);
+
+    if (!isMemOp && !isCallOp) O << "OFFSET ";
+    if (GV->hasDLLImportLinkage()) {
+      // FIXME: This should be fixed with full support of stdcall & fastcall
+      // CC's
+      O << "__imp_";          
+    } 
+    O << Name;
+    int Offset = MO.getOffset();
+    if (Offset > 0)
+      O << " + " << Offset;
+    else if (Offset < 0)
+      O << Offset;
+    return;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    bool isCallOp = Modifier && !strcmp(Modifier, "call");
+    if (!isCallOp) O << "OFFSET ";
+    O << TAI->getGlobalPrefix() << MO.getSymbolName();
+    return;
+  }
+  default:
+    O << "<unknown operand type>"; return;
+  }
+}
+
+void X86IntelAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
+                                           const char *Modifier) {
+  assert(isMem(MI, Op) && "Invalid memory reference!");
+
+  const MachineOperand &BaseReg  = MI->getOperand(Op);
+  int ScaleVal                   = MI->getOperand(Op+1).getImmedValue();
+  const MachineOperand &IndexReg = MI->getOperand(Op+2);
+  const MachineOperand &DispSpec = MI->getOperand(Op+3);
+
+  O << "[";
+  bool NeedPlus = false;
+  if (BaseReg.getReg()) {
+    printOp(BaseReg, Modifier);
+    NeedPlus = true;
+  }
+
+  if (IndexReg.getReg()) {
+    if (NeedPlus) O << " + ";
+    if (ScaleVal != 1)
+      O << ScaleVal << "*";
+    printOp(IndexReg, Modifier);
+    NeedPlus = true;
+  }
+
+  if (DispSpec.isGlobalAddress() || DispSpec.isConstantPoolIndex() ||
+      DispSpec.isJumpTableIndex()) {
+    if (NeedPlus)
+      O << " + ";
+    printOp(DispSpec, "mem");
+  } else {
+    int DispVal = DispSpec.getImmedValue();
+    if (DispVal || (!BaseReg.getReg() && !IndexReg.getReg())) {
+      if (NeedPlus) {
+        if (DispVal > 0)
+          O << " + ";
+        else {
+          O << " - ";
+          DispVal = -DispVal;
+        }
+      }
+      O << DispVal;
+    }
+  }
+  O << "]";
+}
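printMemReference builds Intel-syntax addresses of the form [base + scale*index + disp]. The following standalone sketch applies the same formatting rules to plain strings and integers instead of MachineOperands; every name in it is invented for illustration:

    #include <cstdlib>
    #include <sstream>
    #include <string>

    // Formats an Intel-style memory operand the way printMemReference does:
    // pieces are joined with " + ", a non-unit scale prints as "scale*index",
    // and a negative displacement is folded into " - ".
    std::string formatIntelMem(const std::string &base, int scale,
                               const std::string &index, int disp) {
      std::ostringstream os;
      os << "[";
      bool needPlus = false;
      if (!base.empty()) { os << base; needPlus = true; }
      if (!index.empty()) {
        if (needPlus) os << " + ";
        if (scale != 1) os << scale << "*";
        os << index;
        needPlus = true;
      }
      if (disp || (base.empty() && index.empty())) {
        if (needPlus) os << (disp >= 0 ? " + " : " - ");
        os << (needPlus ? std::abs(disp) : disp);
      }
      os << "]";
      return os.str();  // e.g. formatIntelMem("esp", 4, "ecx", -8) -> "[esp + 4*ecx - 8]"
    }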
+
+void X86IntelAsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op) {
+  O << "\"L" << getFunctionNumber() << "$pb\"\n";
+  O << "\"L" << getFunctionNumber() << "$pb\":";
+}
+
+bool X86IntelAsmPrinter::printAsmMRegister(const MachineOperand &MO,
+                                           const char Mode) {
+  const MRegisterInfo &RI = *TM.getRegisterInfo();
+  unsigned Reg = MO.getReg();
+  switch (Mode) {
+  default: return true;  // Unknown mode.
+  case 'b': // Print QImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i8);
+    break;
+  case 'h': // Print QImode high register
+    Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+    break;
+  case 'w': // Print HImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i16);
+    break;
+  case 'k': // Print SImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i32);
+    break;
+  }
+
+  O << '%' << RI.get(Reg).Name;
+  return false;
+}
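The 'b'/'h'/'w'/'k' modes correspond to the GCC-style inline-asm operand modifiers that pick a sub-register for an operand. A hedged usage example, assuming a GCC or Clang compiler targeting x86 (the function is hypothetical and not part of the patch):

    // Swaps the low byte and the high byte (bits 8-15) of v.  %b0 and %h0 are
    // the same operand printed through the 'b' and 'h' modes handled above;
    // the "Q" constraint asks for a register that has a high-byte name.
    unsigned swapLowBytes(unsigned v) {
      __asm__("xchgb %b0, %h0" : "+Q"(v));
      return v;
    }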
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86IntelAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                         unsigned AsmVariant, 
+                                         const char *ExtraCode) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+    
+    switch (ExtraCode[0]) {
+    default: return true;  // Unknown modifier.
+    case 'b': // Print QImode register
+    case 'h': // Print QImode high register
+    case 'w': // Print HImode register
+    case 'k': // Print SImode register
+      return printAsmMRegister(MI->getOperand(OpNo), ExtraCode[0]);
+    }
+  }
+  
+  printOperand(MI, OpNo);
+  return false;
+}
+
+bool X86IntelAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                               unsigned OpNo,
+                                               unsigned AsmVariant, 
+                                               const char *ExtraCode) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+  printMemReference(MI, OpNo);
+  return false;
+}
+
+/// printMachineInstruction - Print out a single X86 LLVM instruction
+/// MI in Intel syntax to the current output stream.
+///
+void X86IntelAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+  ++EmittedInsts;
+
+  // See if a truncate instruction can be turned into a nop.
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::TRUNC_64to32:
+  case X86::TRUNC_64to16:
+  case X86::TRUNC_32to16:
+  case X86::TRUNC_32to8:
+  case X86::TRUNC_16to8:
+  case X86::TRUNC_32_to8:
+  case X86::TRUNC_16_to8: {
+    const MachineOperand &MO0 = MI->getOperand(0);
+    const MachineOperand &MO1 = MI->getOperand(1);
+    unsigned Reg0 = MO0.getReg();
+    unsigned Reg1 = MO1.getReg();
+    unsigned Opc = MI->getOpcode();
+    if (Opc == X86::TRUNC_64to32)
+      Reg1 = getX86SubSuperRegister(Reg1, MVT::i32);
+    else if (Opc == X86::TRUNC_32to16 || Opc == X86::TRUNC_64to16)
+      Reg1 = getX86SubSuperRegister(Reg1, MVT::i16);
+    else
+      Reg1 = getX86SubSuperRegister(Reg1, MVT::i8);
+    O << TAI->getCommentString() << " TRUNCATE ";
+    if (Reg0 != Reg1)
+      O << "\n\t";
+    break;
+  }
+  case X86::PsMOVZX64rr32:
+    O << TAI->getCommentString() << " ZERO-EXTEND " << "\n\t";
+    break;
+  }
+
+  // Call the autogenerated instruction printer routines.
+  printInstruction(MI);
+}
+
+bool X86IntelAsmPrinter::doInitialization(Module &M) {
+  X86SharedAsmPrinter::doInitialization(M);
+  
+  Mang->markCharUnacceptable('.');
+
+  O << "\t.686\n\t.model flat\n\n";
+
+  // Emit declarations for external functions.
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    if (I->isDeclaration()) {
+      std::string Name = Mang->getValueName(I);
+      X86SharedAsmPrinter::decorateName(Name, I);
+
+      O << "\textern " ;
+      if (I->hasDLLImportLinkage()) {
+        O << "__imp_";
+      }      
+      O << Name << ":near\n";
+    }
+  
+  // Emit declarations for external globals.  Note that VC++ always declares
+  // external globals to have type byte, and if that's good enough for VC++...
+  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+    if (I->isDeclaration()) {
+      std::string Name = Mang->getValueName(I);
+
+      O << "\textern " ;
+      if (I->hasDLLImportLinkage()) {
+        O << "__imp_";
+      }      
+      O << Name << ":byte\n";
+    }
+  }
+
+  return false;
+}
+
+bool X86IntelAsmPrinter::doFinalization(Module &M) {
+  const TargetData *TD = TM.getTargetData();
+
+  // Print out module-level global variables here.
+  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+    if (I->isDeclaration()) continue;   // External globals require no code.
+    
+    // Check to see if this is a special global used by LLVM, if so, emit it.
+    if (EmitSpecialLLVMGlobal(I))
+      continue;
+    
+    std::string name = Mang->getValueName(I);
+    Constant *C = I->getInitializer();
+    unsigned Align = TD->getPreferredAlignmentLog(I);
+    bool bCustomSegment = false;
+
+    switch (I->getLinkage()) {
+    case GlobalValue::LinkOnceLinkage:
+    case GlobalValue::WeakLinkage:
+      SwitchToDataSection("");
+      O << name << "?\tsegment common 'COMMON'\n";
+      bCustomSegment = true;
+      // FIXME: the default alignment is 16 bytes, but 1, 2, 4, and 256
+      // are also available.
+      break;
+    case GlobalValue::AppendingLinkage:
+      SwitchToDataSection("");
+      O << name << "?\tsegment public 'DATA'\n";
+      bCustomSegment = true;
+      // FIXME: the default alignment is 16 bytes, but 1, 2, 4, and 256
+      // are also available.
+      break;
+    case GlobalValue::DLLExportLinkage:
+      DLLExportedGVs.insert(name);
+      // FALL THROUGH
+    case GlobalValue::ExternalLinkage:
+      O << "\tpublic " << name << "\n";
+      // FALL THROUGH
+    case GlobalValue::InternalLinkage:
+      SwitchToDataSection(TAI->getDataSection(), I);
+      break;
+    default:
+      assert(0 && "Unknown linkage type!");
+    }
+
+    if (!bCustomSegment)
+      EmitAlignment(Align, I);
+
+    O << name << ":\t\t\t\t" << TAI->getCommentString()
+      << " " << I->getName() << '\n';
+
+    EmitGlobalConstant(C);
+
+    if (bCustomSegment)
+      O << name << "?\tends\n";
+  }
+
+  // Output linker support code for dllexported globals.
+  if ((DLLExportedGVs.begin() != DLLExportedGVs.end()) ||
+      (DLLExportedFns.begin() != DLLExportedFns.end())) {
+    SwitchToDataSection("");
+    O << "; WARNING: The following code is valid only with MASM v8.x and (possible) higher\n"
+      << "; This version of MASM is usually shipped with Microsoft Visual Studio 2005\n"
+      << "; or (possible) further versions. Unfortunately, there is no way to support\n"
+      << "; dllexported symbols in the earlier versions of MASM in fully automatic way\n\n";
+    O << "_drectve\t segment info alias('.drectve')\n";
+  }
+
+  for (std::set<std::string>::iterator i = DLLExportedGVs.begin(),
+         e = DLLExportedGVs.end();
+         i != e; ++i) {
+    O << "\t db ' /EXPORT:" << *i << ",data'\n";
+  }    
+
+  for (std::set<std::string>::iterator i = DLLExportedFns.begin(),
+         e = DLLExportedFns.end();
+         i != e; ++i) {
+    O << "\t db ' /EXPORT:" << *i << "'\n";
+  }    
+
+  if ((DLLExportedGVs.begin() != DLLExportedGVs.end()) ||
+      (DLLExportedFns.begin() != DLLExportedFns.end())) {
+    O << "_drectve\t ends\n";    
+  }
+  
+  // Bypass X86SharedAsmPrinter::doFinalization().
+  AsmPrinter::doFinalization(M);
+  SwitchToDataSection("");
+  O << "\tend\n";
+  return false; // success
+}
+
+void X86IntelAsmPrinter::EmitString(const ConstantArray *CVA) const {
+  unsigned NumElts = CVA->getNumOperands();
+  if (NumElts) {
+    // ML does not have escape sequences except '' for '.  It also has a maximum
+    // string length of 255.
+    unsigned len = 0;
+    bool inString = false;
+    for (unsigned i = 0; i < NumElts; i++) {
+      int n = cast<ConstantInt>(CVA->getOperand(i))->getZExtValue() & 255;
+      if (len == 0)
+        O << "\tdb ";
+
+      if (n >= 32 && n <= 127) {
+        if (!inString) {
+          if (len > 0) {
+            O << ",'";
+            len += 2;
+          } else {
+            O << "'";
+            len++;
+          }
+          inString = true;
+        }
+        if (n == '\'') {
+          O << "'";
+          len++;
+        }
+        O << char(n);
+      } else {
+        if (inString) {
+          O << "'";
+          len++;
+          inString = false;
+        }
+        if (len > 0) {
+          O << ",";
+          len++;
+        }
+        O << n;
+        len += 1 + (n > 9) + (n > 99);
+      }
+
+      if (len > 60) {
+        if (inString) {
+          O << "'";
+          inString = false;
+        }
+        O << "\n";
+        len = 0;
+      }
+    }
+
+    if (len > 0) {
+      if (inString)
+        O << "'";
+      O << "\n";
+    }
+  }
+}
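EmitString turns a constant byte array into MASM db directives: printable bytes go inside single quotes with ' doubled, everything else is written as a decimal number, and lines are broken well before ML's 255-character limit. A rough standalone sketch of the same quoting rules for a std::string, shown only to illustrate the output format:

    #include <iostream>
    #include <string>

    // Prints one "db" line per call using ML quoting: '' escapes a quote and
    // non-printable bytes appear as decimal numbers, e.g. emitDB("it's\n")
    // prints:  db 'it''s',10
    void emitDB(const std::string &s) {
      std::cout << "\tdb ";
      bool inString = false, first = true;
      for (unsigned char c : s) {
        bool printable = c >= 32 && c <= 127;
        if (printable) {
          if (!inString) {
            if (!first) std::cout << ",";
            std::cout << "'";
            inString = true;
          }
          if (c == '\'') std::cout << "'";  // double the quote, as ML requires
          std::cout << c;
        } else {
          if (inString) { std::cout << "'"; inString = false; }
          if (!first) std::cout << ",";
          std::cout << unsigned(c);
        }
        first = false;
      }
      if (inString) std::cout << "'";
      std::cout << "\n";
    }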
+
+// Include the auto-generated portion of the assembly writer.
+#include "X86GenAsmWriter1.inc"
diff --git a/lib/Target/X86/X86IntelAsmPrinter.h b/lib/Target/X86/X86IntelAsmPrinter.h
new file mode 100755
index 0000000..9ad11ff
--- /dev/null
+++ b/lib/Target/X86/X86IntelAsmPrinter.h
@@ -0,0 +1,112 @@
+//===-- X86IntelAsmPrinter.h - Convert X86 LLVM code to Intel assembly ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Intel assembly code printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INTELASMPRINTER_H
+#define X86INTELASMPRINTER_H
+
+#include "X86AsmPrinter.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Target/MRegisterInfo.h"
+
+namespace llvm {
+
+struct X86IntelAsmPrinter : public X86SharedAsmPrinter {
+  X86IntelAsmPrinter(std::ostream &O, X86TargetMachine &TM,
+                     const TargetAsmInfo *T)
+      : X86SharedAsmPrinter(O, TM, T) {
+  }
+
+  virtual const char *getPassName() const {
+    return "X86 Intel-Style Assembly Printer";
+  }
+
+  /// printInstruction - This method is automatically generated by tablegen
+  /// from the instruction set description.  This method returns true if the
+  /// machine instruction was sufficiently described to print it, otherwise it
+  /// returns false.
+  bool printInstruction(const MachineInstr *MI);
+
+  // This method is used by the tablegen'erated instruction printer.
+  void printOperand(const MachineInstr *MI, unsigned OpNo,
+                    const char *Modifier = 0) {
+    const MachineOperand &MO = MI->getOperand(OpNo);
+    if (MO.isRegister()) {
+      assert(MRegisterInfo::isPhysicalRegister(MO.getReg()) && "Not physreg??");
+      O << TM.getRegisterInfo()->get(MO.getReg()).Name;
+    } else {
+      printOp(MO, Modifier);
+    }
+  }
+
+  void printi8mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "BYTE PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printi16mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "WORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printi32mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "DWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printi64mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "QWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printi128mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "XMMWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printf32mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "DWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printf64mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "QWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printf128mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "XMMWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printlea64_32mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "QWORD PTR ";
+    printMemReference(MI, OpNo, "subreg64");
+  }
+
+  bool printAsmMRegister(const MachineOperand &MO, const char Mode);
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode);
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode);
+  void printMachineInstruction(const MachineInstr *MI);
+  void printOp(const MachineOperand &MO, const char *Modifier = 0);
+  void printSSECC(const MachineInstr *MI, unsigned Op);
+  void printMemReference(const MachineInstr *MI, unsigned Op,
+                         const char *Modifier=NULL);
+  void printPICLabel(const MachineInstr *MI, unsigned Op);
+  bool runOnMachineFunction(MachineFunction &F);
+  bool doInitialization(Module &M);
+  bool doFinalization(Module &M);
+  
+  /// getSectionForFunction - Return the section that we should emit the
+  /// specified function body into.
+  virtual std::string getSectionForFunction(const Function &F) const;
+
+  virtual void EmitString(const ConstantArray *CVA) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
new file mode 100644
index 0000000..b9e5d5b
--- /dev/null
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -0,0 +1,372 @@
+//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the X86 target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "X86JITInfo.h"
+#include "X86Relocations.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/Config/alloca.h"
+#include <cstdlib>
+using namespace llvm;
+
+#ifdef _MSC_VER
+  extern "C" void *_AddressOfReturnAddress(void);
+  #pragma intrinsic(_AddressOfReturnAddress)
+#endif
+
+void X86JITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  unsigned char *OldByte = (unsigned char *)Old;
+  *OldByte++ = 0xE9;                // Emit JMP opcode.
+  unsigned *OldWord = (unsigned *)OldByte;
+  unsigned NewAddr = (intptr_t)New;
+  unsigned OldAddr = (intptr_t)OldWord;
+  *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code.
+}
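The patch written by replaceMachineCodeForFunction is a 5-byte jmp rel32: opcode 0xE9 followed by a 32-bit displacement measured from the end of the instruction, which is why the code subtracts 4 after stepping past the opcode byte. A minimal sketch of the same displacement arithmetic, with addresses as plain integers:

    #include <cstdint>

    // Computes the rel32 field of a "jmp rel32" placed at patchSite: the CPU
    // adds the displacement to the address of the *next* instruction, i.e.
    // patchSite + 5.
    int32_t jmpRel32(uintptr_t patchSite, uintptr_t target) {
      return static_cast<int32_t>(target - (patchSite + 5));
    }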
+
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host.  This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// Provide a wrapper for X86CompilationCallback2 that saves non-traditional
+// callee saved registers, for the fastcc calling convention.
+extern "C" {
+#if defined(__x86_64__)
+  // No need to save EAX/EDX for X86-64.
+  void X86CompilationCallback(void);
+  asm(
+    ".text\n"
+    ".align 8\n"
+    ".globl " ASMPREFIX "X86CompilationCallback\n"
+  ASMPREFIX "X86CompilationCallback:\n"
+    // Save RBP
+    "pushq   %rbp\n"
+    // Save RSP
+    "movq    %rsp, %rbp\n"
+    // Save all int arg registers
+    "pushq   %rdi\n"
+    "pushq   %rsi\n"
+    "pushq   %rdx\n"
+    "pushq   %rcx\n"
+    "pushq   %r8\n"
+    "pushq   %r9\n"
+    // Align the stack on a 16-byte boundary. RSP might only be 8-byte aligned
+    // if this is called from an indirect stub.
+    "andq    $-16, %rsp\n"
+    // Save all XMM arg registers
+    "subq    $128, %rsp\n"
+    "movaps  %xmm0, (%rsp)\n"
+    "movaps  %xmm1, 16(%rsp)\n"
+    "movaps  %xmm2, 32(%rsp)\n"
+    "movaps  %xmm3, 48(%rsp)\n"
+    "movaps  %xmm4, 64(%rsp)\n"
+    "movaps  %xmm5, 80(%rsp)\n"
+    "movaps  %xmm6, 96(%rsp)\n"
+    "movaps  %xmm7, 112(%rsp)\n"
+    // JIT callee
+    "movq    %rbp, %rdi\n"    // Pass prev frame and return address
+    "movq    8(%rbp), %rsi\n"
+    "call    " ASMPREFIX "X86CompilationCallback2\n"
+    // Restore all XMM arg registers
+    "movaps  112(%rsp), %xmm7\n"
+    "movaps  96(%rsp), %xmm6\n"
+    "movaps  80(%rsp), %xmm5\n"
+    "movaps  64(%rsp), %xmm4\n"
+    "movaps  48(%rsp), %xmm3\n"
+    "movaps  32(%rsp), %xmm2\n"
+    "movaps  16(%rsp), %xmm1\n"
+    "movaps  (%rsp), %xmm0\n"
+    // Restore RSP
+    "movq    %rbp, %rsp\n"
+    // Restore all int arg registers
+    "subq    $48, %rsp\n"
+    "popq    %r9\n"
+    "popq    %r8\n"
+    "popq    %rcx\n"
+    "popq    %rdx\n"
+    "popq    %rsi\n"
+    "popq    %rdi\n"
+    // Restore RBP
+    "popq    %rbp\n"
+    "ret\n");
+#elif defined(__i386__) || defined(i386) || defined(_M_IX86)
+#ifndef _MSC_VER
+  void X86CompilationCallback(void);
+  asm(
+    ".text\n"
+    ".align 8\n"
+    ".globl " ASMPREFIX  "X86CompilationCallback\n"
+  ASMPREFIX "X86CompilationCallback:\n"
+    "pushl   %ebp\n"
+    "movl    %esp, %ebp\n"    // Standard prologue
+    "pushl   %eax\n"
+    "pushl   %edx\n"          // Save EAX/EDX/ECX
+    "pushl   %ecx\n"
+#if defined(__APPLE__)
+    "andl    $-16, %esp\n"    // Align ESP on 16-byte boundary
+#endif
+    "subl    $16, %esp\n"
+    "movl    4(%ebp), %eax\n" // Pass prev frame and return address
+    "movl    %eax, 4(%esp)\n"
+    "movl    %ebp, (%esp)\n"
+    "call    " ASMPREFIX "X86CompilationCallback2\n"
+    "movl    %ebp, %esp\n"    // Restore ESP
+    "subl    $12, %esp\n"
+    "popl    %ecx\n"
+    "popl    %edx\n"
+    "popl    %eax\n"
+    "popl    %ebp\n"
+    "ret\n");
+
+  // Same as X86CompilationCallback but also saves XMM argument registers.
+  void X86CompilationCallback_SSE(void);
+  asm(
+    ".text\n"
+    ".align 8\n"
+    ".globl " ASMPREFIX  "X86CompilationCallback_SSE\n"
+  ASMPREFIX "X86CompilationCallback_SSE:\n"
+    "pushl   %ebp\n"
+    "movl    %esp, %ebp\n"    // Standard prologue
+    "pushl   %eax\n"
+    "pushl   %edx\n"          // Save EAX/EDX/ECX
+    "pushl   %ecx\n"
+    "andl    $-16, %esp\n"    // Align ESP on 16-byte boundary
+    // Save all XMM arg registers
+    "subl    $64, %esp\n"
+    "movaps  %xmm0, (%esp)\n"
+    "movaps  %xmm1, 16(%esp)\n"
+    "movaps  %xmm2, 32(%esp)\n"
+    "movaps  %xmm3, 48(%esp)\n"
+    "subl    $16, %esp\n"
+    "movl    4(%ebp), %eax\n" // Pass prev frame and return address
+    "movl    %eax, 4(%esp)\n"
+    "movl    %ebp, (%esp)\n"
+    "call    " ASMPREFIX "X86CompilationCallback2\n"
+    "addl    $16, %esp\n"
+    "movaps  48(%esp), %xmm3\n"
+    "movaps  32(%esp), %xmm2\n"
+    "movaps  16(%esp), %xmm1\n"
+    "movaps  (%esp), %xmm0\n"
+    "movl    %ebp, %esp\n"    // Restore ESP
+    "subl    $12, %esp\n"
+    "popl    %ecx\n"
+    "popl    %edx\n"
+    "popl    %eax\n"
+    "popl    %ebp\n"
+    "ret\n");
+#else
+  void X86CompilationCallback2(void);
+
+  _declspec(naked) void X86CompilationCallback(void) {
+    __asm {
+      push  eax
+      push  edx
+      push  ecx
+      call  X86CompilationCallback2
+      pop   ecx
+      pop   edx
+      pop   eax
+      ret
+    }
+  }
+#endif // _MSC_VER
+
+#else // Not an i386 host
+  void X86CompilationCallback() {
+    assert(0 && "Cannot call X86CompilationCallback() on a non-x86 arch!\n");
+    abort();
+  }
+#endif
+}
+
+/// X86CompilationCallback - This is the target-specific function invoked by the
+/// function stub when we did not know the real target of a call.  This function
+/// must locate the start of the stub or call site and pass it into the JIT
+/// compiler function.
+#ifdef _MSC_VER
+extern "C" void X86CompilationCallback2() {
+  assert(sizeof(size_t) == 4); // FIXME: handle Win64
+  intptr_t *RetAddrLoc = (intptr_t *)_AddressOfReturnAddress();
+  RetAddrLoc += 4;  // skip over ret addr, edx, eax, ecx
+  intptr_t RetAddr = *RetAddrLoc;
+#else
+extern "C" void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) {
+  intptr_t *RetAddrLoc = &StackPtr[1];
+#endif
+  assert(*RetAddrLoc == RetAddr &&
+         "Could not find return address on the stack!");
+
+  // It's a stub if there is an interrupt marker after the call.
+  bool isStub = ((unsigned char*)RetAddr)[0] == 0xCD;
+
+  // The call instruction should have pushed the return address onto the stack...
+#ifdef __x86_64__
+  RetAddr--;     // Backtrack to the reference itself...
+#else
+  RetAddr -= 4;  // Backtrack to the reference itself...
+#endif
+
+#if 0
+  DOUT << "In callback! Addr=" << (void*)RetAddr
+       << " ESP=" << (void*)StackPtr
+       << ": Resolving call to function: "
+       << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n";
+#endif
+
+  // Sanity check to make sure this really is a call instruction.
+#ifdef __x86_64__
+  assert(((unsigned char*)RetAddr)[-2] == 0x41 &&"Not a call instr!");
+  assert(((unsigned char*)RetAddr)[-1] == 0xFF &&"Not a call instr!");
+#else
+  assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!");
+#endif
+
+  intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr);
+
+  // Rewrite the call target... so that we don't end up here every time we
+  // execute the call.
+#ifdef __x86_64__
+  *(intptr_t *)(RetAddr - 0xa) = NewVal;
+#else
+  *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4);
+#endif
+
+  if (isStub) {
+    // If this is a stub, rewrite the call into an unconditional branch
+    // instruction so that two return addresses are not pushed onto the stack
+    // when the requested function finally gets called.  This also makes the
+    // 0xCD byte (interrupt) dead, so the marker doesn't affect anything.
+#ifdef __x86_64__
+    ((unsigned char*)RetAddr)[0] = (2 | (4 << 3) | (3 << 6));
+#else
+    ((unsigned char*)RetAddr)[-1] = 0xE9;
+#endif
+  }
+
+  // Change the return address to reexecute the call instruction...
+#ifdef __x86_64__
+  *RetAddrLoc -= 0xd;
+#else
+  *RetAddrLoc -= 5;
+#endif
+}
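After compiling the real target, the callback back-patches the call that reached the stub and rewinds the saved return address so the now-direct call executes again. A hedged sketch of the 32-bit case only (the 64-bit path patches a movabs/jmp pair instead); the helper name is invented:

    #include <cstdint>

    // 32-bit x86 only: rewrite the rel32 of the "call" that brought us here
    // so it targets newTarget, and report how far to rewind the return
    // address so the patched call is re-executed (5 = length of "call rel32").
    inline unsigned repatchCall32(intptr_t retAddr, intptr_t newTarget) {
      int32_t *relField = reinterpret_cast<int32_t *>(retAddr - 4);
      *relField = static_cast<int32_t>(newTarget - retAddr);
      return 5;
    }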
+
+TargetJITInfo::LazyResolverFn
+X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
+  JITCompilerFunction = F;
+
+#if (defined(__i386__) || defined(i386) || defined(_M_IX86)) && \
+  !defined(_MSC_VER) && !defined(__x86_64__)
+  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+  union {
+    unsigned u[3];
+    char     c[12];
+  } text;
+
+  if (!X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1)) {
+    // FIXME: support for AMD family of processors.
+    if (memcmp(text.c, "GenuineIntel", 12) == 0) {
+      X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
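+      // Bit 25 of CPUID.1 EDX is the SSE feature flag.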
+      if ((EDX >> 25) & 0x1)
+        return X86CompilationCallback_SSE;
+    }
+  }
+#endif
+
+  return X86CompilationCallback;
+}
+
+void *X86JITInfo::emitFunctionStub(void *Fn, MachineCodeEmitter &MCE) {
+  // Note, we cast to intptr_t here to silence a -pedantic warning that 
+  // complains about casting a function pointer to a normal pointer.
+#if (defined(__i386__) || defined(i386) || defined(_M_IX86)) && \
+  !defined(_MSC_VER) && !defined(__x86_64__)
+  bool NotCC = (Fn != (void*)(intptr_t)X86CompilationCallback &&
+                Fn != (void*)(intptr_t)X86CompilationCallback_SSE);
+#else
+  bool NotCC = Fn != (void*)(intptr_t)X86CompilationCallback;
+#endif
+  if (NotCC) {
+#ifdef __x86_64__
+    MCE.startFunctionStub(13, 4);
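+    // 13-byte stub: movabsq $Fn, %r10 (10 bytes) followed by jmpq *%r10 (3 bytes).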
+    MCE.emitByte(0x49);          // REX prefix
+    MCE.emitByte(0xB8+2);        // movabsq r10
+    MCE.emitWordLE(((unsigned *)&Fn)[0]);
+    MCE.emitWordLE(((unsigned *)&Fn)[1]);
+    MCE.emitByte(0x41);          // REX prefix
+    MCE.emitByte(0xFF);          // jmpq *r10
+    MCE.emitByte(2 | (4 << 3) | (3 << 6));
+#else
+    MCE.startFunctionStub(5, 4);
+    MCE.emitByte(0xE9);
+    MCE.emitWordLE((intptr_t)Fn-MCE.getCurrentPCValue()-4);
+#endif
+    return MCE.finishFunctionStub(0);
+  }
+
+#ifdef __x86_64__
+  MCE.startFunctionStub(14, 4);
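+  // 14-byte stub: movabsq $Fn, %r10 (10 bytes), callq *%r10 (3 bytes), plus the
+  // 0xCD marker emitted below.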
+  MCE.emitByte(0x49);          // REX prefix
+  MCE.emitByte(0xB8+2);        // movabsq r10
+  MCE.emitWordLE(((unsigned *)&Fn)[0]);
+  MCE.emitWordLE(((unsigned *)&Fn)[1]);
+  MCE.emitByte(0x41);          // REX prefix
+  MCE.emitByte(0xFF);          // callq *r10
+  MCE.emitByte(2 | (2 << 3) | (3 << 6));
+#else
+  MCE.startFunctionStub(6, 4);
+  MCE.emitByte(0xE8);   // Call with 32 bit pc-rel destination...
+
+  MCE.emitWordLE((intptr_t)Fn-MCE.getCurrentPCValue()-4);
+#endif
+
+  MCE.emitByte(0xCD);   // Interrupt - Just a marker identifying the stub!
+  return MCE.finishFunctionStub(0);
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void X86JITInfo::relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
+    intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
+    switch ((X86::RelocationType)MR->getRelocationType()) {
+    case X86::reloc_pcrel_word: {
+      // PC relative relocation, add the relocated value to the value already in
+      // memory, after we adjust it for where the PC is.
+      ResultPtr = ResultPtr-(intptr_t)RelocPos-4-MR->getConstantVal();
+      *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+      break;
+    }
+    case X86::reloc_absolute_word:
+      // Absolute relocation, just add the relocated value to the value already
+      // in memory.
+      *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+      break;
+    case X86::reloc_absolute_dword:
+      *((intptr_t*)RelocPos) += ResultPtr;
+      break;
+    }
+  }
+}
diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h
new file mode 100644
index 0000000..a4c731a
--- /dev/null
+++ b/lib/Target/X86/X86JITInfo.h
@@ -0,0 +1,50 @@
+//===- X86JITInfo.h - X86 implementation of the JIT interface  --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86JITINFO_H
+#define X86JITINFO_H
+
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+  class X86TargetMachine;
+
+  class X86JITInfo : public TargetJITInfo {
+    X86TargetMachine &TM;
+  public:
+    X86JITInfo(X86TargetMachine &tm) : TM(tm) {useGOT = 0;}
+
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+    /// code.
+    ///
+    virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+    /// emitFunctionStub - Use the specified MachineCodeEmitter object to emit a
+    /// small native function that simply calls the function at the specified
+    /// address.
+    virtual void *emitFunctionStub(void *Fn, MachineCodeEmitter &MCE);
+
+    /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+    /// relocate - Before the JIT can run a block of code that has been emitted,
+    /// it must rewrite the code to contain the actual addresses of any
+    /// referenced global symbols.
+    virtual void relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase);
+  };
+}
+
+#endif
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 0000000..7a21fb2
--- /dev/null
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,74 @@
+//====- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86MACHINEFUNCTIONINFO_H
+#define X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+enum NameDecorationStyle {
+  None,
+  StdCall,
+  FastCall
+};
+  
+/// X86MachineFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private X86 target-specific information for each MachineFunction.
+class X86MachineFunctionInfo : public MachineFunctionInfo {
+  /// ForceFramePointer - True if the function is required to use a frame
+  /// pointer for reasons other than containing dynamic allocas or having
+  /// FP elimination turned off. For example, the Cygwin main function contains
+  /// stack-pointer re-alignment code, which requires a frame pointer.
+  bool ForceFramePointer;
+
+  /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+  /// stack frame in bytes.
+  unsigned CalleeSavedFrameSize;
+
+  /// BytesToPopOnReturn - Number of bytes the function pops on return.
+  /// Used on Windows for stdcall & fastcall name decoration.
+  unsigned BytesToPopOnReturn;
+
+  /// DecorationStyle - If the function requires additional name decoration,
+  /// this holds the decoration style to use.
+  NameDecorationStyle DecorationStyle;
+  
+public:
+  X86MachineFunctionInfo() : ForceFramePointer(false),
+                             CalleeSavedFrameSize(0),
+                             BytesToPopOnReturn(0),
+                             DecorationStyle(None) {}
+  
+  X86MachineFunctionInfo(MachineFunction &MF) : ForceFramePointer(false),
+                                                CalleeSavedFrameSize(0),
+                                                BytesToPopOnReturn(0),
+                                                DecorationStyle(None) {}
+  
+  bool getForceFramePointer() const { return ForceFramePointer;} 
+  void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+
+  unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+  void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+  unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+  void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}
+
+  NameDecorationStyle getDecorationStyle() const { return DecorationStyle; }
+  void setDecorationStyle(NameDecorationStyle style) { DecorationStyle = style;}
+  
+};
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
new file mode 100644
index 0000000..da65db0
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -0,0 +1,1613 @@
+//===- X86RegisterInfo.cpp - X86 Register Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the MRegisterInfo class.  This
+// file is responsible for the frame pointer elimination optimization on X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86RegisterInfo.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+namespace {
+  cl::opt<bool>
+  NoFusing("disable-spill-fusing",
+           cl::desc("Disable fusing of spill code into instructions"));
+  cl::opt<bool>
+  PrintFailedFusing("print-failed-fuse-candidates",
+                    cl::desc("Print instructions that the allocator wants to"
+                             " fuse, but the X86 backend currently can't"),
+                    cl::Hidden);
+}
+
+X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
+                                 const TargetInstrInfo &tii)
+  : X86GenRegisterInfo(X86::ADJCALLSTACKDOWN, X86::ADJCALLSTACKUP),
+    TM(tm), TII(tii) {
+  // Cache some information.
+  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+  Is64Bit = Subtarget->is64Bit();
+  if (Is64Bit) {
+    SlotSize = 8;
+    StackPtr = X86::RSP;
+    FramePtr = X86::RBP;
+  } else {
+    SlotSize = 4;
+    StackPtr = X86::ESP;
+    FramePtr = X86::EBP;
+  }
+}
+
+bool X86RegisterInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                                MachineBasicBlock::iterator MI,
+                                const std::vector<CalleeSavedInfo> &CSI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  X86FI->setCalleeSavedFrameSize(CSI.size() * SlotSize);
+  unsigned Opc = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+    BuildMI(MBB, MI, TII.get(Opc)).addReg(Reg);
+  }
+  return true;
+}
+
+bool X86RegisterInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                                 MachineBasicBlock::iterator MI,
+                                const std::vector<CalleeSavedInfo> &CSI) const {
+  if (CSI.empty())
+    return false;
+
+  unsigned Opc = Is64Bit ? X86::POP64r : X86::POP32r;
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    BuildMI(MBB, MI, TII.get(Opc), Reg);
+  }
+  return true;
+}
+
+void X86RegisterInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MI,
+                                          unsigned SrcReg, int FrameIdx,
+                                          const TargetRegisterClass *RC) const {
+  unsigned Opc;
+  if (RC == &X86::GR64RegClass) {
+    Opc = X86::MOV64mr;
+  } else if (RC == &X86::GR32RegClass) {
+    Opc = X86::MOV32mr;
+  } else if (RC == &X86::GR16RegClass) {
+    Opc = X86::MOV16mr;
+  } else if (RC == &X86::GR8RegClass) {
+    Opc = X86::MOV8mr;
+  } else if (RC == &X86::GR32_RegClass) {
+    Opc = X86::MOV32_mr;
+  } else if (RC == &X86::GR16_RegClass) {
+    Opc = X86::MOV16_mr;
+  } else if (RC == &X86::RFP64RegClass || RC == &X86::RSTRegClass) {
+    Opc = X86::ST_Fp64m;
+  } else if (RC == &X86::RFP32RegClass) {
+    Opc = X86::ST_Fp32m;
+  } else if (RC == &X86::FR32RegClass) {
+    Opc = X86::MOVSSmr;
+  } else if (RC == &X86::FR64RegClass) {
+    Opc = X86::MOVSDmr;
+  } else if (RC == &X86::VR128RegClass) {
+    Opc = X86::MOVAPSmr;
+  } else if (RC == &X86::VR64RegClass) {
+    Opc = X86::MMX_MOVQ64mr;
+  } else {
+    assert(0 && "Unknown regclass");
+    abort();
+  }
+  addFrameReference(BuildMI(MBB, MI, TII.get(Opc)), FrameIdx)
+    .addReg(SrcReg, false, false, true);
+}
+
+void X86RegisterInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                           unsigned DestReg, int FrameIdx,
+                                           const TargetRegisterClass *RC) const{
+  unsigned Opc;
+  if (RC == &X86::GR64RegClass) {
+    Opc = X86::MOV64rm;
+  } else if (RC == &X86::GR32RegClass) {
+    Opc = X86::MOV32rm;
+  } else if (RC == &X86::GR16RegClass) {
+    Opc = X86::MOV16rm;
+  } else if (RC == &X86::GR8RegClass) {
+    Opc = X86::MOV8rm;
+  } else if (RC == &X86::GR32_RegClass) {
+    Opc = X86::MOV32_rm;
+  } else if (RC == &X86::GR16_RegClass) {
+    Opc = X86::MOV16_rm;
+  } else if (RC == &X86::RFP64RegClass || RC == &X86::RSTRegClass) {
+    Opc = X86::LD_Fp64m;
+  } else if (RC == &X86::RFP32RegClass) {
+    Opc = X86::LD_Fp32m;
+  } else if (RC == &X86::FR32RegClass) {
+    Opc = X86::MOVSSrm;
+  } else if (RC == &X86::FR64RegClass) {
+    Opc = X86::MOVSDrm;
+  } else if (RC == &X86::VR128RegClass) {
+    Opc = X86::MOVAPSrm;
+  } else if (RC == &X86::VR64RegClass) {
+    Opc = X86::MMX_MOVQ64rm;
+  } else {
+    assert(0 && "Unknown regclass");
+    abort();
+  }
+  addFrameReference(BuildMI(MBB, MI, TII.get(Opc), DestReg), FrameIdx);
+}
+
+void X86RegisterInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned DestReg, unsigned SrcReg,
+                                   const TargetRegisterClass *RC) const {
+  unsigned Opc;
+  if (RC == &X86::GR64RegClass) {
+    Opc = X86::MOV64rr;
+  } else if (RC == &X86::GR32RegClass) {
+    Opc = X86::MOV32rr;
+  } else if (RC == &X86::GR16RegClass) {
+    Opc = X86::MOV16rr;
+  } else if (RC == &X86::GR8RegClass) {
+    Opc = X86::MOV8rr;
+  } else if (RC == &X86::GR32_RegClass) {
+    Opc = X86::MOV32_rr;
+  } else if (RC == &X86::GR16_RegClass) {
+    Opc = X86::MOV16_rr;
+  } else if (RC == &X86::RFP32RegClass) {
+    Opc = X86::MOV_Fp3232;
+  } else if (RC == &X86::RFP64RegClass || RC == &X86::RSTRegClass) {
+    Opc = X86::MOV_Fp6464;
+  } else if (RC == &X86::FR32RegClass) {
+    Opc = X86::FsMOVAPSrr;
+  } else if (RC == &X86::FR64RegClass) {
+    Opc = X86::FsMOVAPDrr;
+  } else if (RC == &X86::VR128RegClass) {
+    Opc = X86::MOVAPSrr;
+  } else if (RC == &X86::VR64RegClass) {
+    Opc = X86::MMX_MOVQ64rr;
+  } else {
+    assert(0 && "Unknown regclass");
+    abort();
+  }
+  BuildMI(MBB, MI, TII.get(Opc), DestReg).addReg(SrcReg);
+}
+
+
+void X86RegisterInfo::reMaterialize(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    unsigned DestReg,
+                                    const MachineInstr *Orig) const {
+  MachineInstr *MI = Orig->clone();
+  MI->getOperand(0).setReg(DestReg);
+  MBB.insert(I, MI);
+}
+
+static MachineInstr *FuseTwoAddrInst(unsigned Opcode, unsigned FrameIndex,
+                                     MachineInstr *MI,
+                                     const TargetInstrInfo &TII) {
+  unsigned NumOps = TII.getNumOperands(MI->getOpcode())-2;
+  // Create the base instruction with the memory operand as the first part.
+  MachineInstrBuilder MIB = addFrameReference(BuildMI(TII.get(Opcode)),
+                                              FrameIndex);
+  
+  // Loop over the rest of the ri operands, converting them over.
+  for (unsigned i = 0; i != NumOps; ++i) {
+    MachineOperand &MO = MI->getOperand(i+2);
+    if (MO.isReg())
+      MIB = MIB.addReg(MO.getReg(), false, MO.isImplicit());
+    else if (MO.isImm())
+      MIB = MIB.addImm(MO.getImm());
+    else if (MO.isGlobalAddress())
+      MIB = MIB.addGlobalAddress(MO.getGlobal(), MO.getOffset());
+    else if (MO.isJumpTableIndex())
+      MIB = MIB.addJumpTableIndex(MO.getJumpTableIndex());
+    else if (MO.isExternalSymbol())
+      MIB = MIB.addExternalSymbol(MO.getSymbolName());
+    else
+      assert(0 && "Unknown operand type!");
+  }
+  return MIB;
+}
+
+static MachineInstr *FuseInst(unsigned Opcode, unsigned OpNo,
+                              unsigned FrameIndex, MachineInstr *MI,
+                              const TargetInstrInfo &TII) {
+  MachineInstrBuilder MIB = BuildMI(TII.get(Opcode));
+  
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (i == OpNo) {
+      assert(MO.isReg() && "Expected to fold into reg operand!");
+      MIB = addFrameReference(MIB, FrameIndex);
+    } else if (MO.isReg())
+      MIB = MIB.addReg(MO.getReg(), MO.isDef(), MO.isImplicit());
+    else if (MO.isImm())
+      MIB = MIB.addImm(MO.getImm());
+    else if (MO.isGlobalAddress())
+      MIB = MIB.addGlobalAddress(MO.getGlobal(), MO.getOffset());
+    else if (MO.isJumpTableIndex())
+      MIB = MIB.addJumpTableIndex(MO.getJumpTableIndex());
+    else if (MO.isExternalSymbol())
+      MIB = MIB.addExternalSymbol(MO.getSymbolName());
+    else
+      assert(0 && "Unknown operand for FuseInst!");
+  }
+  return MIB;
+}
+
+static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII,
+                                unsigned Opcode, unsigned FrameIndex,
+                                MachineInstr *MI) {
+  return addFrameReference(BuildMI(TII.get(Opcode)), FrameIndex).addImm(0);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// TableEntry - Maps an original 'from' opcode to its fused 'to' opcode.
+  ///
+  struct TableEntry {
+    unsigned from;                      // Original opcode.
+    unsigned to;                        // New opcode.
+                                        
+    // less-than operators used by the STL binary search.
+    bool operator<(const TableEntry &TE) const { return from < TE.from; }
+    friend bool operator<(const TableEntry &TE, unsigned V) {
+      return TE.from < V;
+    }
+    friend bool operator<(unsigned V, const TableEntry &TE) {
+      return V < TE.from;
+    }
+  };
+}
+
+/// TableIsSorted - Return true if the table is in 'from' opcode order.
+///
+static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) {
+  for (unsigned i = 1; i != NumEntries; ++i)
+    if (!(Table[i-1] < Table[i])) {
+      cerr << "Entries out of order " << Table[i-1].from
+           << " " << Table[i].from << "\n";
+      return false;
+    }
+  return true;
+}
+
+/// TableLookup - Return the table entry matching the specified opcode.
+/// Otherwise return NULL.
+static const TableEntry *TableLookup(const TableEntry *Table, unsigned N,
+                                unsigned Opcode) {
+  const TableEntry *I = std::lower_bound(Table, Table+N, Opcode);
+  if (I != Table+N && I->from == Opcode)
+    return I;
+  return NULL;
+}
+
+#define ARRAY_SIZE(TABLE)  \
+   (sizeof(TABLE)/sizeof(TABLE[0]))
+
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE)                                              \
+  { static bool TABLE##Checked = false;                                   \
+    if (!TABLE##Checked) {                                                \
+       assert(TableIsSorted(TABLE, ARRAY_SIZE(TABLE)) &&                  \
+              "All lookup tables must be sorted for efficient access!");  \
+       TABLE##Checked = true;                                             \
+    }                                                                     \
+  }
+#endif
+
+
+MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
+                                                 unsigned i,
+                                                 int FrameIndex) const {
+  // Check switch flag 
+  if (NoFusing) return NULL;
+
+  // Table (and size) to search
+  const TableEntry *OpcodeTablePtr = NULL;
+  unsigned OpcodeTableSize = 0;
+  bool isTwoAddrFold = false;
+  unsigned NumOps = TII.getNumOperands(MI->getOpcode());
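+  // An instruction is two-address if its operand 1 is tied to (must be the
+  // same register as) operand 0.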
+  bool isTwoAddr = NumOps > 1 &&
+    MI->getInstrDescriptor()->getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+  MachineInstr *NewMI = NULL;
+  // Folding a memory location into the two-address part of a two-address
+  // instruction is different from folding it in other places.  It requires
+  // replacing *both* tied registers with the memory location.
+  if (isTwoAddr && NumOps >= 2 && i < 2 &&
+      MI->getOperand(0).isReg() && 
+      MI->getOperand(1).isReg() &&
+      MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
+    static const TableEntry OpcodeTable[] = {
+      { X86::ADC32ri,     X86::ADC32mi },
+      { X86::ADC32ri8,    X86::ADC32mi8 },
+      { X86::ADC32rr,     X86::ADC32mr },
+      { X86::ADC64ri32,   X86::ADC64mi32 },
+      { X86::ADC64ri8,    X86::ADC64mi8 },
+      { X86::ADC64rr,     X86::ADC64mr },
+      { X86::ADD16ri,     X86::ADD16mi },
+      { X86::ADD16ri8,    X86::ADD16mi8 },
+      { X86::ADD16rr,     X86::ADD16mr },
+      { X86::ADD32ri,     X86::ADD32mi },
+      { X86::ADD32ri8,    X86::ADD32mi8 },
+      { X86::ADD32rr,     X86::ADD32mr },
+      { X86::ADD64ri32,   X86::ADD64mi32 },
+      { X86::ADD64ri8,    X86::ADD64mi8 },
+      { X86::ADD64rr,     X86::ADD64mr },
+      { X86::ADD8ri,      X86::ADD8mi },
+      { X86::ADD8rr,      X86::ADD8mr },
+      { X86::AND16ri,     X86::AND16mi },
+      { X86::AND16ri8,    X86::AND16mi8 },
+      { X86::AND16rr,     X86::AND16mr },
+      { X86::AND32ri,     X86::AND32mi },
+      { X86::AND32ri8,    X86::AND32mi8 },
+      { X86::AND32rr,     X86::AND32mr },
+      { X86::AND64ri32,   X86::AND64mi32 },
+      { X86::AND64ri8,    X86::AND64mi8 },
+      { X86::AND64rr,     X86::AND64mr },
+      { X86::AND8ri,      X86::AND8mi },
+      { X86::AND8rr,      X86::AND8mr },
+      { X86::DEC16r,      X86::DEC16m },
+      { X86::DEC32r,      X86::DEC32m },
+      { X86::DEC64_16r,   X86::DEC16m },
+      { X86::DEC64_32r,   X86::DEC32m },
+      { X86::DEC64r,      X86::DEC64m },
+      { X86::DEC8r,       X86::DEC8m },
+      { X86::INC16r,      X86::INC16m },
+      { X86::INC32r,      X86::INC32m },
+      { X86::INC64_16r,   X86::INC16m },
+      { X86::INC64_32r,   X86::INC32m },
+      { X86::INC64r,      X86::INC64m },
+      { X86::INC8r,       X86::INC8m },
+      { X86::NEG16r,      X86::NEG16m },
+      { X86::NEG32r,      X86::NEG32m },
+      { X86::NEG64r,      X86::NEG64m },
+      { X86::NEG8r,       X86::NEG8m },
+      { X86::NOT16r,      X86::NOT16m },
+      { X86::NOT32r,      X86::NOT32m },
+      { X86::NOT64r,      X86::NOT64m },
+      { X86::NOT8r,       X86::NOT8m },
+      { X86::OR16ri,      X86::OR16mi },
+      { X86::OR16ri8,     X86::OR16mi8 },
+      { X86::OR16rr,      X86::OR16mr },
+      { X86::OR32ri,      X86::OR32mi },
+      { X86::OR32ri8,     X86::OR32mi8 },
+      { X86::OR32rr,      X86::OR32mr },
+      { X86::OR64ri32,    X86::OR64mi32 },
+      { X86::OR64ri8,     X86::OR64mi8 },
+      { X86::OR64rr,      X86::OR64mr },
+      { X86::OR8ri,       X86::OR8mi },
+      { X86::OR8rr,       X86::OR8mr },
+      { X86::ROL16r1,     X86::ROL16m1 },
+      { X86::ROL16rCL,    X86::ROL16mCL },
+      { X86::ROL16ri,     X86::ROL16mi },
+      { X86::ROL32r1,     X86::ROL32m1 },
+      { X86::ROL32rCL,    X86::ROL32mCL },
+      { X86::ROL32ri,     X86::ROL32mi },
+      { X86::ROL64r1,     X86::ROL64m1 },
+      { X86::ROL64rCL,    X86::ROL64mCL },
+      { X86::ROL64ri,     X86::ROL64mi },
+      { X86::ROL8r1,      X86::ROL8m1 },
+      { X86::ROL8rCL,     X86::ROL8mCL },
+      { X86::ROL8ri,      X86::ROL8mi },
+      { X86::ROR16r1,     X86::ROR16m1 },
+      { X86::ROR16rCL,    X86::ROR16mCL },
+      { X86::ROR16ri,     X86::ROR16mi },
+      { X86::ROR32r1,     X86::ROR32m1 },
+      { X86::ROR32rCL,    X86::ROR32mCL },
+      { X86::ROR32ri,     X86::ROR32mi },
+      { X86::ROR64r1,     X86::ROR64m1 },
+      { X86::ROR64rCL,    X86::ROR64mCL },
+      { X86::ROR64ri,     X86::ROR64mi },
+      { X86::ROR8r1,      X86::ROR8m1 },
+      { X86::ROR8rCL,     X86::ROR8mCL },
+      { X86::ROR8ri,      X86::ROR8mi },
+      { X86::SAR16r1,     X86::SAR16m1 },
+      { X86::SAR16rCL,    X86::SAR16mCL },
+      { X86::SAR16ri,     X86::SAR16mi },
+      { X86::SAR32r1,     X86::SAR32m1 },
+      { X86::SAR32rCL,    X86::SAR32mCL },
+      { X86::SAR32ri,     X86::SAR32mi },
+      { X86::SAR64r1,     X86::SAR64m1 },
+      { X86::SAR64rCL,    X86::SAR64mCL },
+      { X86::SAR64ri,     X86::SAR64mi },
+      { X86::SAR8r1,      X86::SAR8m1 },
+      { X86::SAR8rCL,     X86::SAR8mCL },
+      { X86::SAR8ri,      X86::SAR8mi },
+      { X86::SBB32ri,     X86::SBB32mi },
+      { X86::SBB32ri8,    X86::SBB32mi8 },
+      { X86::SBB32rr,     X86::SBB32mr },
+      { X86::SBB64ri32,   X86::SBB64mi32 },
+      { X86::SBB64ri8,    X86::SBB64mi8 },
+      { X86::SBB64rr,     X86::SBB64mr },
+      { X86::SHL16r1,     X86::SHL16m1 },
+      { X86::SHL16rCL,    X86::SHL16mCL },
+      { X86::SHL16ri,     X86::SHL16mi },
+      { X86::SHL32r1,     X86::SHL32m1 },
+      { X86::SHL32rCL,    X86::SHL32mCL },
+      { X86::SHL32ri,     X86::SHL32mi },
+      { X86::SHL64r1,     X86::SHL64m1 },
+      { X86::SHL64rCL,    X86::SHL64mCL },
+      { X86::SHL64ri,     X86::SHL64mi },
+      { X86::SHL8r1,      X86::SHL8m1 },
+      { X86::SHL8rCL,     X86::SHL8mCL },
+      { X86::SHL8ri,      X86::SHL8mi },
+      { X86::SHLD16rrCL,  X86::SHLD16mrCL },
+      { X86::SHLD16rri8,  X86::SHLD16mri8 },
+      { X86::SHLD32rrCL,  X86::SHLD32mrCL },
+      { X86::SHLD32rri8,  X86::SHLD32mri8 },
+      { X86::SHLD64rrCL,  X86::SHLD64mrCL },
+      { X86::SHLD64rri8,  X86::SHLD64mri8 },
+      { X86::SHR16r1,     X86::SHR16m1 },
+      { X86::SHR16rCL,    X86::SHR16mCL },
+      { X86::SHR16ri,     X86::SHR16mi },
+      { X86::SHR32r1,     X86::SHR32m1 },
+      { X86::SHR32rCL,    X86::SHR32mCL },
+      { X86::SHR32ri,     X86::SHR32mi },
+      { X86::SHR64r1,     X86::SHR64m1 },
+      { X86::SHR64rCL,    X86::SHR64mCL },
+      { X86::SHR64ri,     X86::SHR64mi },
+      { X86::SHR8r1,      X86::SHR8m1 },
+      { X86::SHR8rCL,     X86::SHR8mCL },
+      { X86::SHR8ri,      X86::SHR8mi },
+      { X86::SHRD16rrCL,  X86::SHRD16mrCL },
+      { X86::SHRD16rri8,  X86::SHRD16mri8 },
+      { X86::SHRD32rrCL,  X86::SHRD32mrCL },
+      { X86::SHRD32rri8,  X86::SHRD32mri8 },
+      { X86::SHRD64rrCL,  X86::SHRD64mrCL },
+      { X86::SHRD64rri8,  X86::SHRD64mri8 },
+      { X86::SUB16ri,     X86::SUB16mi },
+      { X86::SUB16ri8,    X86::SUB16mi8 },
+      { X86::SUB16rr,     X86::SUB16mr },
+      { X86::SUB32ri,     X86::SUB32mi },
+      { X86::SUB32ri8,    X86::SUB32mi8 },
+      { X86::SUB32rr,     X86::SUB32mr },
+      { X86::SUB64ri32,   X86::SUB64mi32 },
+      { X86::SUB64ri8,    X86::SUB64mi8 },
+      { X86::SUB64rr,     X86::SUB64mr },
+      { X86::SUB8ri,      X86::SUB8mi },
+      { X86::SUB8rr,      X86::SUB8mr },
+      { X86::XOR16ri,     X86::XOR16mi },
+      { X86::XOR16ri8,    X86::XOR16mi8 },
+      { X86::XOR16rr,     X86::XOR16mr },
+      { X86::XOR32ri,     X86::XOR32mi },
+      { X86::XOR32ri8,    X86::XOR32mi8 },
+      { X86::XOR32rr,     X86::XOR32mr },
+      { X86::XOR64ri32,   X86::XOR64mi32 },
+      { X86::XOR64ri8,    X86::XOR64mi8 },
+      { X86::XOR64rr,     X86::XOR64mr },
+      { X86::XOR8ri,      X86::XOR8mi },
+      { X86::XOR8rr,      X86::XOR8mr }
+    };
+    ASSERT_SORTED(OpcodeTable);
+    OpcodeTablePtr = OpcodeTable;
+    OpcodeTableSize = ARRAY_SIZE(OpcodeTable);
+    isTwoAddrFold = true;
+  } else if (i == 0) { // If operand 0
+    if (MI->getOpcode() == X86::MOV16r0)
+      NewMI = MakeM0Inst(TII, X86::MOV16mi, FrameIndex, MI);
+    else if (MI->getOpcode() == X86::MOV32r0)
+      NewMI = MakeM0Inst(TII, X86::MOV32mi, FrameIndex, MI);
+    else if (MI->getOpcode() == X86::MOV64r0)
+      NewMI = MakeM0Inst(TII, X86::MOV64mi32, FrameIndex, MI);
+    else if (MI->getOpcode() == X86::MOV8r0)
+      NewMI = MakeM0Inst(TII, X86::MOV8mi, FrameIndex, MI);
+    if (NewMI) {
+      NewMI->copyKillDeadInfo(MI);
+      return NewMI;
+    }
+    
+    static const TableEntry OpcodeTable[] = {
+      { X86::CMP16ri,     X86::CMP16mi },
+      { X86::CMP16ri8,    X86::CMP16mi8 },
+      { X86::CMP32ri,     X86::CMP32mi },
+      { X86::CMP32ri8,    X86::CMP32mi8 },
+      { X86::CMP8ri,      X86::CMP8mi },
+      { X86::DIV16r,      X86::DIV16m },
+      { X86::DIV32r,      X86::DIV32m },
+      { X86::DIV64r,      X86::DIV64m },
+      { X86::DIV8r,       X86::DIV8m },
+      { X86::FsMOVAPDrr,  X86::MOVSDmr },
+      { X86::FsMOVAPSrr,  X86::MOVSSmr },
+      { X86::IDIV16r,     X86::IDIV16m },
+      { X86::IDIV32r,     X86::IDIV32m },
+      { X86::IDIV64r,     X86::IDIV64m },
+      { X86::IDIV8r,      X86::IDIV8m },
+      { X86::IMUL16r,     X86::IMUL16m },
+      { X86::IMUL32r,     X86::IMUL32m },
+      { X86::IMUL64r,     X86::IMUL64m },
+      { X86::IMUL8r,      X86::IMUL8m },
+      { X86::MOV16ri,     X86::MOV16mi },
+      { X86::MOV16rr,     X86::MOV16mr },
+      { X86::MOV32ri,     X86::MOV32mi },
+      { X86::MOV32rr,     X86::MOV32mr },
+      { X86::MOV64ri32,   X86::MOV64mi32 },
+      { X86::MOV64rr,     X86::MOV64mr },
+      { X86::MOV8ri,      X86::MOV8mi },
+      { X86::MOV8rr,      X86::MOV8mr },
+      { X86::MOVAPDrr,    X86::MOVAPDmr },
+      { X86::MOVAPSrr,    X86::MOVAPSmr },
+      { X86::MOVPDI2DIrr, X86::MOVPDI2DImr },
+      { X86::MOVPQIto64rr,X86::MOVPQIto64mr },
+      { X86::MOVPS2SSrr,  X86::MOVPS2SSmr },
+      { X86::MOVSDrr,     X86::MOVSDmr },
+      { X86::MOVSDto64rr, X86::MOVSDto64mr },
+      { X86::MOVSS2DIrr,  X86::MOVSS2DImr },
+      { X86::MOVSSrr,     X86::MOVSSmr },
+      { X86::MOVUPDrr,    X86::MOVUPDmr },
+      { X86::MOVUPSrr,    X86::MOVUPSmr },
+      { X86::MUL16r,      X86::MUL16m },
+      { X86::MUL32r,      X86::MUL32m },
+      { X86::MUL64r,      X86::MUL64m },
+      { X86::MUL8r,       X86::MUL8m },
+      { X86::SETAEr,      X86::SETAEm },
+      { X86::SETAr,       X86::SETAm },
+      { X86::SETBEr,      X86::SETBEm },
+      { X86::SETBr,       X86::SETBm },
+      { X86::SETEr,       X86::SETEm },
+      { X86::SETGEr,      X86::SETGEm },
+      { X86::SETGr,       X86::SETGm },
+      { X86::SETLEr,      X86::SETLEm },
+      { X86::SETLr,       X86::SETLm },
+      { X86::SETNEr,      X86::SETNEm },
+      { X86::SETNPr,      X86::SETNPm },
+      { X86::SETNSr,      X86::SETNSm },
+      { X86::SETPr,       X86::SETPm },
+      { X86::SETSr,       X86::SETSm },
+      { X86::TEST16ri,    X86::TEST16mi },
+      { X86::TEST32ri,    X86::TEST32mi },
+      { X86::TEST64ri32,  X86::TEST64mi32 },
+      { X86::TEST8ri,     X86::TEST8mi },
+      { X86::XCHG16rr,    X86::XCHG16mr },
+      { X86::XCHG32rr,    X86::XCHG32mr },
+      { X86::XCHG64rr,    X86::XCHG64mr },
+      { X86::XCHG8rr,     X86::XCHG8mr }
+    };
+    ASSERT_SORTED(OpcodeTable);
+    OpcodeTablePtr = OpcodeTable;
+    OpcodeTableSize = ARRAY_SIZE(OpcodeTable);
+  } else if (i == 1) {
+    static const TableEntry OpcodeTable[] = {
+      { X86::CMP16rr,         X86::CMP16rm },
+      { X86::CMP32rr,         X86::CMP32rm },
+      { X86::CMP64ri32,       X86::CMP64mi32 },
+      { X86::CMP64ri8,        X86::CMP64mi8 },
+      { X86::CMP64rr,         X86::CMP64rm },
+      { X86::CMP8rr,          X86::CMP8rm },
+      { X86::CMPPDrri,        X86::CMPPDrmi },
+      { X86::CMPPSrri,        X86::CMPPSrmi },
+      { X86::CMPSDrr,         X86::CMPSDrm },
+      { X86::CMPSSrr,         X86::CMPSSrm },
+      { X86::CVTSD2SSrr,      X86::CVTSD2SSrm },
+      { X86::CVTSI2SD64rr,    X86::CVTSI2SD64rm },
+      { X86::CVTSI2SDrr,      X86::CVTSI2SDrm },
+      { X86::CVTSI2SS64rr,    X86::CVTSI2SS64rm },
+      { X86::CVTSI2SSrr,      X86::CVTSI2SSrm },
+      { X86::CVTSS2SDrr,      X86::CVTSS2SDrm },
+      { X86::CVTTSD2SI64rr,   X86::CVTTSD2SI64rm },
+      { X86::CVTTSD2SIrr,     X86::CVTTSD2SIrm },
+      { X86::CVTTSS2SI64rr,   X86::CVTTSS2SI64rm },
+      { X86::CVTTSS2SIrr,     X86::CVTTSS2SIrm },
+      { X86::FsMOVAPDrr,      X86::MOVSDrm },
+      { X86::FsMOVAPSrr,      X86::MOVSSrm },
+      { X86::IMUL16rri,       X86::IMUL16rmi },
+      { X86::IMUL16rri8,      X86::IMUL16rmi8 },
+      { X86::IMUL32rri,       X86::IMUL32rmi },
+      { X86::IMUL32rri8,      X86::IMUL32rmi8 },
+      { X86::IMUL64rr,        X86::IMUL64rm },
+      { X86::IMUL64rri32,     X86::IMUL64rmi32 },
+      { X86::IMUL64rri8,      X86::IMUL64rmi8 },
+      { X86::Int_CMPSDrr,     X86::Int_CMPSDrm },
+      { X86::Int_CMPSSrr,     X86::Int_CMPSSrm },
+      { X86::Int_COMISDrr,    X86::Int_COMISDrm },
+      { X86::Int_COMISSrr,    X86::Int_COMISSrm },
+      { X86::Int_CVTDQ2PDrr,  X86::Int_CVTDQ2PDrm },
+      { X86::Int_CVTDQ2PSrr,  X86::Int_CVTDQ2PSrm },
+      { X86::Int_CVTPD2DQrr,  X86::Int_CVTPD2DQrm },
+      { X86::Int_CVTPD2PSrr,  X86::Int_CVTPD2PSrm },
+      { X86::Int_CVTPS2DQrr,  X86::Int_CVTPS2DQrm },
+      { X86::Int_CVTPS2PDrr,  X86::Int_CVTPS2PDrm },
+      { X86::Int_CVTSD2SI64rr,X86::Int_CVTSD2SI64rm },
+      { X86::Int_CVTSD2SIrr,  X86::Int_CVTSD2SIrm },
+      { X86::Int_CVTSD2SSrr,  X86::Int_CVTSD2SSrm },
+      { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm },
+      { X86::Int_CVTSI2SDrr,  X86::Int_CVTSI2SDrm },
+      { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm },
+      { X86::Int_CVTSI2SSrr,  X86::Int_CVTSI2SSrm },
+      { X86::Int_CVTSS2SDrr,  X86::Int_CVTSS2SDrm },
+      { X86::Int_CVTSS2SI64rr,X86::Int_CVTSS2SI64rm },
+      { X86::Int_CVTSS2SIrr,  X86::Int_CVTSS2SIrm },
+      { X86::Int_CVTTPD2DQrr, X86::Int_CVTTPD2DQrm },
+      { X86::Int_CVTTPS2DQrr, X86::Int_CVTTPS2DQrm },
+      { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm },
+      { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm },
+      { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm },
+      { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm },
+      { X86::Int_UCOMISDrr,   X86::Int_UCOMISDrm },
+      { X86::Int_UCOMISSrr,   X86::Int_UCOMISSrm },
+      { X86::MOV16rr,         X86::MOV16rm },
+      { X86::MOV32rr,         X86::MOV32rm },
+      { X86::MOV64rr,         X86::MOV64rm },
+      { X86::MOV64toPQIrr,    X86::MOV64toPQIrm },
+      { X86::MOV64toSDrr,     X86::MOV64toSDrm },
+      { X86::MOV8rr,          X86::MOV8rm },
+      { X86::MOVAPDrr,        X86::MOVAPDrm },
+      { X86::MOVAPSrr,        X86::MOVAPSrm },
+      { X86::MOVDDUPrr,       X86::MOVDDUPrm },
+      { X86::MOVDI2PDIrr,     X86::MOVDI2PDIrm },
+      { X86::MOVDI2SSrr,      X86::MOVDI2SSrm },
+      { X86::MOVSD2PDrr,      X86::MOVSD2PDrm },
+      { X86::MOVSDrr,         X86::MOVSDrm },
+      { X86::MOVSHDUPrr,      X86::MOVSHDUPrm },
+      { X86::MOVSLDUPrr,      X86::MOVSLDUPrm },
+      { X86::MOVSS2PSrr,      X86::MOVSS2PSrm },
+      { X86::MOVSSrr,         X86::MOVSSrm },
+      { X86::MOVSX16rr8,      X86::MOVSX16rm8 },
+      { X86::MOVSX32rr16,     X86::MOVSX32rm16 },
+      { X86::MOVSX32rr8,      X86::MOVSX32rm8 },
+      { X86::MOVSX64rr16,     X86::MOVSX64rm16 },
+      { X86::MOVSX64rr32,     X86::MOVSX64rm32 },
+      { X86::MOVSX64rr8,      X86::MOVSX64rm8 },
+      { X86::MOVUPDrr,        X86::MOVUPDrm },
+      { X86::MOVUPSrr,        X86::MOVUPSrm },
+      { X86::MOVZX16rr8,      X86::MOVZX16rm8 },
+      { X86::MOVZX32rr16,     X86::MOVZX32rm16 },
+      { X86::MOVZX32rr8,      X86::MOVZX32rm8 },
+      { X86::MOVZX64rr16,     X86::MOVZX64rm16 },
+      { X86::MOVZX64rr8,      X86::MOVZX64rm8 },
+      { X86::PSHUFDri,        X86::PSHUFDmi },
+      { X86::PSHUFHWri,       X86::PSHUFHWmi },
+      { X86::PSHUFLWri,       X86::PSHUFLWmi },
+      { X86::PsMOVZX64rr32,   X86::PsMOVZX64rm32 },
+      { X86::TEST16rr,        X86::TEST16rm },
+      { X86::TEST32rr,        X86::TEST32rm },
+      { X86::TEST64rr,        X86::TEST64rm },
+      { X86::TEST8rr,         X86::TEST8rm },
+      // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
+      { X86::UCOMISDrr,       X86::UCOMISDrm },
+      { X86::UCOMISSrr,       X86::UCOMISSrm },
+      { X86::XCHG16rr,        X86::XCHG16rm },
+      { X86::XCHG32rr,        X86::XCHG32rm },
+      { X86::XCHG64rr,        X86::XCHG64rm },
+      { X86::XCHG8rr,         X86::XCHG8rm }
+    };
+    ASSERT_SORTED(OpcodeTable);
+    OpcodeTablePtr = OpcodeTable;
+    OpcodeTableSize = ARRAY_SIZE(OpcodeTable);
+  } else if (i == 2) {
+    static const TableEntry OpcodeTable[] = {
+      { X86::ADC32rr,         X86::ADC32rm },
+      { X86::ADC64rr,         X86::ADC64rm },
+      { X86::ADD16rr,         X86::ADD16rm },
+      { X86::ADD32rr,         X86::ADD32rm },
+      { X86::ADD64rr,         X86::ADD64rm },
+      { X86::ADD8rr,          X86::ADD8rm },
+      { X86::ADDPDrr,         X86::ADDPDrm },
+      { X86::ADDPSrr,         X86::ADDPSrm },
+      { X86::ADDSDrr,         X86::ADDSDrm },
+      { X86::ADDSSrr,         X86::ADDSSrm },
+      { X86::ADDSUBPDrr,      X86::ADDSUBPDrm },
+      { X86::ADDSUBPSrr,      X86::ADDSUBPSrm },
+      { X86::AND16rr,         X86::AND16rm },
+      { X86::AND32rr,         X86::AND32rm },
+      { X86::AND64rr,         X86::AND64rm },
+      { X86::AND8rr,          X86::AND8rm },
+      { X86::ANDNPDrr,        X86::ANDNPDrm },
+      { X86::ANDNPSrr,        X86::ANDNPSrm },
+      { X86::ANDPDrr,         X86::ANDPDrm },
+      { X86::ANDPSrr,         X86::ANDPSrm },
+      { X86::CMOVA16rr,       X86::CMOVA16rm },
+      { X86::CMOVA32rr,       X86::CMOVA32rm },
+      { X86::CMOVA64rr,       X86::CMOVA64rm },
+      { X86::CMOVAE16rr,      X86::CMOVAE16rm },
+      { X86::CMOVAE32rr,      X86::CMOVAE32rm },
+      { X86::CMOVAE64rr,      X86::CMOVAE64rm },
+      { X86::CMOVB16rr,       X86::CMOVB16rm },
+      { X86::CMOVB32rr,       X86::CMOVB32rm },
+      { X86::CMOVB64rr,       X86::CMOVB64rm },
+      { X86::CMOVBE16rr,      X86::CMOVBE16rm },
+      { X86::CMOVBE32rr,      X86::CMOVBE32rm },
+      { X86::CMOVBE64rr,      X86::CMOVBE64rm },
+      { X86::CMOVE16rr,       X86::CMOVE16rm },
+      { X86::CMOVE32rr,       X86::CMOVE32rm },
+      { X86::CMOVE64rr,       X86::CMOVE64rm },
+      { X86::CMOVG16rr,       X86::CMOVG16rm },
+      { X86::CMOVG32rr,       X86::CMOVG32rm },
+      { X86::CMOVG64rr,       X86::CMOVG64rm },
+      { X86::CMOVGE16rr,      X86::CMOVGE16rm },
+      { X86::CMOVGE32rr,      X86::CMOVGE32rm },
+      { X86::CMOVGE64rr,      X86::CMOVGE64rm },
+      { X86::CMOVL16rr,       X86::CMOVL16rm },
+      { X86::CMOVL32rr,       X86::CMOVL32rm },
+      { X86::CMOVL64rr,       X86::CMOVL64rm },
+      { X86::CMOVLE16rr,      X86::CMOVLE16rm },
+      { X86::CMOVLE32rr,      X86::CMOVLE32rm },
+      { X86::CMOVLE64rr,      X86::CMOVLE64rm },
+      { X86::CMOVNE16rr,      X86::CMOVNE16rm },
+      { X86::CMOVNE32rr,      X86::CMOVNE32rm },
+      { X86::CMOVNE64rr,      X86::CMOVNE64rm },
+      { X86::CMOVNP16rr,      X86::CMOVNP16rm },
+      { X86::CMOVNP32rr,      X86::CMOVNP32rm },
+      { X86::CMOVNP64rr,      X86::CMOVNP64rm },
+      { X86::CMOVNS16rr,      X86::CMOVNS16rm },
+      { X86::CMOVNS32rr,      X86::CMOVNS32rm },
+      { X86::CMOVNS64rr,      X86::CMOVNS64rm },
+      { X86::CMOVP16rr,       X86::CMOVP16rm },
+      { X86::CMOVP32rr,       X86::CMOVP32rm },
+      { X86::CMOVP64rr,       X86::CMOVP64rm },
+      { X86::CMOVS16rr,       X86::CMOVS16rm },
+      { X86::CMOVS32rr,       X86::CMOVS32rm },
+      { X86::CMOVS64rr,       X86::CMOVS64rm },
+      { X86::DIVPDrr,         X86::DIVPDrm },
+      { X86::DIVPSrr,         X86::DIVPSrm },
+      { X86::DIVSDrr,         X86::DIVSDrm },
+      { X86::DIVSSrr,         X86::DIVSSrm },
+      { X86::HADDPDrr,        X86::HADDPDrm },
+      { X86::HADDPSrr,        X86::HADDPSrm },
+      { X86::HSUBPDrr,        X86::HSUBPDrm },
+      { X86::HSUBPSrr,        X86::HSUBPSrm },
+      { X86::IMUL16rr,        X86::IMUL16rm },
+      { X86::IMUL32rr,        X86::IMUL32rm },
+      { X86::MAXPDrr,         X86::MAXPDrm },
+      { X86::MAXPDrr_Int,     X86::MAXPDrm_Int },
+      { X86::MAXPSrr,         X86::MAXPSrm },
+      { X86::MAXPSrr_Int,     X86::MAXPSrm_Int },
+      { X86::MAXSDrr,         X86::MAXSDrm },
+      { X86::MAXSDrr_Int,     X86::MAXSDrm_Int },
+      { X86::MAXSSrr,         X86::MAXSSrm },
+      { X86::MAXSSrr_Int,     X86::MAXSSrm_Int },
+      { X86::MINPDrr,         X86::MINPDrm },
+      { X86::MINPDrr_Int,     X86::MINPDrm_Int },
+      { X86::MINPSrr,         X86::MINPSrm },
+      { X86::MINPSrr_Int,     X86::MINPSrm_Int },
+      { X86::MINSDrr,         X86::MINSDrm },
+      { X86::MINSDrr_Int,     X86::MINSDrm_Int },
+      { X86::MINSSrr,         X86::MINSSrm },
+      { X86::MINSSrr_Int,     X86::MINSSrm_Int },
+      { X86::MULPDrr,         X86::MULPDrm },
+      { X86::MULPSrr,         X86::MULPSrm },
+      { X86::MULSDrr,         X86::MULSDrm },
+      { X86::MULSSrr,         X86::MULSSrm },
+      { X86::OR16rr,          X86::OR16rm },
+      { X86::OR32rr,          X86::OR32rm },
+      { X86::OR64rr,          X86::OR64rm },
+      { X86::OR8rr,           X86::OR8rm },
+      { X86::ORPDrr,          X86::ORPDrm },
+      { X86::ORPSrr,          X86::ORPSrm },
+      { X86::PACKSSDWrr,      X86::PACKSSDWrm },
+      { X86::PACKSSWBrr,      X86::PACKSSWBrm },
+      { X86::PACKUSWBrr,      X86::PACKUSWBrm },
+      { X86::PADDBrr,         X86::PADDBrm },
+      { X86::PADDDrr,         X86::PADDDrm },
+      { X86::PADDQrr,         X86::PADDQrm },
+      { X86::PADDSBrr,        X86::PADDSBrm },
+      { X86::PADDSWrr,        X86::PADDSWrm },
+      { X86::PADDWrr,         X86::PADDWrm },
+      { X86::PANDNrr,         X86::PANDNrm },
+      { X86::PANDrr,          X86::PANDrm },
+      { X86::PAVGBrr,         X86::PAVGBrm },
+      { X86::PAVGWrr,         X86::PAVGWrm },
+      { X86::PCMPEQBrr,       X86::PCMPEQBrm },
+      { X86::PCMPEQDrr,       X86::PCMPEQDrm },
+      { X86::PCMPEQWrr,       X86::PCMPEQWrm },
+      { X86::PCMPGTBrr,       X86::PCMPGTBrm },
+      { X86::PCMPGTDrr,       X86::PCMPGTDrm },
+      { X86::PCMPGTWrr,       X86::PCMPGTWrm },
+      { X86::PINSRWrri,       X86::PINSRWrmi },
+      { X86::PMADDWDrr,       X86::PMADDWDrm },
+      { X86::PMAXSWrr,        X86::PMAXSWrm },
+      { X86::PMAXUBrr,        X86::PMAXUBrm },
+      { X86::PMINSWrr,        X86::PMINSWrm },
+      { X86::PMINUBrr,        X86::PMINUBrm },
+      { X86::PMULHUWrr,       X86::PMULHUWrm },
+      { X86::PMULHWrr,        X86::PMULHWrm },
+      { X86::PMULLWrr,        X86::PMULLWrm },
+      { X86::PMULUDQrr,       X86::PMULUDQrm },
+      { X86::PORrr,           X86::PORrm },
+      { X86::PSADBWrr,        X86::PSADBWrm },
+      { X86::PSLLDrr,         X86::PSLLDrm },
+      { X86::PSLLQrr,         X86::PSLLQrm },
+      { X86::PSLLWrr,         X86::PSLLWrm },
+      { X86::PSRADrr,         X86::PSRADrm },
+      { X86::PSRAWrr,         X86::PSRAWrm },
+      { X86::PSRLDrr,         X86::PSRLDrm },
+      { X86::PSRLQrr,         X86::PSRLQrm },
+      { X86::PSRLWrr,         X86::PSRLWrm },
+      { X86::PSUBBrr,         X86::PSUBBrm },
+      { X86::PSUBDrr,         X86::PSUBDrm },
+      { X86::PSUBSBrr,        X86::PSUBSBrm },
+      { X86::PSUBSWrr,        X86::PSUBSWrm },
+      { X86::PSUBWrr,         X86::PSUBWrm },
+      { X86::PUNPCKHBWrr,     X86::PUNPCKHBWrm },
+      { X86::PUNPCKHDQrr,     X86::PUNPCKHDQrm },
+      { X86::PUNPCKHQDQrr,    X86::PUNPCKHQDQrm },
+      { X86::PUNPCKHWDrr,     X86::PUNPCKHWDrm },
+      { X86::PUNPCKLBWrr,     X86::PUNPCKLBWrm },
+      { X86::PUNPCKLDQrr,     X86::PUNPCKLDQrm },
+      { X86::PUNPCKLQDQrr,    X86::PUNPCKLQDQrm },
+      { X86::PUNPCKLWDrr,     X86::PUNPCKLWDrm },
+      { X86::PXORrr,          X86::PXORrm },
+      { X86::RCPPSr,          X86::RCPPSm },
+      { X86::RCPPSr_Int,      X86::RCPPSm_Int },
+      { X86::RSQRTPSr,        X86::RSQRTPSm },
+      { X86::RSQRTPSr_Int,    X86::RSQRTPSm_Int },
+      { X86::RSQRTSSr,        X86::RSQRTSSm },
+      { X86::RSQRTSSr_Int,    X86::RSQRTSSm_Int },
+      { X86::SBB32rr,         X86::SBB32rm },
+      { X86::SBB64rr,         X86::SBB64rm },
+      { X86::SHUFPDrri,       X86::SHUFPDrmi },
+      { X86::SHUFPSrri,       X86::SHUFPSrmi },
+      { X86::SQRTPDr,         X86::SQRTPDm },
+      { X86::SQRTPDr_Int,     X86::SQRTPDm_Int },
+      { X86::SQRTPSr,         X86::SQRTPSm },
+      { X86::SQRTPSr_Int,     X86::SQRTPSm_Int },
+      { X86::SQRTSDr,         X86::SQRTSDm },
+      { X86::SQRTSDr_Int,     X86::SQRTSDm_Int },
+      { X86::SQRTSSr,         X86::SQRTSSm },
+      { X86::SQRTSSr_Int,     X86::SQRTSSm_Int },
+      { X86::SUB16rr,         X86::SUB16rm },
+      { X86::SUB32rr,         X86::SUB32rm },
+      { X86::SUB64rr,         X86::SUB64rm },
+      { X86::SUB8rr,          X86::SUB8rm },
+      { X86::SUBPDrr,         X86::SUBPDrm },
+      { X86::SUBPSrr,         X86::SUBPSrm },
+      { X86::SUBSDrr,         X86::SUBSDrm },
+      { X86::SUBSSrr,         X86::SUBSSrm },
+      // FIXME: TEST*rr -> swapped operand of TEST*mr.
+      { X86::UNPCKHPDrr,      X86::UNPCKHPDrm },
+      { X86::UNPCKHPSrr,      X86::UNPCKHPSrm },
+      { X86::UNPCKLPDrr,      X86::UNPCKLPDrm },
+      { X86::UNPCKLPSrr,      X86::UNPCKLPSrm },
+      { X86::XOR16rr,         X86::XOR16rm },
+      { X86::XOR32rr,         X86::XOR32rm },
+      { X86::XOR64rr,         X86::XOR64rm },
+      { X86::XOR8rr,          X86::XOR8rm },
+      { X86::XORPDrr,         X86::XORPDrm },
+      { X86::XORPSrr,         X86::XORPSrm }
+    };
+    ASSERT_SORTED(OpcodeTable);
+    OpcodeTablePtr = OpcodeTable;
+    OpcodeTableSize = ARRAY_SIZE(OpcodeTable);
+  }
+  
+  // If table selected...
+  if (OpcodeTablePtr) {
+    // Find the Opcode to fuse
+    unsigned fromOpcode = MI->getOpcode();
+    // Lookup fromOpcode in table
+    if (const TableEntry *Entry = TableLookup(OpcodeTablePtr, OpcodeTableSize,
+                                              fromOpcode)) {
+      if (isTwoAddrFold)
+        NewMI = FuseTwoAddrInst(Entry->to, FrameIndex, MI, TII);
+      else
+        NewMI = FuseInst(Entry->to, i, FrameIndex, MI, TII);
+      NewMI->copyKillDeadInfo(MI);
+      return NewMI;
+    }
+  }
+  
+  // No fusion 
+  if (PrintFailedFusing)
+    cerr << "We failed to fuse ("
+         << ((i == 1) ? "r" : "s") << "): " << *MI;
+  return NULL;
+}
+
+
+const unsigned *X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+                                                                         const {
+  static const unsigned CalleeSavedRegs32Bit[] = {
+    X86::ESI, X86::EDI, X86::EBX, X86::EBP,  0
+  };
+
+  static const unsigned CalleeSavedRegs32EHRet[] = {
+    X86::EAX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP,  0
+  };
+
+  static const unsigned CalleeSavedRegs64Bit[] = {
+    X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+  };
+
+  if (Is64Bit)
+    return CalleeSavedRegs64Bit;
+  else {
+    if (MF) {
+        MachineFrameInfo *MFI = MF->getFrameInfo();
+        MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+        if (MMI && MMI->callsEHReturn())
+          return CalleeSavedRegs32EHRet;
+    }
+    return CalleeSavedRegs32Bit;
+  }
+}
+
+const TargetRegisterClass* const*
+X86RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  static const TargetRegisterClass * const CalleeSavedRegClasses32Bit[] = {
+    &X86::GR32RegClass, &X86::GR32RegClass,
+    &X86::GR32RegClass, &X86::GR32RegClass,  0
+  };
+  static const TargetRegisterClass * const CalleeSavedRegClasses32EHRet[] = {
+    &X86::GR32RegClass, &X86::GR32RegClass,
+    &X86::GR32RegClass, &X86::GR32RegClass,
+    &X86::GR32RegClass, &X86::GR32RegClass,  0
+  };
+  static const TargetRegisterClass * const CalleeSavedRegClasses64Bit[] = {
+    &X86::GR64RegClass, &X86::GR64RegClass,
+    &X86::GR64RegClass, &X86::GR64RegClass,
+    &X86::GR64RegClass, &X86::GR64RegClass, 0
+  };
+
+  if (Is64Bit)
+    return CalleeSavedRegClasses64Bit;
+  else {
+    if (MF) {
+        MachineFrameInfo *MFI = MF->getFrameInfo();
+        MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+        if (MMI && MMI->callsEHReturn())
+          return CalleeSavedRegClasses32EHRet;
+    }
+    return CalleeSavedRegClasses32Bit;
+  }
+
+}
+
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
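+  // The stack pointer is always reserved in every register width; the frame
+  // pointer registers are reserved too when this function needs a frame pointer.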
+  Reserved.set(X86::RSP);
+  Reserved.set(X86::ESP);
+  Reserved.set(X86::SP);
+  Reserved.set(X86::SPL);
+  if (hasFP(MF)) {
+    Reserved.set(X86::RBP);
+    Reserved.set(X86::EBP);
+    Reserved.set(X86::BP);
+    Reserved.set(X86::BPL);
+  }
+  return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+//
+bool X86RegisterInfo::hasFP(const MachineFunction &MF) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+
+  return (NoFramePointerElim || 
+          MF.getFrameInfo()->hasVarSizedObjects() ||
+          MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
+          (MMI && MMI->callsUnwindInit()));
+}
+
+void X86RegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (hasFP(MF)) {
+    // If we have a frame pointer, turn the adjcallstackdown instruction into a
+    // 'sub ESP, <amt>' and the adjcallstackup instruction into 'add ESP,
+    // <amt>'
+    MachineInstr *Old = I;
+    uint64_t Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
+
+      MachineInstr *New = 0;
+      if (Old->getOpcode() == X86::ADJCALLSTACKDOWN) {
+        New=BuildMI(TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri), StackPtr)
+          .addReg(StackPtr).addImm(Amount);
+      } else {
+        assert(Old->getOpcode() == X86::ADJCALLSTACKUP);
+        // factor out the amount the callee already popped.
+        uint64_t CalleeAmt = Old->getOperand(1).getImm();
+        Amount -= CalleeAmt;
+        if (Amount) {
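+          // Use the sign-extended 8-bit immediate form when the amount fits.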
+          unsigned Opc = (Amount < 128) ?
+            (Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) :
+            (Is64Bit ? X86::ADD64ri32 : X86::ADD32ri);
+          New = BuildMI(TII.get(Opc),  StackPtr)
+                        .addReg(StackPtr).addImm(Amount);
+        }
+      }
+
+      // Replace the pseudo instruction with a new instruction...
+      if (New) MBB.insert(I, New);
+    }
+  } else if (I->getOpcode() == X86::ADJCALLSTACKUP) {
+    // If we are performing frame pointer elimination and if the callee pops
+    // something off the stack pointer, add it back.  We do this until we have
+    // more advanced stack pointer tracking ability.
+    if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+      unsigned Opc = (CalleeAmt < 128) ?
+        (Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) :
+        (Is64Bit ? X86::SUB64ri32 : X86::SUB32ri);
+      MachineInstr *New =
+        BuildMI(TII.get(Opc), StackPtr).addReg(StackPtr).addImm(CalleeAmt);
+      MBB.insert(I, New);
+    }
+  }
+
+  MBB.erase(I);
+}
+
+void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                          int SPAdj, RegScavenger *RS) const{
+  assert(SPAdj == 0 && "Unexpected");
+
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  while (!MI.getOperand(i).isFrameIndex()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  int FrameIndex = MI.getOperand(i).getFrameIndex();
+  // This must be part of a four operand memory reference.  Replace the
+  // FrameIndex with the base register (EBP or ESP) and add the frame object's
+  // offset to the displacement.
+  MI.getOperand(i).ChangeToRegister(hasFP(MF) ? FramePtr : StackPtr, false);
+
+  // Now add the frame object offset to the offset from EBP.
+  int64_t Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+                   MI.getOperand(i+3).getImm()+SlotSize;
+
+  if (!hasFP(MF))
+    Offset += MF.getFrameInfo()->getStackSize();
+  else
+    Offset += SlotSize;  // Skip the saved EBP
+
+  MI.getOperand(i+3).ChangeToImmediate(Offset);
+}
+
+void
+X86RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) const{
+  if (hasFP(MF)) {
+    // Create a frame entry for the EBP register that must be saved.
+    int FrameIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize,
+                                                        (int)SlotSize * -2);
+    assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
+           "Slot for EBP register must be last in order to be found!");
+  }
+}
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
+static
+void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                  unsigned StackPtr, int64_t NumBytes, bool Is64Bit,
+                  const TargetInstrInfo &TII) {
+  bool isSub = NumBytes < 0;
+  uint64_t Offset = isSub ? -NumBytes : NumBytes;
+  unsigned Opc = isSub
+    ? ((Offset < 128) ?
+       (Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) :
+       (Is64Bit ? X86::SUB64ri32 : X86::SUB32ri))
+    : ((Offset < 128) ?
+       (Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) :
+       (Is64Bit ? X86::ADD64ri32 : X86::ADD32ri));
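+  // Split very large adjustments into chunks that fit in a signed 32-bit
+  // immediate.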
+  uint64_t Chunk = (1LL << 31) - 1;
+
+  while (Offset) {
+    uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
+    BuildMI(MBB, MBBI, TII.get(Opc), StackPtr).addReg(StackPtr).addImm(ThisVal);
+    Offset -= ThisVal;
+  }
+}
+
+void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+  const Function* Fn = MF.getFunction();
+  const X86Subtarget* Subtarget = &MF.getTarget().getSubtarget<X86Subtarget>();
+  MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  
+  // Prepare for frame info.
+  unsigned FrameLabelId = 0, StartLabelId = 0;
+  
+  // Get the number of bytes to allocate from the FrameInfo
+  uint64_t StackSize = MFI->getStackSize();
+  uint64_t NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
+
+  if (MMI && MMI->needsFrameInfo()) {
+    // Mark function start
+    StartLabelId = MMI->NextLabelID();
+    BuildMI(MBB, MBBI, TII.get(X86::LABEL)).addImm(StartLabelId);
+  }
+
+  if (hasFP(MF)) {
+    // Get the offset of the stack slot for the EBP register... which is
+    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+    // Update the frame offset adjustment.
+    MFI->setOffsetAdjustment(SlotSize-NumBytes);
+
+    // Save EBP into the appropriate stack slot...
+    BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+      .addReg(FramePtr);
+    NumBytes -= SlotSize;
+
+    if (MMI && MMI->needsFrameInfo()) {
+      // Mark effective beginning of when frame pointer becomes valid.
+      FrameLabelId = MMI->NextLabelID();
+      BuildMI(MBB, MBBI, TII.get(X86::LABEL)).addImm(FrameLabelId);
+    }
+
+    // Update EBP with the new base value...
+    BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
+      .addReg(StackPtr);
+  }
+  
+  unsigned ReadyLabelId = 0;
+  if (MMI && MMI->needsFrameInfo()) {
+    // Mark effective beginning of when frame pointer is ready.
+    ReadyLabelId = MMI->NextLabelID();
+    BuildMI(MBB, MBBI, TII.get(X86::LABEL)).addImm(ReadyLabelId);
+  }
+
+  // Skip the callee-saved push instructions.
+  while (MBBI != MBB.end() &&
+         (MBBI->getOpcode() == X86::PUSH32r ||
+          MBBI->getOpcode() == X86::PUSH64r))
+    ++MBBI;
+
+  if (NumBytes) {   // adjust stack pointer: ESP -= numbytes
+    if (NumBytes >= 4096 && Subtarget->isTargetCygMing()) {
+      // Check whether EAX is live-in for this function.
+      bool isEAXAlive = false;
+      for (MachineFunction::livein_iterator II = MF.livein_begin(),
+             EE = MF.livein_end(); (II != EE) && !isEAXAlive; ++II) {
+        unsigned Reg = II->first;
+        isEAXAlive = (Reg == X86::EAX || Reg == X86::AX ||
+                      Reg == X86::AH || Reg == X86::AL);
+      }
+
+      // Function prologue calls _alloca to probe the stack when allocating
+      // more than 4K bytes in one go. Touching the stack at 4K increments is
+      // necessary to ensure that the guard pages used by the OS virtual memory
+      // manager are allocated in the correct sequence.
+      if (!isEAXAlive) {
+        BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX).addImm(NumBytes);
+        BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32))
+          .addExternalSymbol("_alloca");
+      } else {
+        // Save EAX
+        BuildMI(MBB, MBBI, TII.get(X86::PUSH32r), X86::EAX);
+        // Allocate NumBytes-4 bytes on stack. We'll also use 4 already
+        // allocated bytes for EAX.
+        BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX).addImm(NumBytes-4);
+        BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32))
+          .addExternalSymbol("_alloca");
+        // Restore EAX
+        MachineInstr *MI = addRegOffset(BuildMI(TII.get(X86::MOV32rm),X86::EAX),
+                                        StackPtr, NumBytes-4);
+        MBB.insert(MBBI, MI);
+      }
+    } else {
+      // If there is an ADD32ri or SUB32ri of ESP immediately after this
+      // instruction, merge the two instructions.
+      if (MBBI != MBB.end()) {
+        MachineBasicBlock::iterator NI = next(MBBI);
+        unsigned Opc = MBBI->getOpcode();
+        if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+             Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+            MBBI->getOperand(0).getReg() == StackPtr) {
+          NumBytes -= MBBI->getOperand(2).getImm();
+          MBB.erase(MBBI);
+          MBBI = NI;
+        } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+                    Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+                   MBBI->getOperand(0).getReg() == StackPtr) {
+          NumBytes += MBBI->getOperand(2).getImm();
+          MBB.erase(MBBI);
+          MBBI = NI;
+        }
+      }
+
+      if (NumBytes)
+        emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII);
+    }
+  }
+
+  if (MMI && MMI->needsFrameInfo()) {
+    std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+    const TargetAsmInfo *TAI = MF.getTarget().getTargetAsmInfo();
+
+    // Calculate the number of bytes used to store the return address.
+    int stackGrowth =
+      (MF.getTarget().getFrameInfo()->getStackGrowthDirection() ==
+       TargetFrameInfo::StackGrowsUp ?
+       TAI->getAddressSize() : -TAI->getAddressSize());
+
+    if (StackSize) {
+      // Show update of SP.
+      if (hasFP(MF)) {
+        // Adjust SP
+        MachineLocation SPDst(MachineLocation::VirtualFP);
+        MachineLocation SPSrc(MachineLocation::VirtualFP, 2*stackGrowth);
+        Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+      } else {
+        MachineLocation SPDst(MachineLocation::VirtualFP);
+        MachineLocation SPSrc(MachineLocation::VirtualFP, -StackSize+stackGrowth);
+        Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+      }
+    } else {
+      //FIXME: Verify & implement for FP
+      MachineLocation SPDst(StackPtr);
+      MachineLocation SPSrc(StackPtr, stackGrowth);
+      Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+    }
+            
+    // Add callee saved registers to move list.
+    const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+    for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+      int64_t Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
+      unsigned Reg = CSI[I].getReg();
+      MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+      MachineLocation CSSrc(Reg);
+      Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc));
+    }
+    
+    if (hasFP(MF)) {
+      // Save FP
+      MachineLocation FPDst(MachineLocation::VirtualFP, 2*stackGrowth);
+      MachineLocation FPSrc(FramePtr);
+      Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+    }
+    
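+    // From ReadyLabelId onwards, frame addresses are computed relative to the
+    // frame register (or the stack pointer when there is no frame pointer).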
+    MachineLocation FPDst(hasFP(MF) ? FramePtr : StackPtr);
+    MachineLocation FPSrc(MachineLocation::VirtualFP);
+    Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+  }
+
+  // If it's main() on Cygwin/MinGW, we should align the stack as well.
+  if (Fn->hasExternalLinkage() && Fn->getName() == "main" &&
+      Subtarget->isTargetCygMing()) {
+    BuildMI(MBB, MBBI, TII.get(X86::AND32ri), X86::ESP)
+                .addReg(X86::ESP).addImm(-Align);
+
+    // Probe the stack
+    BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX).addImm(Align);
+    BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32)).addExternalSymbol("_alloca");
+  }
+}
+
+void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  unsigned RetOpcode = MBBI->getOpcode();
+
+  switch (RetOpcode) {
+  case X86::RET:
+  case X86::RETI:
+  case X86::EH_RETURN:
+  case X86::TAILJMPd:
+  case X86::TAILJMPr:
+  case X86::TAILJMPm: break;  // These are ok
+  default:
+    assert(0 && "Can only insert epilog into returning blocks");
+  }
+
+  // Get the number of bytes to allocate from the FrameInfo
+  uint64_t StackSize = MFI->getStackSize();
+  unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+  uint64_t NumBytes = StackSize - CSSize;
+
+  if (hasFP(MF)) {
+    // pop EBP.
+    BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr);
+    NumBytes -= SlotSize;
+  }
+
+  // Skip the callee-saved pop instructions.
+  while (MBBI != MBB.begin()) {
+    MachineBasicBlock::iterator PI = prior(MBBI);      
+    if (PI->getOpcode() != X86::POP32r && PI->getOpcode() != X86::POP64r)
+      break;
+    --MBBI;
+  }
+
+  // If dynamic alloca is used, then reset esp to point to the last
+  // callee-saved slot before popping them off!
+  if (MFI->hasVarSizedObjects()) {
+    unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r;
+    MachineInstr *MI = addRegOffset(BuildMI(TII.get(Opc), StackPtr),
+                                    FramePtr, -CSSize);
+    MBB.insert(MBBI, MI);
+    NumBytes = 0;
+  }
+
+  if (NumBytes) {    // adjust stack pointer back: ESP += numbytes
+    // If there is an ADD32ri or SUB32ri of ESP immediately before this
+    // instruction, merge the two instructions.
+    if (MBBI != MBB.begin()) {
+      MachineBasicBlock::iterator PI = prior(MBBI);
+      unsigned Opc = PI->getOpcode();
+      if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+           Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+          PI->getOperand(0).getReg() == StackPtr) {
+        NumBytes += PI->getOperand(2).getImm();
+        MBB.erase(PI);
+      } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+                  Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+                 PI->getOperand(0).getReg() == StackPtr) {
+        NumBytes -= PI->getOperand(2).getImm();
+        MBB.erase(PI);
+      }
+    }
+
+    if (NumBytes)
+      emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
+  }
+
+  // We're returning from function via eh_return.
+  if (RetOpcode == X86::EH_RETURN) {
+    MBBI = prior(MBB.end());
+    MachineOperand &DestAddr  = MBBI->getOperand(0);
+    assert(DestAddr.isReg() && "Offset should be in register!");
+    BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),StackPtr).
+      addReg(DestAddr.getReg());
+  }
+}
+
+unsigned X86RegisterInfo::getRARegister() const {
+  if (Is64Bit)
+    return X86::RIP;  // Should have dwarf #16
+  else
+    return X86::EIP;  // Should have dwarf #8
+}
+
+unsigned X86RegisterInfo::getFrameRegister(MachineFunction &MF) const {
+  return hasFP(MF) ? FramePtr : StackPtr;
+}
+
+void X86RegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves)
+                                                                         const {
+  // Calculate the number of bytes used to store the return address.
+  int stackGrowth = (Is64Bit ? -8 : -4);
+
+  // Initial state of the frame pointer is esp+4.
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(StackPtr, stackGrowth);
+  Moves.push_back(MachineMove(0, Dst, Src));
+
+  // Add return address to move list
+  MachineLocation CSDst(StackPtr, stackGrowth);
+  MachineLocation CSSrc(getRARegister());
+  Moves.push_back(MachineMove(0, CSDst, CSSrc));
+}
+
+unsigned X86RegisterInfo::getEHExceptionRegister() const {
+  assert(0 && "What is the exception register");
+  return 0;
+}
+
+unsigned X86RegisterInfo::getEHHandlerRegister() const {
+  assert(0 && "What is the exception handler register");
+  return 0;
+}
+
+namespace llvm {
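+// getX86SubSuperRegister - Return the sub- or super-register of Reg matching
+// the width of VT.  For i8, High selects the AH/BH/CH/DH form.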
+unsigned getX86SubSuperRegister(unsigned Reg, MVT::ValueType VT, bool High) {
+  switch (VT) {
+  default: return Reg;
+  case MVT::i8:
+    if (High) {
+      switch (Reg) {
+      default: return 0;
+      case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+        return X86::AH;
+      case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+        return X86::DH;
+      case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+        return X86::CH;
+      case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+        return X86::BH;
+      }
+    } else {
+      switch (Reg) {
+      default: return 0;
+      case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+        return X86::AL;
+      case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+        return X86::DL;
+      case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+        return X86::CL;
+      case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+        return X86::BL;
+      case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+        return X86::SIL;
+      case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+        return X86::DIL;
+      case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+        return X86::BPL;
+      case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+        return X86::SPL;
+      case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+        return X86::R8B;
+      case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+        return X86::R9B;
+      case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+        return X86::R10B;
+      case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+        return X86::R11B;
+      case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+        return X86::R12B;
+      case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+        return X86::R13B;
+      case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+        return X86::R14B;
+      case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+        return X86::R15B;
+      }
+    }
+  case MVT::i16:
+    switch (Reg) {
+    default: return Reg;
+    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+      return X86::AX;
+    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+      return X86::DX;
+    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+      return X86::CX;
+    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+      return X86::BX;
+    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+      return X86::SI;
+    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+      return X86::DI;
+    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+      return X86::BP;
+    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+      return X86::SP;
+    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+      return X86::R8W;
+    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+      return X86::R9W;
+    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+      return X86::R10W;
+    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+      return X86::R11W;
+    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+      return X86::R12W;
+    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+      return X86::R13W;
+    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+      return X86::R14W;
+    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+      return X86::R15W;
+    }
+  case MVT::i32:
+    switch (Reg) {
+    default: return Reg;
+    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+      return X86::EAX;
+    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+      return X86::EDX;
+    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+      return X86::ECX;
+    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+      return X86::EBX;
+    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+      return X86::ESI;
+    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+      return X86::EDI;
+    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+      return X86::EBP;
+    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+      return X86::ESP;
+    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+      return X86::R8D;
+    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+      return X86::R9D;
+    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+      return X86::R10D;
+    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+      return X86::R11D;
+    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+      return X86::R12D;
+    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+      return X86::R13D;
+    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+      return X86::R14D;
+    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+      return X86::R15D;
+    }
+  case MVT::i64:
+    switch (Reg) {
+    default: return Reg;
+    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+      return X86::RAX;
+    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+      return X86::RDX;
+    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+      return X86::RCX;
+    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+      return X86::RBX;
+    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+      return X86::RSI;
+    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+      return X86::RDI;
+    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+      return X86::RBP;
+    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+      return X86::RSP;
+    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+      return X86::R8;
+    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+      return X86::R9;
+    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+      return X86::R10;
+    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+      return X86::R11;
+    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+      return X86::R12;
+    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+      return X86::R13;
+    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+      return X86::R14;
+    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+      return X86::R15;
+    }
+  }
+
+  return Reg;
+}
+}
+
+#include "X86GenRegisterInfo.inc"
+
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
new file mode 100644
index 0000000..ab9e33f
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -0,0 +1,130 @@
+//===- X86RegisterInfo.h - X86 Register Information Impl --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86REGISTERINFO_H
+#define X86REGISTERINFO_H
+
+#include "llvm/Target/MRegisterInfo.h"
+#include "X86GenRegisterInfo.h.inc"
+
+namespace llvm {
+  class Type;
+  class TargetInstrInfo;
+  class X86TargetMachine;
+
+class X86RegisterInfo : public X86GenRegisterInfo {
+public:
+  X86TargetMachine &TM;
+  const TargetInstrInfo &TII;
+
+private:
+  /// Is64Bit - Is the target 64-bits.
+  bool Is64Bit;
+
+  /// SlotSize - Stack slot size in bytes.
+  unsigned SlotSize;
+
+  /// StackPtr - X86 physical register used as stack ptr.
+  unsigned StackPtr;
+
+  /// FramePtr - X86 physical register used as frame ptr.
+  unsigned FramePtr;
+
+public:
+  X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
+
+  /// Code Generation virtual methods...
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI) const;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI) const;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, int FrameIndex,
+                           const TargetRegisterClass *RC) const;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC) const;
+
+  void copyRegToReg(MachineBasicBlock &MBB,
+                    MachineBasicBlock::iterator MI,
+                    unsigned DestReg, unsigned SrcReg,
+                    const TargetRegisterClass *RC) const;
+ 
+  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                     unsigned DestReg, const MachineInstr *Orig) const;
+
+  /// foldMemoryOperand - If this target supports it, fold a load or store of
+  /// the specified stack slot into the specified machine instruction for the
+  /// specified operand.  If this is possible, the target should perform the
+  /// folding and return true, otherwise it should return false.  If it folds
+  /// the instruction, it is likely that the MachineInstruction the iterator
+  /// references has been changed.
+  MachineInstr* foldMemoryOperand(MachineInstr* MI,
+                                  unsigned OpNum,
+                                  int FrameIndex) const;
+
+  /// getCalleeSavedRegs - Return a null-terminated list of all of the
+  /// callee-save registers on this target.
+  const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+
+  /// getCalleeSavedRegClasses - Return a null-terminated list of the preferred
+  /// register classes to spill each callee-saved register with.  The order and
+  /// length of this list match the getCalleeSavedRegs() list.
+  const TargetRegisterClass* const* getCalleeSavedRegClasses(
+                                     const MachineFunction *MF = 0) const;
+
+  /// getReservedRegs - Returns a bitset indexed by physical register number
+  /// indicating if a register is a special register that has particular uses
+  /// and should be considered unavailable at all times, e.g. SP, RA.  This is
+  /// used by the register scavenger to determine which registers are free.
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI) const;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                           int SPAdj, RegScavenger *RS = NULL) const;
+
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(MachineFunction &MF) const;
+  void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+};
+
+// getX86SubSuperRegister - X86 utility function. It returns the sub or super
+// register of a specific X86 register.
+// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) returns X86::AX.
+unsigned getX86SubSuperRegister(unsigned, MVT::ValueType, bool High=false);
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
new file mode 100644
index 0000000..a1e7bb9
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -0,0 +1,468 @@
+//===- X86RegisterInfo.td - Describe the X86 Register File ------*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 Register file, defining the registers themselves,
+// aliases between the registers, and the register classes built out of the
+// registers.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Register definitions...
+//
+let Namespace = "X86" in {
+
+  // In the register alias definitions below, we define which registers alias
+  // which others.  We only specify which registers the small registers alias,
+  // because the register file generator is smart enough to figure out that
+  // AL aliases AX if we tell it that AX aliases AL (for example).
+
+  // FIXME: X86-64 has different Dwarf numbers.
+  // 8-bit registers
+  // Low registers
+  def AL : Register<"AL">, DwarfRegNum<0>;
+  def CL : Register<"CL">, DwarfRegNum<1>;
+  def DL : Register<"DL">, DwarfRegNum<2>;
+  def BL : Register<"BL">, DwarfRegNum<3>;
+
+  // X86-64 only
+  def SIL : Register<"SIL">, DwarfRegNum<4>;
+  def DIL : Register<"DIL">, DwarfRegNum<5>;
+  def BPL : Register<"BPL">, DwarfRegNum<6>;
+  def SPL : Register<"SPL">, DwarfRegNum<7>;
+  def R8B  : Register<"R8B">,  DwarfRegNum<8>;
+  def R9B  : Register<"R9B">,  DwarfRegNum<9>;
+  def R10B : Register<"R10B">, DwarfRegNum<10>;
+  def R11B : Register<"R11B">, DwarfRegNum<11>;
+  def R12B : Register<"R12B">, DwarfRegNum<12>;
+  def R13B : Register<"R13B">, DwarfRegNum<13>;
+  def R14B : Register<"R14B">, DwarfRegNum<14>;
+  def R15B : Register<"R15B">, DwarfRegNum<15>;
+
+  // High registers X86-32 only
+  def AH : Register<"AH">, DwarfRegNum<0>;
+  def CH : Register<"CH">, DwarfRegNum<1>;
+  def DH : Register<"DH">, DwarfRegNum<2>;
+  def BH : Register<"BH">, DwarfRegNum<3>;
+
+  // 16-bit registers
+  def AX : RegisterWithSubRegs<"AX", [AH,AL]>, DwarfRegNum<0>;
+  def CX : RegisterWithSubRegs<"CX", [CH,CL]>, DwarfRegNum<1>;
+  def DX : RegisterWithSubRegs<"DX", [DH,DL]>, DwarfRegNum<2>;
+  def BX : RegisterWithSubRegs<"BX", [BH,BL]>, DwarfRegNum<3>;
+  def SP : RegisterWithSubRegs<"SP", [SPL]>, DwarfRegNum<4>;
+  def BP : RegisterWithSubRegs<"BP", [BPL]>, DwarfRegNum<5>;
+  def SI : RegisterWithSubRegs<"SI", [SIL]>, DwarfRegNum<6>;
+  def DI : RegisterWithSubRegs<"DI", [DIL]>, DwarfRegNum<7>;
+  def IP : Register<"IP">, DwarfRegNum<8>;  
+  
+  // X86-64 only
+  def R8W  : RegisterWithSubRegs<"R8W", [R8B]>, DwarfRegNum<8>;
+  def R9W  : RegisterWithSubRegs<"R9W", [R9B]>, DwarfRegNum<9>;
+  def R10W : RegisterWithSubRegs<"R10W", [R10B]>, DwarfRegNum<10>;
+  def R11W : RegisterWithSubRegs<"R11W", [R11B]>, DwarfRegNum<11>;
+  def R12W : RegisterWithSubRegs<"R12W", [R12B]>, DwarfRegNum<12>;
+  def R13W : RegisterWithSubRegs<"R13W", [R13B]>, DwarfRegNum<13>;
+  def R14W : RegisterWithSubRegs<"R14W", [R14B]>, DwarfRegNum<14>;
+  def R15W : RegisterWithSubRegs<"R15W", [R15B]>, DwarfRegNum<15>;
+
+  // 32-bit registers
+  def EAX : RegisterWithSubRegs<"EAX", [AX]>, DwarfRegNum<0>;
+  def ECX : RegisterWithSubRegs<"ECX", [CX]>, DwarfRegNum<1>;
+  def EDX : RegisterWithSubRegs<"EDX", [DX]>, DwarfRegNum<2>;
+  def EBX : RegisterWithSubRegs<"EBX", [BX]>, DwarfRegNum<3>;
+  def ESP : RegisterWithSubRegs<"ESP", [SP]>, DwarfRegNum<4>;
+  def EBP : RegisterWithSubRegs<"EBP", [BP]>, DwarfRegNum<5>;
+  def ESI : RegisterWithSubRegs<"ESI", [SI]>, DwarfRegNum<6>;
+  def EDI : RegisterWithSubRegs<"EDI", [DI]>, DwarfRegNum<7>;
+  def EIP : RegisterWithSubRegs<"EIP", [IP]>, DwarfRegNum<8>;  
+  
+  // X86-64 only
+  def R8D  : RegisterWithSubRegs<"R8D", [R8W]>, DwarfRegNum<8>;
+  def R9D  : RegisterWithSubRegs<"R9D", [R9W]>, DwarfRegNum<9>;
+  def R10D : RegisterWithSubRegs<"R10D", [R10W]>, DwarfRegNum<10>;
+  def R11D : RegisterWithSubRegs<"R11D", [R11W]>, DwarfRegNum<11>;
+  def R12D : RegisterWithSubRegs<"R12D", [R12W]>, DwarfRegNum<12>;
+  def R13D : RegisterWithSubRegs<"R13D", [R13W]>, DwarfRegNum<13>;
+  def R14D : RegisterWithSubRegs<"R14D", [R14W]>, DwarfRegNum<14>;
+  def R15D : RegisterWithSubRegs<"R15D", [R15W]>, DwarfRegNum<15>;
+
+  // 64-bit registers, X86-64 only
+  def RAX : RegisterWithSubRegs<"RAX", [EAX]>, DwarfRegNum<0>;
+  def RDX : RegisterWithSubRegs<"RDX", [EDX]>, DwarfRegNum<1>;
+  def RCX : RegisterWithSubRegs<"RCX", [ECX]>, DwarfRegNum<2>;
+  def RBX : RegisterWithSubRegs<"RBX", [EBX]>, DwarfRegNum<3>;
+  def RSI : RegisterWithSubRegs<"RSI", [ESI]>, DwarfRegNum<4>;
+  def RDI : RegisterWithSubRegs<"RDI", [EDI]>, DwarfRegNum<5>;
+  def RBP : RegisterWithSubRegs<"RBP", [EBP]>, DwarfRegNum<6>;
+  def RSP : RegisterWithSubRegs<"RSP", [ESP]>, DwarfRegNum<7>;
+
+  def R8  : RegisterWithSubRegs<"R8", [R8D]>, DwarfRegNum<8>;
+  def R9  : RegisterWithSubRegs<"R9", [R9D]>, DwarfRegNum<9>;
+  def R10 : RegisterWithSubRegs<"R10", [R10D]>, DwarfRegNum<10>;
+  def R11 : RegisterWithSubRegs<"R11", [R11D]>, DwarfRegNum<11>;
+  def R12 : RegisterWithSubRegs<"R12", [R12D]>, DwarfRegNum<12>;
+  def R13 : RegisterWithSubRegs<"R13", [R13D]>, DwarfRegNum<13>;
+  def R14 : RegisterWithSubRegs<"R14", [R14D]>, DwarfRegNum<14>;
+  def R15 : RegisterWithSubRegs<"R15", [R15D]>, DwarfRegNum<15>;
+  def RIP : RegisterWithSubRegs<"RIP", [EIP]>,  DwarfRegNum<16>;
+
+  // MMX Registers. These are actually aliased to ST0 .. ST7
+  def MM0 : Register<"MM0">, DwarfRegNum<29>;
+  def MM1 : Register<"MM1">, DwarfRegNum<30>;
+  def MM2 : Register<"MM2">, DwarfRegNum<31>;
+  def MM3 : Register<"MM3">, DwarfRegNum<32>;
+  def MM4 : Register<"MM4">, DwarfRegNum<33>;
+  def MM5 : Register<"MM5">, DwarfRegNum<34>;
+  def MM6 : Register<"MM6">, DwarfRegNum<35>;
+  def MM7 : Register<"MM7">, DwarfRegNum<36>;
+  
+  // Pseudo Floating Point registers
+  def FP0 : Register<"FP0">, DwarfRegNum<-1>;
+  def FP1 : Register<"FP1">, DwarfRegNum<-1>;
+  def FP2 : Register<"FP2">, DwarfRegNum<-1>;
+  def FP3 : Register<"FP3">, DwarfRegNum<-1>;
+  def FP4 : Register<"FP4">, DwarfRegNum<-1>;
+  def FP5 : Register<"FP5">, DwarfRegNum<-1>;
+  def FP6 : Register<"FP6">, DwarfRegNum<-1>; 
+
+  // XMM Registers, used by the various SSE instruction set extensions
+  def XMM0: Register<"XMM0">, DwarfRegNum<17>;
+  def XMM1: Register<"XMM1">, DwarfRegNum<18>;
+  def XMM2: Register<"XMM2">, DwarfRegNum<19>;
+  def XMM3: Register<"XMM3">, DwarfRegNum<20>;
+  def XMM4: Register<"XMM4">, DwarfRegNum<21>;
+  def XMM5: Register<"XMM5">, DwarfRegNum<22>;
+  def XMM6: Register<"XMM6">, DwarfRegNum<23>;
+  def XMM7: Register<"XMM7">, DwarfRegNum<24>;
+
+  // X86-64 only
+  def XMM8:  Register<"XMM8">,  DwarfRegNum<25>;
+  def XMM9:  Register<"XMM9">,  DwarfRegNum<26>;
+  def XMM10: Register<"XMM10">, DwarfRegNum<27>;
+  def XMM11: Register<"XMM11">, DwarfRegNum<28>;
+  def XMM12: Register<"XMM12">, DwarfRegNum<29>;
+  def XMM13: Register<"XMM13">, DwarfRegNum<30>;
+  def XMM14: Register<"XMM14">, DwarfRegNum<31>;
+  def XMM15: Register<"XMM15">, DwarfRegNum<32>;
+
+  // Floating point stack registers
+  def ST0 : Register<"ST(0)">, DwarfRegNum<11>;
+  def ST1 : Register<"ST(1)">, DwarfRegNum<12>;
+  def ST2 : Register<"ST(2)">, DwarfRegNum<13>;
+  def ST3 : Register<"ST(3)">, DwarfRegNum<14>;
+  def ST4 : Register<"ST(4)">, DwarfRegNum<15>;
+  def ST5 : Register<"ST(5)">, DwarfRegNum<16>;
+  def ST6 : Register<"ST(6)">, DwarfRegNum<17>;
+  def ST7 : Register<"ST(7)">, DwarfRegNum<18>; 
+}
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes.  The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// List call-clobbered registers before callee-save registers. RBX, RBP, (and 
+// R12, R13, R14, and R15 for X86-64) are callee-save registers.
+// In 64-bit mode, there are 12 additional i8 registers: SIL, DIL, BPL, SPL, and
+// R8B ... R15B.
+// FIXME: Allow AH, CH, DH, BH in 64-bit mode for non-REX instructions.
+def GR8 : RegisterClass<"X86", [i8],  8,
+                        [AL, CL, DL, BL, AH, CH, DH, BH, SIL, DIL, BPL, SPL,
+                         R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]> {
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+      // Does the function dedicate RBP / EBP to being a frame ptr?
+      // If so, don't allocate SPL or BPL.
+      static const unsigned X86_GR8_AO_64_fp[] =
+      {X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL,
+       X86::R8B, X86::R9B, X86::R10B, X86::R11B,
+       X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B};
+      // If not, just don't allocate SPL.
+      static const unsigned X86_GR8_AO_64[] =
+      {X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL,
+       X86::R8B, X86::R9B, X86::R10B, X86::R11B,
+       X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B, X86::BPL};
+      // In 32-bit mode, none of the 8-bit registers aliases EBP or ESP.
+      static const unsigned X86_GR8_AO_32[] =
+      {X86::AL, X86::CL, X86::DL, X86::AH, X86::CH, X86::DH, X86::BL, X86::BH};
+
+    GR8Class::iterator
+    GR8Class::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (!Subtarget.is64Bit())
+        return X86_GR8_AO_32;
+      else if (RI->hasFP(MF))
+        return X86_GR8_AO_64_fp;
+      else
+        return X86_GR8_AO_64;
+    }
+
+    GR8Class::iterator
+    GR8Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (!Subtarget.is64Bit())
+        return X86_GR8_AO_32 + (sizeof(X86_GR8_AO_32) / sizeof(unsigned));
+      else if (RI->hasFP(MF))
+        return X86_GR8_AO_64_fp + (sizeof(X86_GR8_AO_64_fp) / sizeof(unsigned));
+      else
+        return X86_GR8_AO_64 + (sizeof(X86_GR8_AO_64) / sizeof(unsigned));
+    }
+  }];
+}
+
+
+def GR16 : RegisterClass<"X86", [i16], 16,
+                         [AX, CX, DX, SI, DI, BX, BP, SP,
+                          R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]> {
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+      // Does the function dedicate RBP / EBP to being a frame ptr?
+      // If so, don't allocate SP or BP.
+      static const unsigned X86_GR16_AO_64_fp[] =
+      {X86::AX, X86::CX, X86::DX, X86::SI, X86::DI,
+       X86::R8W, X86::R9W, X86::R10W, X86::R11W,
+       X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W};
+      static const unsigned X86_GR16_AO_32_fp[] =
+      {X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX};
+      // If not, just don't allocate SP.
+      static const unsigned X86_GR16_AO_64[] =
+      {X86::AX, X86::CX, X86::DX, X86::SI, X86::DI,
+       X86::R8W, X86::R9W, X86::R10W, X86::R11W,
+       X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W, X86::BP};
+      static const unsigned X86_GR16_AO_32[] =
+      {X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX, X86::BP};
+
+    GR16Class::iterator
+    GR16Class::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit()) {
+        if (RI->hasFP(MF))
+          return X86_GR16_AO_64_fp;
+        else
+          return X86_GR16_AO_64;
+      } else {
+        if (RI->hasFP(MF))
+          return X86_GR16_AO_32_fp;
+        else
+          return X86_GR16_AO_32;
+      }
+    }
+
+    GR16Class::iterator
+    GR16Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit()) {
+        if (RI->hasFP(MF))
+          return X86_GR16_AO_64_fp+(sizeof(X86_GR16_AO_64_fp)/sizeof(unsigned));
+        else
+          return X86_GR16_AO_64 + (sizeof(X86_GR16_AO_64) / sizeof(unsigned));
+      } else {
+        if (RI->hasFP(MF))
+          return X86_GR16_AO_32_fp+(sizeof(X86_GR16_AO_32_fp)/sizeof(unsigned));
+        else
+          return X86_GR16_AO_32 + (sizeof(X86_GR16_AO_32) / sizeof(unsigned));
+      }
+    }
+  }];
+}
+
+
+def GR32 : RegisterClass<"X86", [i32], 32, 
+                         [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
+                          R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D]> {
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+      // Does the function dedicate RBP / EBP to being a frame ptr?
+      // If so, don't allocate ESP or EBP.
+      static const unsigned X86_GR32_AO_64_fp[] =
+      {X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
+       X86::R8D, X86::R9D, X86::R10D, X86::R11D,
+       X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D};
+      static const unsigned X86_GR32_AO_32_fp[] =
+      {X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX};
+      // If not, just don't allocate ESP.
+      static const unsigned X86_GR32_AO_64[] =
+      {X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
+       X86::R8D, X86::R9D, X86::R10D, X86::R11D,
+       X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP};
+      static const unsigned X86_GR32_AO_32[] =
+      {X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP};
+
+    GR32Class::iterator
+    GR32Class::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit()) {
+        if (RI->hasFP(MF))
+          return X86_GR32_AO_64_fp;
+        else
+          return X86_GR32_AO_64;
+      } else {
+        if (RI->hasFP(MF))
+          return X86_GR32_AO_32_fp;
+        else
+          return X86_GR32_AO_32;
+      }
+    }
+
+    GR32Class::iterator
+    GR32Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit()) {
+        if (RI->hasFP(MF))
+          return X86_GR32_AO_64_fp+(sizeof(X86_GR32_AO_64_fp)/sizeof(unsigned));
+        else
+          return X86_GR32_AO_64 + (sizeof(X86_GR32_AO_64) / sizeof(unsigned));
+      } else {
+        if (RI->hasFP(MF))
+          return X86_GR32_AO_32_fp+(sizeof(X86_GR32_AO_32_fp)/sizeof(unsigned));
+        else
+          return X86_GR32_AO_32 + (sizeof(X86_GR32_AO_32) / sizeof(unsigned));
+      }
+    }
+  }];
+}
+
+
+def GR64 : RegisterClass<"X86", [i64], 64, 
+                         [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+                          RBX, R14, R15, R12, R13, RBP, RSP]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    GR64Class::iterator
+    GR64Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MRegisterInfo *RI = TM.getRegisterInfo();
+      if (RI->hasFP(MF)) // Does the function dedicate RBP to being a frame ptr?
+        return end()-2;  // If so, don't allocate RSP or RBP
+      else
+        return end()-1;  // If not, just don't allocate RSP
+    }
+  }];
+}
+
+
+// GR16, GR32 subclasses which contain registers that have R8 sub-registers.
+// These should only be used for 32-bit mode.
+def GR16_ : RegisterClass<"X86", [i16], 16, [AX, CX, DX, BX]>;
+def GR32_ : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, EBX]>;
+
+// Scalar SSE2 floating point registers.
+def FR32 : RegisterClass<"X86", [f32], 32,
+                         [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+                          XMM8, XMM9, XMM10, XMM11,
+                          XMM12, XMM13, XMM14, XMM15]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    FR32Class::iterator
+    FR32Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (!Subtarget.is64Bit())
+        return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+      else
+        return end();
+    }
+  }];
+}
+
+def FR64 : RegisterClass<"X86", [f64], 64,
+                         [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+                          XMM8, XMM9, XMM10, XMM11,
+                          XMM12, XMM13, XMM14, XMM15]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    FR64Class::iterator
+    FR64Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (!Subtarget.is64Bit())
+        return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+      else
+        return end();
+    }
+  }];
+}
+
+
+// FIXME: This sets up the floating point register files as though they are f64
+// values, though they really are f80 values.  This will cause us to spill
+// values as 64-bit quantities instead of 80-bit quantities, which is much much
+// faster on common hardware.  In reality, this should be controlled by a
+// command line option or something.
+
+def RFP32 : RegisterClass<"X86", [f32], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+def RFP64 : RegisterClass<"X86", [f64], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+
+// Floating point stack registers (these are not allocatable by the
+// register allocator - the floating point stackifier is responsible
+// for transforming FPn allocations to STn registers)
+def RST : RegisterClass<"X86", [f64], 32,
+                        [ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7]> {
+    let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    RSTClass::iterator
+    RSTClass::allocation_order_end(const MachineFunction &MF) const {
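+      // Returning begin() leaves this class with no allocatable registers; the
+      // floating point stackifier hands out the ST(n) registers itself.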
+      return begin();
+    }
+  }];
+}
+
+// Generic vector registers: VR64 and VR128.
+def VR64  : RegisterClass<"X86", [v8i8, v4i16, v2i32, v1i64], 64,
+                          [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7]>;
+def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128,
+                          [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+                           XMM8, XMM9, XMM10, XMM11,
+                           XMM12, XMM13, XMM14, XMM15]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    VR128Class::iterator
+    VR128Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (!Subtarget.is64Bit())
+        return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+      else
+        return end();
+    }
+  }];
+}
diff --git a/lib/Target/X86/X86Relocations.h b/lib/Target/X86/X86Relocations.h
new file mode 100644
index 0000000..3dd2b24
--- /dev/null
+++ b/lib/Target/X86/X86Relocations.h
@@ -0,0 +1,34 @@
+//===- X86Relocations.h - X86 Code Relocations ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86RELOCATIONS_H
+#define X86RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace X86 {
+    enum RelocationType {
+      // reloc_pcrel_word - PC relative relocation, add the relocated value to
+      // the value already in memory, after we adjust it for where the PC is.
+      reloc_pcrel_word = 0,
+
+      // reloc_absolute_word, reloc_absolute_dword - Absolute relocation, just
+      // add the relocated value to the value already in memory.
+      reloc_absolute_word = 1,
+      reloc_absolute_dword = 2
+    };
+  }
+}
+
+#endif
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
new file mode 100644
index 0000000..1a75e04
--- /dev/null
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -0,0 +1,293 @@
+//===-- X86Subtarget.cpp - X86 Subtarget Information ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Nate Begeman and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86Subtarget.h"
+#include "X86GenSubtarget.inc"
+#include "llvm/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+cl::opt<X86Subtarget::AsmWriterFlavorTy>
+AsmWriterFlavor("x86-asm-syntax", cl::init(X86Subtarget::Unset),
+  cl::desc("Choose style of code to emit from X86 backend:"),
+  cl::values(
+    clEnumValN(X86Subtarget::ATT,   "att",   "  Emit AT&T-style assembly"),
+    clEnumValN(X86Subtarget::Intel, "intel", "  Emit Intel-style assembly"),
+    clEnumValEnd));
+
+
+/// True if accessing the GV requires an extra load. For Windows, dllimported
+/// symbols are indirect, loading the value at address GV rather than the
+/// value of GV itself. This means that the GlobalAddress must be in the base
+/// or index register of the address, not the GV offset field.
+bool X86Subtarget::GVRequiresExtraLoad(const GlobalValue* GV,
+                                       const TargetMachine& TM,
+                                       bool isDirectCall) const
+{
+  // FIXME: PIC
+  if (TM.getRelocationModel() != Reloc::Static)
+    if (isTargetDarwin()) {
+      return (!isDirectCall &&
+              (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+               (GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode())));
+    } else if (TM.getRelocationModel() == Reloc::PIC_ && isPICStyleGOT()) {
+      // Extra load is needed for all non-statics.
+      return (!isDirectCall &&
+              (GV->isDeclaration() || !GV->hasInternalLinkage()));
+    } else if (isTargetCygMing() || isTargetWindows()) {
+      return (GV->hasDLLImportLinkage());
+    }
+  
+  return false;
+}
+
+/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
+/// specified arguments.  If we can't run cpuid on the host, return true.
+bool X86::GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
+                          unsigned *rECX, unsigned *rEDX) {
+#if defined(__x86_64__)
+  // gcc doesn't know cpuid clobbers ebx/rbx; preserve it manually.
+  asm ("movq\t%%rbx, %%rsi\n\t"
+       "cpuid\n\t"
+       "xchgq\t%%rbx, %%rsi\n\t"
+       : "=a" (*rEAX),
+         "=S" (*rEBX),
+         "=c" (*rECX),
+         "=d" (*rEDX)
+       :  "a" (value));
+  return false;
+#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+#if defined(__GNUC__)
+  asm ("movl\t%%ebx, %%esi\n\t"
+       "cpuid\n\t"
+       "xchgl\t%%ebx, %%esi\n\t"
+       : "=a" (*rEAX),
+         "=S" (*rEBX),
+         "=c" (*rECX),
+         "=d" (*rEDX)
+       :  "a" (value));
+  return false;
+#elif defined(_MSC_VER)
+  __asm {
+    mov   eax,value
+    cpuid
+    mov   esi,rEAX
+    mov   dword ptr [esi],eax
+    mov   esi,rEBX
+    mov   dword ptr [esi],ebx
+    mov   esi,rECX
+    mov   dword ptr [esi],ecx
+    mov   esi,rEDX
+    mov   dword ptr [esi],edx
+  }
+  return false;
+#endif
+#endif
+  return true;
+}
+
+void X86Subtarget::AutoDetectSubtargetFeatures() {
+  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+  union {
+    unsigned u[3];
+    char     c[12];
+  } text;
+  
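+  // CPUID leaf 0 returns the vendor string in EBX, EDX, ECX order, hence the
+  // u+0, u+2, u+1 argument ordering below.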
+  if (X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1))
+    return;
+
+  X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
+  
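+  // CPUID leaf 1 feature bits: EDX bit 23 = MMX, bit 25 = SSE, bit 26 = SSE2;
+  // ECX bit 0 = SSE3, bit 9 = SSSE3.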
+  if ((EDX >> 23) & 0x1) X86SSELevel = MMX;
+  if ((EDX >> 25) & 0x1) X86SSELevel = SSE1;
+  if ((EDX >> 26) & 0x1) X86SSELevel = SSE2;
+  if (ECX & 0x1)         X86SSELevel = SSE3;
+  if ((ECX >> 9)  & 0x1) X86SSELevel = SSSE3;
+
+  if (memcmp(text.c, "GenuineIntel", 12) == 0 ||
+      memcmp(text.c, "AuthenticAMD", 12) == 0) {
+    X86::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
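+    // CPUID leaf 0x80000001: EDX bit 29 indicates long mode (x86-64) support.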
+    HasX86_64 = (EDX >> 29) & 0x1;
+  }
+}
+
+static const char *GetCurrentX86CPU() {
+  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+  if (X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX))
+    return "generic";
+  unsigned Family  = (EAX >> 8) & 0xf; // Bits 8 - 11
+  unsigned Model   = (EAX >> 4) & 0xf; // Bits 4 - 7
+  X86::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+  bool Em64T = (EDX >> 29) & 0x1;
+
+  union {
+    unsigned u[3];
+    char     c[12];
+  } text;
+
+  X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1);
+  if (memcmp(text.c, "GenuineIntel", 12) == 0) {
+    switch (Family) {
+      case 3:
+        return "i386";
+      case 4:
+        return "i486";
+      case 5:
+        switch (Model) {
+        case 4:  return "pentium-mmx";
+        default: return "pentium";
+        }
+      case 6:
+        switch (Model) {
+        case 1:  return "pentiumpro";
+        case 3:
+        case 5:
+        case 6:  return "pentium2";
+        case 7:
+        case 8:
+        case 10:
+        case 11: return "pentium3";
+        case 9:
+        case 13: return "pentium-m";
+        case 14: return "yonah";
+        case 15: return "core2";
+        default: return "i686";
+        }
+      case 15: {
+        switch (Model) {
+        case 3:  
+        case 4:
+          return (Em64T) ? "nocona" : "prescott";
+        default:
+          return (Em64T) ? "x86-64" : "pentium4";
+        }
+      }
+        
+    default:
+      return "generic";
+    }
+  } else if (memcmp(text.c, "AuthenticAMD", 12) == 0) {
+    // FIXME: this poorly matches the generated SubtargetFeatureKV table.  There
+    // appears to be no way to generate the wide variety of AMD-specific targets
+    // from the information returned from CPUID.
+    switch (Family) {
+      case 4:
+        return "i486";
+      case 5:
+        switch (Model) {
+        case 6:
+        case 7:  return "k6";
+        case 8:  return "k6-2";
+        case 9:
+        case 13: return "k6-3";
+        default: return "pentium";
+        }
+      case 6:
+        switch (Model) {
+        case 4:  return "athlon-tbird";
+        case 6:
+        case 7:
+        case 8:  return "athlon-mp";
+        case 10: return "athlon-xp";
+        default: return "athlon";
+        }
+      case 15:
+        switch (Model) {
+        case 1:  return "opteron";
+        case 5:  return "athlon-fx"; // also opteron
+        default: return "athlon64";
+        }
+    default:
+      return "generic";
+    }
+  } else {
+    return "generic";
+  }
+}
+
+X86Subtarget::X86Subtarget(const Module &M, const std::string &FS, bool is64Bit)
+  : AsmFlavor(AsmWriterFlavor)
+  , PICStyle(PICStyle::None)
+  , X86SSELevel(NoMMXSSE)
+  , X863DNowLevel(NoThreeDNow)
+  , HasX86_64(false)
+  , stackAlignment(8)
+  // FIXME: this is a known good value for Yonah. How about others?
+  , MinRepStrSizeThreshold(128)
+  , Is64Bit(is64Bit)
+  , TargetType(isELF) { // Default to ELF unless otherwise specified.
+
+  // Determine default and user specified characteristics
+  if (!FS.empty()) {
+    // If feature string is not empty, parse features string.
+    std::string CPU = GetCurrentX86CPU();
+    ParseSubtargetFeatures(FS, CPU);
+    
+    if (Is64Bit && !HasX86_64)
+      cerr << "Warning: Generation of 64-bit code for a 32-bit processor "
+           << "requested.\n";
+    if (Is64Bit && X86SSELevel < SSE2)
+      cerr << "Warning: 64-bit processors all have at least SSE2.\n";
+  } else {
+    // Otherwise, use CPUID to auto-detect feature set.
+    AutoDetectSubtargetFeatures();
+  }
+    
+  // If requesting codegen for X86-64, make sure that 64-bit and SSE2 features
+  // are enabled.  These are available on all x86-64 CPUs.
+  if (Is64Bit) {
+    HasX86_64 = true;
+    if (X86SSELevel < SSE2)
+      X86SSELevel = SSE2;
+  }
+
+  // Determine the target OS from the module's target triple; if the triple is
+  // empty, fall back to the host platform.
+  const std::string& TT = M.getTargetTriple();
+  if (TT.length() > 5) {
+    if (TT.find("cygwin") != std::string::npos)
+      TargetType = isCygwin;
+    else if (TT.find("mingw") != std::string::npos)
+      TargetType = isMingw;
+    else if (TT.find("darwin") != std::string::npos)
+      TargetType = isDarwin;
+    else if (TT.find("win32") != std::string::npos)
+      TargetType = isWindows;
+  } else if (TT.empty()) {
+#if defined(__CYGWIN__)
+    TargetType = isCygwin;
+#elif defined(__MINGW32__)
+    TargetType = isMingw;
+#elif defined(__APPLE__)
+    TargetType = isDarwin;
+#elif defined(_WIN32)
+    TargetType = isWindows;
+#endif
+  }
+
+  // If the asm syntax hasn't been overridden on the command line, use whatever
+  // the target wants.
+  if (AsmFlavor == X86Subtarget::Unset) {
+    if (TargetType == isWindows) {
+      AsmFlavor = X86Subtarget::Intel;
+    } else {
+      AsmFlavor = X86Subtarget::ATT;
+    }
+  }
+
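+  // Darwin, Cygwin, MinGW and 64-bit ELF targets use a 16-byte stack
+  // alignment; everything else keeps the 8-byte default set above.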
+  if (TargetType == isDarwin ||
+      TargetType == isCygwin ||
+      TargetType == isMingw  ||
+      (TargetType == isELF && Is64Bit))
+    stackAlignment = 16;
+}
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
new file mode 100644
index 0000000..2cda970
--- /dev/null
+++ b/lib/Target/X86/X86Subtarget.h
@@ -0,0 +1,156 @@
+//=====---- X86Subtarget.h - Define Subtarget for the X86 -----*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Nate Begeman and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86SUBTARGET_H
+#define X86SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+namespace llvm {
+class Module;
+class GlobalValue;
+class TargetMachine;
+  
+namespace PICStyle {
+enum Style {
+  Stub, GOT, RIPRel, WinPIC, None
+};
+}
+
+class X86Subtarget : public TargetSubtarget {
+public:
+  enum AsmWriterFlavorTy {
+    // Note: This numbering has to match the GCC assembler dialects for inline
+    // asm alternatives to work right.
+    ATT = 0, Intel = 1, Unset
+  };
+protected:
+  enum X86SSEEnum {
+    NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3
+  };
+
+  enum X863DNowEnum {
+    NoThreeDNow, ThreeDNow, ThreeDNowA
+  };
+
+  /// AsmFlavor - Which x86 asm dialect to use.
+  AsmWriterFlavorTy AsmFlavor;
+
+  /// PICStyle - Which PIC style to use
+  PICStyle::Style PICStyle;
+  
+  /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, or none supported.
+  X86SSEEnum X86SSELevel;
+
+  /// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported.
+  X863DNowEnum X863DNowLevel;
+
+  /// HasX86_64 - True if the processor supports X86-64 instructions.
+  bool HasX86_64;
+
+  /// stackAlignment - The minimum alignment known to hold for the stack frame
+  /// on entry to the function, which must be maintained by every function.
+  unsigned stackAlignment;
+
+  /// Min. memset / memcpy size that is turned into rep/movs, rep/stos ops.
+  unsigned MinRepStrSizeThreshold;
+
+private:
+  /// Is64Bit - True if the processor supports 64-bit instructions and module
+  /// pointer size is 64 bit.
+  bool Is64Bit;
+
+public:
+  enum {
+    isELF, isCygwin, isDarwin, isWindows, isMingw
+  } TargetType;
+
+  /// This constructor initializes the data members to match those
+  /// of the specified module.
+  ///
+  X86Subtarget(const Module &M, const std::string &FS, bool is64Bit);
+
+  /// getStackAlignment - Returns the minimum alignment known to hold for the
+  /// stack frame on entry to the function, and which must be maintained by
+  /// every function for this subtarget.
+  unsigned getStackAlignment() const { return stackAlignment; }
+
+  /// getMinRepStrSizeThreshold - Returns the minimum memset / memcpy size
+  /// required to turn the operation into an X86 rep/movs or rep/stos
+  /// instruction. This is only used if the src / dst alignment is not DWORD
+  /// aligned.
+  unsigned getMinRepStrSizeThreshold() const { return MinRepStrSizeThreshold; }
+
+  /// ParseSubtargetFeatures - Parses the feature string, setting the specified
+  /// subtarget options.  The definition of this function is auto-generated by
+  /// tblgen.
+  void ParseSubtargetFeatures(const std::string &FS, const std::string &CPU);
+
+  /// AutoDetectSubtargetFeatures - Auto-detect CPU features using CPUID
+  /// instruction.
+  void AutoDetectSubtargetFeatures();
+
+  bool is64Bit() const { return Is64Bit; }
+
+  PICStyle::Style getPICStyle() const { return PICStyle; }
+  void setPICStyle(PICStyle::Style Style)  { PICStyle = Style; }
+
+  bool hasMMX() const { return X86SSELevel >= MMX; }
+  bool hasSSE1() const { return X86SSELevel >= SSE1; }
+  bool hasSSE2() const { return X86SSELevel >= SSE2; }
+  bool hasSSE3() const { return X86SSELevel >= SSE3; }
+  bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
+  bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
+  bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
+
+  unsigned getAsmFlavor() const {
+    return AsmFlavor != Unset ? unsigned(AsmFlavor) : 0;
+  }
+
+  bool isFlavorAtt() const { return AsmFlavor == ATT; }
+  bool isFlavorIntel() const { return AsmFlavor == Intel; }
+
+  bool isTargetDarwin() const { return TargetType == isDarwin; }
+  bool isTargetELF() const { return TargetType == isELF; }
+  bool isTargetWindows() const { return TargetType == isWindows; }
+  bool isTargetMingw() const { return TargetType == isMingw; }
+  bool isTargetCygMing() const { return (TargetType == isMingw ||
+                                         TargetType == isCygwin); }
+  bool isTargetCygwin() const { return TargetType == isCygwin; }
+
+  bool isPICStyleSet() const { return PICStyle != PICStyle::None; }
+  bool isPICStyleGOT() const { return PICStyle == PICStyle::GOT; }
+  bool isPICStyleStub() const { return PICStyle == PICStyle::Stub; }
+  bool isPICStyleRIPRel() const { return PICStyle == PICStyle::RIPRel; }
+  bool isPICStyleWinPIC() const { return PICStyle == PICStyle::WinPIC; }
+    
+  /// True if accessing the GV requires an extra load. For Windows, dllimported
+  /// symbols are indirect, loading the value at address GV rather than the
+  /// value of GV itself. This means that the GlobalAddress must be in the base
+  /// or index register of the address, not the GV offset field.
+  bool GVRequiresExtraLoad(const GlobalValue* GV, const TargetMachine& TM,
+                           bool isDirectCall) const;
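+  // For example (illustrative): a dllimported global x on Windows is reached
+  // through the "__imp_"-prefixed pointer the linker provides, so codegen
+  // loads that pointer first and then dereferences it, rather than addressing
+  // x directly.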
+
+};
+
+namespace X86 {
+  /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in
+  /// the specified arguments.  If we can't run cpuid on the host, return true.
+  bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
+                       unsigned *rECX, unsigned *rEDX);
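+
+  // Illustrative usage sketch: read the CPUID leaf-1 feature bits, where EDX
+  // bit 26 indicates SSE2.
+  //   unsigned EAX, EBX, ECX, EDX;
+  //   bool HasSSE2 = false;
+  //   if (!X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX))
+  //     HasSSE2 = (EDX >> 26) & 1;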
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86TargetAsmInfo.cpp b/lib/Target/X86/X86TargetAsmInfo.cpp
new file mode 100644
index 0000000..4bb854e
--- /dev/null
+++ b/lib/Target/X86/X86TargetAsmInfo.cpp
@@ -0,0 +1,280 @@
+//===-- X86TargetAsmInfo.cpp - X86 asm properties ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the X86TargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetAsmInfo.h"
+#include "X86TargetMachine.h"
+#include "X86Subtarget.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
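+// Inline-asm constraint translation table: each pair maps an explicit register
+// constraint such as "{si}" or "{ax}" to the corresponding GCC constraint
+// letter ("S", "a", ...); entries that map to "" are simply dropped.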
+static const char* x86_asm_table[] = {"{si}", "S",
+                                      "{di}", "D",
+                                      "{ax}", "a",
+                                      "{cx}", "c",
+                                      "{memory}", "memory",
+                                      "{flags}", "",
+                                      "{dirflag}", "",
+                                      "{fpsr}", "",
+                                      "{cc}", "cc",
+                                      0,0};
+
+X86TargetAsmInfo::X86TargetAsmInfo(const X86TargetMachine &TM) {
+  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+  
+  // FIXME - Should be simplified.
+
+  AsmTransCBE = x86_asm_table;
+  
+  switch (Subtarget->TargetType) {
+  case X86Subtarget::isDarwin:
+    AlignmentIsInBytes = false;
+    GlobalPrefix = "_";
+    if (!Subtarget->is64Bit())
+      Data64bitsDirective = 0;       // we can't emit a 64-bit unit
+    ZeroDirective = "\t.space\t";  // ".space N" emits N zeros.
+    PrivateGlobalPrefix = "L";     // Marker for constant pool idxs
+    BSSSection = 0;                       // no BSS section.
+    ZeroFillDirective = "\t.zerofill\t";  // Uses .zerofill
+    ConstantPoolSection = "\t.const\n";
+    JumpTableDataSection = "\t.const\n";
+    CStringSection = "\t.cstring";
+    FourByteConstantSection = "\t.literal4\n";
+    EightByteConstantSection = "\t.literal8\n";
+    if (Subtarget->is64Bit())
+      SixteenByteConstantSection = "\t.literal16\n";
+    ReadOnlySection = "\t.const\n";
+    LCOMMDirective = "\t.lcomm\t";
+    COMMDirectiveTakesAlignment = false;
+    HasDotTypeDotSizeDirective = false;
+    if (TM.getRelocationModel() == Reloc::Static) {
+      StaticCtorsSection = ".constructor";
+      StaticDtorsSection = ".destructor";
+    } else {
+      StaticCtorsSection = ".mod_init_func";
+      StaticDtorsSection = ".mod_term_func";
+    }
+    InlineAsmStart = "# InlineAsm Start";
+    InlineAsmEnd = "# InlineAsm End";
+    SetDirective = "\t.set";
+    UsedDirective = "\t.no_dead_strip\t";
+    WeakRefDirective = "\t.weak_reference\t";
+    HiddenDirective = "\t.private_extern\t";
+    
+    // In non-PIC modes, emit a special label before jump tables so that the
+    // linker can perform more accurate dead code stripping.
+    if (TM.getRelocationModel() != Reloc::PIC_) {
+      // Emit a local label that is preserved until the linker runs.
+      JumpTableSpecialLabelPrefix = "l";
+    }
+
+    SupportsDebugInformation = true;
+    NeedsSet = true;
+    DwarfAbbrevSection = ".section __DWARF,__debug_abbrev,regular,debug";
+    DwarfInfoSection = ".section __DWARF,__debug_info,regular,debug";
+    DwarfLineSection = ".section __DWARF,__debug_line,regular,debug";
+    DwarfFrameSection = ".section __DWARF,__debug_frame,regular,debug";
+    DwarfPubNamesSection = ".section __DWARF,__debug_pubnames,regular,debug";
+    DwarfPubTypesSection = ".section __DWARF,__debug_pubtypes,regular,debug";
+    DwarfStrSection = ".section __DWARF,__debug_str,regular,debug";
+    DwarfLocSection = ".section __DWARF,__debug_loc,regular,debug";
+    DwarfARangesSection = ".section __DWARF,__debug_aranges,regular,debug";
+    DwarfRangesSection = ".section __DWARF,__debug_ranges,regular,debug";
+    DwarfMacInfoSection = ".section __DWARF,__debug_macinfo,regular,debug";
+    break;
+
+  case X86Subtarget::isELF:
+    ReadOnlySection = "\t.section\t.rodata";
+    FourByteConstantSection = "\t.section\t.rodata.cst4,\"aM\",@progbits,4";
+    EightByteConstantSection = "\t.section\t.rodata.cst8,\"aM\",@progbits,8";
+    SixteenByteConstantSection = "\t.section\t.rodata.cst16,\"aM\",@progbits,16";
+    CStringSection = "\t.section\t.rodata.str1.1,\"aMS\",@progbits,1";
+    PrivateGlobalPrefix = ".L";
+    WeakRefDirective = "\t.weak\t";
+    SetDirective = "\t.set\t";
+    PCSymbol = ".";
+
+    // Set up DWARF directives
+    HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
+    AbsoluteDebugSectionOffsets = true;
+    AbsoluteEHSectionOffsets = false;
+    SupportsDebugInformation = true;
+    DwarfAbbrevSection =  "\t.section\t.debug_abbrev,\"\",@progbits";
+    DwarfInfoSection =    "\t.section\t.debug_info,\"\",@progbits";
+    DwarfLineSection =    "\t.section\t.debug_line,\"\",@progbits";
+    DwarfFrameSection =   "\t.section\t.debug_frame,\"\",@progbits";
+    DwarfPubNamesSection ="\t.section\t.debug_pubnames,\"\",@progbits";
+    DwarfPubTypesSection ="\t.section\t.debug_pubtypes,\"\",@progbits";
+    DwarfStrSection =     "\t.section\t.debug_str,\"\",@progbits";
+    DwarfLocSection =     "\t.section\t.debug_loc,\"\",@progbits";
+    DwarfARangesSection = "\t.section\t.debug_aranges,\"\",@progbits";
+    DwarfRangesSection =  "\t.section\t.debug_ranges,\"\",@progbits";
+    DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"\",@progbits";
+
+    if (!Subtarget->is64Bit())
+      SupportsExceptionHandling = true;
+    DwarfEHFrameSection = "\t.section\t.eh_frame,\"aw\",@progbits";
+    DwarfExceptionSection = "\t.section\t.gcc_except_table,\"a\",@progbits";
+    break;
+
+  case X86Subtarget::isCygwin:
+  case X86Subtarget::isMingw:
+    GlobalPrefix = "_";
+    LCOMMDirective = "\t.lcomm\t";
+    COMMDirectiveTakesAlignment = false;
+    HasDotTypeDotSizeDirective = false;
+    StaticCtorsSection = "\t.section .ctors,\"aw\"";
+    StaticDtorsSection = "\t.section .dtors,\"aw\"";
+    HiddenDirective = NULL;
+    PrivateGlobalPrefix = "L";  // Prefix for private global symbols
+    WeakRefDirective = "\t.weak\t";
+    SetDirective = "\t.set\t";
+
+    // Set up DWARF directives
+    HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
+    AbsoluteDebugSectionOffsets = true;
+    AbsoluteEHSectionOffsets = false;
+    SupportsDebugInformation = true;
+    DwarfSectionOffsetDirective = "\t.secrel32\t";
+    DwarfAbbrevSection =  "\t.section\t.debug_abbrev,\"dr\"";
+    DwarfInfoSection =    "\t.section\t.debug_info,\"dr\"";
+    DwarfLineSection =    "\t.section\t.debug_line,\"dr\"";
+    DwarfFrameSection =   "\t.section\t.debug_frame,\"dr\"";
+    DwarfPubNamesSection ="\t.section\t.debug_pubnames,\"dr\"";
+    DwarfPubTypesSection ="\t.section\t.debug_pubtypes,\"dr\"";
+    DwarfStrSection =     "\t.section\t.debug_str,\"dr\"";
+    DwarfLocSection =     "\t.section\t.debug_loc,\"dr\"";
+    DwarfARangesSection = "\t.section\t.debug_aranges,\"dr\"";
+    DwarfRangesSection =  "\t.section\t.debug_ranges,\"dr\"";
+    DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"dr\"";
+    break;
+
+  case X86Subtarget::isWindows:
+    GlobalPrefix = "_";
+    HasDotTypeDotSizeDirective = false;
+    break;
+
+  default: break;
+  }
+  
+  if (Subtarget->isFlavorIntel()) {
+    GlobalPrefix = "_";
+    CommentString = ";";
+  
+    PrivateGlobalPrefix = "$";
+    AlignDirective = "\talign\t";
+    ZeroDirective = "\tdb\t";
+    ZeroDirectiveSuffix = " dup(0)";
+    AsciiDirective = "\tdb\t";
+    AscizDirective = 0;
+    Data8bitsDirective = "\tdb\t";
+    Data16bitsDirective = "\tdw\t";
+    Data32bitsDirective = "\tdd\t";
+    Data64bitsDirective = "\tdq\t";
+    HasDotTypeDotSizeDirective = false;
+    
+    TextSection = "_text";
+    DataSection = "_data";
+    JumpTableDataSection = NULL;
+    SwitchToSectionDirective = "";
+    TextSectionStartSuffix = "\tsegment 'CODE'";
+    DataSectionStartSuffix = "\tsegment 'DATA'";
+    SectionEndDirectiveSuffix = "\tends\n";
+  }
+
+  AssemblerDialect = Subtarget->getAsmFlavor();
+}
+
+bool X86TargetAsmInfo::LowerToBSwap(CallInst *CI) const {
+  // FIXME: this should verify that we are targeting a 486 or better.  If not,
+  // we will turn this bswap into something that will be lowered to logical ops
+  // instead of emitting the bswap asm.  For now, we don't support 486 or lower
+  // so don't worry about this.
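+  //
+  // Roughly (illustrative IR sketch), the transform below turns an inline-asm
+  // call such as
+  //   %y = call i32 asm "bswap $0", "=r,0"( i32 %x )
+  // into
+  //   %y = call i32 @llvm.bswap.i32( i32 %x )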
+  
+  // Verify this is a simple bswap.
+  if (CI->getNumOperands() != 2 ||
+      CI->getType() != CI->getOperand(1)->getType() ||
+      !CI->getType()->isInteger())
+    return false;
+  
+  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+  if (!Ty || Ty->getBitWidth() % 16 != 0)
+    return false;
+  
+  // Okay, we can do this xform, do so now.
+  const Type *Tys[] = { Ty, Ty };
+  Module *M = CI->getParent()->getParent()->getParent();
+  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 2);
+  
+  Value *Op = CI->getOperand(1);
+  Op = new CallInst(Int, Op, CI->getName(), CI);
+  
+  CI->replaceAllUsesWith(Op);
+  CI->eraseFromParent();
+  return true;
+}
+
+
+bool X86TargetAsmInfo::ExpandInlineAsm(CallInst *CI) const {
+  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();
+  
+  std::string AsmStr = IA->getAsmString();
+  
+  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
+  std::vector<std::string> AsmPieces;
+  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?
+  
+  switch (AsmPieces.size()) {
+  default: return false;    
+  case 1:
+    AsmStr = AsmPieces[0];
+    AsmPieces.clear();
+    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.
+    
+    // bswap $0
+    if (AsmPieces.size() == 2 && 
+        AsmPieces[0] == "bswap" && AsmPieces[1] == "$0") {
+      // No need to check constraints, nothing other than the equivalent of
+      // "=r,0" would be valid here.
+      return LowerToBSwap(CI);
+    }
+    break;
+  case 3:
+    if (CI->getType() == Type::Int64Ty && Constraints.size() >= 2 &&
+        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
+        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
+      // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
+      std::vector<std::string> Words;
+      SplitString(AsmPieces[0], Words, " \t");
+      if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
+        Words.clear();
+        SplitString(AsmPieces[1], Words, " \t");
+        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
+          Words.clear();
+          SplitString(AsmPieces[2], Words, " \t,");
+          if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
+              Words[2] == "%edx") {
+            return LowerToBSwap(CI);
+          }
+        }
+      }
+    }
+    break;
+  }
+  return false;
+}
diff --git a/lib/Target/X86/X86TargetAsmInfo.h b/lib/Target/X86/X86TargetAsmInfo.h
new file mode 100644
index 0000000..cc509d1
--- /dev/null
+++ b/lib/Target/X86/X86TargetAsmInfo.h
@@ -0,0 +1,33 @@
+//=====-- X86TargetAsmInfo.h - X86 asm properties -------------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the X86TargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETASMINFO_H
+#define X86TARGETASMINFO_H
+
+#include "llvm/Target/TargetAsmInfo.h"
+
+namespace llvm {
+
+  // Forward declaration.
+  class X86TargetMachine;
+
+  struct X86TargetAsmInfo : public TargetAsmInfo {
+    X86TargetAsmInfo(const X86TargetMachine &TM);
+    
+    virtual bool ExpandInlineAsm(CallInst *CI) const;
+  private:
+    bool LowerToBSwap(CallInst *CI) const;
+  };
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
new file mode 100644
index 0000000..4d4bd3f
--- /dev/null
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -0,0 +1,190 @@
+//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetAsmInfo.h"
+#include "X86TargetMachine.h"
+#include "X86.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+/// X86TargetMachineModule - Note that this is used on hosts that cannot link
+/// in a library unless there are references into the library.  In particular,
+/// it seems that it is not possible to get things to work on Win32 without
+/// this.  Though it is unused, do not remove it.
+extern "C" int X86TargetMachineModule;
+int X86TargetMachineModule = 0;
+
+namespace {
+  // Register the target.
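+  // The names registered here are what tools such as llc select with
+  // -march=x86 and -march=x86-64.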
+  RegisterTarget<X86_32TargetMachine>
+  X("x86",    "  32-bit X86: Pentium-Pro and above");
+  RegisterTarget<X86_64TargetMachine>
+  Y("x86-64", "  64-bit X86: EM64T and AMD64");
+}
+
+const TargetAsmInfo *X86TargetMachine::createTargetAsmInfo() const {
+  return new X86TargetAsmInfo(*this);
+}
+
+unsigned X86_32TargetMachine::getJITMatchQuality() {
+#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+  return 10;
+#endif
+  return 0;
+}
+
+unsigned X86_64TargetMachine::getJITMatchQuality() {
+#if defined(__x86_64__)
+  return 10;
+#endif
+  return 0;
+}
+
+unsigned X86_32TargetMachine::getModuleMatchQuality(const Module &M) {
+  // We strongly match "i[3-9]86-*".
+  std::string TT = M.getTargetTriple();
+  if (TT.size() >= 5 && TT[0] == 'i' && TT[2] == '8' && TT[3] == '6' &&
+      TT[4] == '-' && TT[1] >= '3' && TT[1] <= '9')
+    return 20;
+  // If the target triple is something non-X86, we don't match.
+  if (!TT.empty()) return 0;
+
+  if (M.getEndianness()  == Module::LittleEndian &&
+      M.getPointerSize() == Module::Pointer32)
+    return 10;                                   // Weak match
+  else if (M.getEndianness() != Module::AnyEndianness ||
+           M.getPointerSize() != Module::AnyPointerSize)
+    return 0;                                    // Match for some other target
+
+  return getJITMatchQuality()/2;
+}
+
+unsigned X86_64TargetMachine::getModuleMatchQuality(const Module &M) {
+  // We strongly match "x86_64-*".
+  std::string TT = M.getTargetTriple();
+  if (TT.size() >= 7 && TT[0] == 'x' && TT[1] == '8' && TT[2] == '6' &&
+      TT[3] == '_' && TT[4] == '6' && TT[5] == '4' && TT[6] == '-')
+    return 20;
+
+  // We strongly match "amd64-*".
+  if (TT.size() >= 6 && TT[0] == 'a' && TT[1] == 'm' && TT[2] == 'd' &&
+      TT[3] == '6' && TT[4] == '4' && TT[5] == '-')
+    return 20;
+  
+  // If the target triple is something non-X86-64, we don't match.
+  if (!TT.empty()) return 0;
+
+  if (M.getEndianness()  == Module::LittleEndian &&
+      M.getPointerSize() == Module::Pointer64)
+    return 10;                                   // Weak match
+  else if (M.getEndianness() != Module::AnyEndianness ||
+           M.getPointerSize() != Module::AnyPointerSize)
+    return 0;                                    // Match for some other target
+
+  return getJITMatchQuality()/2;
+}
+
+X86_32TargetMachine::X86_32TargetMachine(const Module &M, const std::string &FS) 
+  : X86TargetMachine(M, FS, false) {
+}
+
+
+X86_64TargetMachine::X86_64TargetMachine(const Module &M, const std::string &FS)
+  : X86TargetMachine(M, FS, true) {
+}
+
+/// X86TargetMachine ctor - Create an X86 target machine model, either 32-bit
+/// (ILP32) or 64-bit depending on is64Bit.
+///
+X86TargetMachine::X86TargetMachine(const Module &M, const std::string &FS,
+                                   bool is64Bit)
+  : Subtarget(M, FS, is64Bit),
+    DataLayout(Subtarget.is64Bit() ?
+               std::string("e-p:64:64-f64:32:64-i64:32:64") :
+               std::string("e-p:32:32-f64:32:64-i64:32:64")),
+    FrameInfo(TargetFrameInfo::StackGrowsDown,
+              Subtarget.getStackAlignment(), Subtarget.is64Bit() ? -8 : -4),
+    InstrInfo(*this), JITInfo(*this), TLInfo(*this) {
+  if (getRelocationModel() == Reloc::Default)
+    if (Subtarget.isTargetDarwin() || Subtarget.isTargetCygMing())
+      setRelocationModel(Reloc::DynamicNoPIC);
+    else
+      setRelocationModel(Reloc::Static);
+  if (Subtarget.is64Bit()) {
+    // No DynamicNoPIC support under X86-64.
+    if (getRelocationModel() == Reloc::DynamicNoPIC)
+      setRelocationModel(Reloc::PIC_);
+    // Default X86-64 code model is small.
+    if (getCodeModel() == CodeModel::Default)
+      setCodeModel(CodeModel::Small);
+  }
+
+  if (Subtarget.isTargetCygMing())
+    Subtarget.setPICStyle(PICStyle::WinPIC);
+  else if (Subtarget.isTargetDarwin())
+    if (Subtarget.is64Bit())
+      Subtarget.setPICStyle(PICStyle::RIPRel);
+    else
+      Subtarget.setPICStyle(PICStyle::Stub);
+  else if (Subtarget.isTargetELF())
+    if (Subtarget.is64Bit())
+      Subtarget.setPICStyle(PICStyle::RIPRel);
+    else
+      Subtarget.setPICStyle(PICStyle::GOT);
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool X86TargetMachine::addInstSelector(FunctionPassManager &PM, bool Fast) {
+  // Install an instruction selector.
+  PM.add(createX86ISelDag(*this, Fast));
+  return false;
+}
+
+bool X86TargetMachine::addPostRegAlloc(FunctionPassManager &PM, bool Fast) {
+  PM.add(createX86FloatingPointStackifierPass());
+  return true;  // -print-machineinstr should print after this.
+}
+
+bool X86TargetMachine::addAssemblyEmitter(FunctionPassManager &PM, bool Fast, 
+                                          std::ostream &Out) {
+  PM.add(createX86CodePrinterPass(Out, *this));
+  return false;
+}
+
+bool X86TargetMachine::addCodeEmitter(FunctionPassManager &PM, bool Fast,
+                                      MachineCodeEmitter &MCE) {
+  // FIXME: Move this to TargetJITInfo!
+  setRelocationModel(Reloc::Static);
+  Subtarget.setPICStyle(PICStyle::None);
+  
+  // The JIT cannot ensure that globals are placed in the lower 4G of the
+  // address space.
+  if (Subtarget.is64Bit())
+    setCodeModel(CodeModel::Large);
+
+  PM.add(createX86CodeEmitterPass(*this, MCE));
+  return false;
+}
+
+bool X86TargetMachine::addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast,
+                                            MachineCodeEmitter &MCE) {
+  PM.add(createX86CodeEmitterPass(*this, MCE));
+  return false;
+}
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
new file mode 100644
index 0000000..0a4f1b5
--- /dev/null
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -0,0 +1,95 @@
+//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETMACHINE_H
+#define X86TARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "X86.h"
+#include "X86ELFWriterInfo.h"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86Subtarget.h"
+#include "X86ISelLowering.h"
+
+namespace llvm {
+
+class X86TargetMachine : public LLVMTargetMachine {
+  X86Subtarget      Subtarget;
+  const TargetData  DataLayout; // Calculates type size & alignment
+  TargetFrameInfo   FrameInfo;
+  X86InstrInfo      InstrInfo;
+  X86JITInfo        JITInfo;
+  X86TargetLowering TLInfo;
+  X86ELFWriterInfo  ELFWriterInfo;
+
+protected:
+  virtual const TargetAsmInfo *createTargetAsmInfo() const;
+  
+public:
+  X86TargetMachine(const Module &M, const std::string &FS, bool is64Bit);
+
+  virtual const X86InstrInfo     *getInstrInfo() const { return &InstrInfo; }
+  virtual const TargetFrameInfo  *getFrameInfo() const { return &FrameInfo; }
+  virtual       TargetJITInfo    *getJITInfo()         { return &JITInfo; }
+  virtual const TargetSubtarget  *getSubtargetImpl() const{ return &Subtarget; }
+  virtual       X86TargetLowering *getTargetLowering() const { 
+    return const_cast<X86TargetLowering*>(&TLInfo); 
+  }
+  virtual const MRegisterInfo    *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  virtual const TargetData       *getTargetData() const { return &DataLayout; }
+  virtual const X86ELFWriterInfo *getELFWriterInfo() const {
+    return Subtarget.isTargetELF() ? &ELFWriterInfo : 0;
+  }
+
+  static unsigned getModuleMatchQuality(const Module &M);
+  static unsigned getJITMatchQuality();
+  
+  // Set up the pass pipeline.
+  virtual bool addInstSelector(FunctionPassManager &PM, bool Fast);  
+  virtual bool addPostRegAlloc(FunctionPassManager &PM, bool Fast);
+  virtual bool addAssemblyEmitter(FunctionPassManager &PM, bool Fast, 
+                                  std::ostream &Out);
+  virtual bool addCodeEmitter(FunctionPassManager &PM, bool Fast,
+                              MachineCodeEmitter &MCE);
+  virtual bool addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast,
+                                    MachineCodeEmitter &MCE);
+};
+
+/// X86_32TargetMachine - X86 32-bit target machine.
+///
+class X86_32TargetMachine : public X86TargetMachine {
+public:
+  X86_32TargetMachine(const Module &M, const std::string &FS);
+  
+  static unsigned getJITMatchQuality();
+  static unsigned getModuleMatchQuality(const Module &M);
+};
+
+/// X86_64TargetMachine - X86 64-bit target machine.
+///
+class X86_64TargetMachine : public X86TargetMachine {
+public:
+  X86_64TargetMachine(const Module &M, const std::string &FS);
+  
+  static unsigned getJITMatchQuality();
+  static unsigned getModuleMatchQuality(const Module &M);
+};
+
+} // End llvm namespace
+
+#endif