//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// \brief TargetLowering functions borrowed from AMDIL.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDILDevices.h"
#include "AMDILIntrinsicInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOptions.h"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
#include "AMDGPUGenCallingConv.inc"

//===----------------------------------------------------------------------===//
// TargetLowering Class Implementation Begins
//===----------------------------------------------------------------------===//
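// Configure the operation actions, scheduling preference, and memcpy/memset
// expansion thresholds that the AMDIL-derived portion of the backend relies
// on.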
void AMDGPUTargetLowering::InitAMDILLowering() {
  int types[] = {
    (int)MVT::i8,
    (int)MVT::i16,
    (int)MVT::i32,
    (int)MVT::f32,
    (int)MVT::f64,
    (int)MVT::i64,
    (int)MVT::v2i8,
    (int)MVT::v4i8,
    (int)MVT::v2i16,
    (int)MVT::v4i16,
    (int)MVT::v4f32,
    (int)MVT::v4i32,
    (int)MVT::v2f32,
    (int)MVT::v2i32,
    (int)MVT::v2f64,
    (int)MVT::v2i64
  };

  int IntTypes[] = {
    (int)MVT::i8,
    (int)MVT::i16,
    (int)MVT::i32,
    (int)MVT::i64
  };

  int FloatTypes[] = {
    (int)MVT::f32,
    (int)MVT::f64
  };

  int VectorTypes[] = {
    (int)MVT::v2i8,
    (int)MVT::v4i8,
    (int)MVT::v2i16,
    (int)MVT::v4i16,
    (int)MVT::v4f32,
    (int)MVT::v4i32,
    (int)MVT::v2f32,
    (int)MVT::v2i32,
    (int)MVT::v2f64,
    (int)MVT::v2i64
  };
  size_t NumTypes = sizeof(types) / sizeof(*types);
  size_t NumFloatTypes = sizeof(FloatTypes) / sizeof(*FloatTypes);
  size_t NumIntTypes = sizeof(IntTypes) / sizeof(*IntTypes);
  size_t NumVectorTypes = sizeof(VectorTypes) / sizeof(*VectorTypes);

  const AMDGPUSubtarget &STM =
      getTargetMachine().getSubtarget<AMDGPUSubtarget>();
  // These are the current register classes that are supported.

  for (unsigned int x = 0; x < NumTypes; ++x) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];

    // FIXME: SIGN_EXTEND_INREG is not meaningful for floating point types.
    // We cannot sign-extend in-register directly, so it is custom-lowered to
    // a shift pair (see LowerSIGN_EXTEND_INREG below).
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::BRCOND, VT, Custom);
    setOperationAction(ISD::BR_JT, VT, Expand);
    setOperationAction(ISD::BRIND, VT, Expand);
    // TODO: Implement custom UREM/SREM routines.
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    if (VT != MVT::i64 && VT != MVT::v2i64) {
      setOperationAction(ISD::SDIV, VT, Custom);
    }
  }
  for (unsigned int x = 0; x < NumFloatTypes; ++x) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x];

    // IL does not have these operations for floating point types.
    setOperationAction(ISD::FP_ROUND_INREG, VT, Expand);
    setOperationAction(ISD::SETOLT, VT, Expand);
    setOperationAction(ISD::SETOGE, VT, Expand);
    setOperationAction(ISD::SETOGT, VT, Expand);
    setOperationAction(ISD::SETOLE, VT, Expand);
    setOperationAction(ISD::SETULT, VT, Expand);
    setOperationAction(ISD::SETUGE, VT, Expand);
    setOperationAction(ISD::SETUGT, VT, Expand);
    setOperationAction(ISD::SETULE, VT, Expand);
  }

  for (unsigned int x = 0; x < NumIntTypes; ++x) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x];

    // The GPU does not have a divrem instruction for signed or unsigned
    // operands.
    setOperationAction(ISD::SDIVREM, VT, Expand);

    // GPU does not have [S|U]MUL_LOHI as a single instruction.
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    // GPU doesn't have a rotl, rotr, or byteswap instruction.
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);

    // GPU doesn't have any counting operators.
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
  }

  for (unsigned int ii = 0; ii < NumVectorTypes; ++ii) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii];

    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    // setOperationAction(ISD::VSETCC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (STM.device()->isSupported(AMDGPUDeviceInfo::LongOps)) {
    setOperationAction(ISD::MULHU, MVT::i64, Expand);
    setOperationAction(ISD::MULHU, MVT::v2i64, Expand);
    setOperationAction(ISD::MULHS, MVT::i64, Expand);
    setOperationAction(ISD::MULHS, MVT::v2i64, Expand);
    setOperationAction(ISD::ADD, MVT::v2i64, Expand);
    setOperationAction(ISD::SREM, MVT::v2i64, Expand);
    setOperationAction(ISD::Constant, MVT::i64, Legal);
    setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
    setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand);
    setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand);
  }
  if (STM.device()->isSupported(AMDGPUDeviceInfo::DoubleOps)) {
    // We support loading/storing v2f64 but not operations on the type.
    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
    setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
    // We want to expand vector conversions into their scalar counterparts.
    setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand);
    setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand);
    setOperationAction(ISD::FABS, MVT::f64, Expand);
    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
  }
  // TODO: Fix the UDIV24 algorithm so it works correctly for these types;
  // that requires vector comparisons.
  setOperationAction(ISD::UDIV, MVT::v2i8, Expand);
  setOperationAction(ISD::UDIV, MVT::v4i8, Expand);
  setOperationAction(ISD::UDIV, MVT::v2i16, Expand);
  setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
  setOperationAction(ISD::SUBC, MVT::Other, Expand);
  setOperationAction(ISD::ADDE, MVT::Other, Expand);
  setOperationAction(ISD::ADDC, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);

  // Use the default implementation.
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::Constant, MVT::i32, Legal);

  setSchedulingPreference(Sched::RegPressure);
  setPow2DivIsCheap(false);
  setSelectIsExpensive(true);
  setJumpIsExpensive(true);

  maxStoresPerMemcpy = 4096;
  maxStoresPerMemmove = 4096;
  maxStoresPerMemset = 4096;
}

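// AMDIL exposes no target-specific memory intrinsics that need extra
// MachineMemOperand information, so this hook always declines.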
bool
AMDGPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                         const CallInst &I,
                                         unsigned Intrinsic) const {
  return false;
}

// The backend supports 32 and 64 bit floating point immediates.
bool
AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  MVT::SimpleValueType ScalarVT = VT.getScalarType().getSimpleVT().SimpleTy;
  return ScalarVT == MVT::f32 || ScalarVT == MVT::f64;
}

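// Since both f32 and f64 immediates are legal (see isFPImmLegal above),
// nothing is gained by shrinking an f64 constant to f32.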
bool
AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  MVT::SimpleValueType ScalarVT = VT.getScalarType().getSimpleVT().SimpleTy;
  return ScalarVT != MVT::f32 && ScalarVT != MVT::f64;
}

// computeMaskedBitsForTargetNode - Determine which bits of 'Op' are known to
// be zero or one. Op is expected to be a target-specific node. Used by the
// DAG combiner.
void
AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
    const SDValue Op,
    APInt &KnownZero,
    APInt &KnownOne,
    const SelectionDAG &DAG,
    unsigned Depth) const {
  APInt KnownZero2;
  APInt KnownOne2;
  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
  switch (Op.getOpcode()) {
  default:
    break;
  case ISD::SELECT_CC:
    DAG.ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth + 1);
    DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth + 1);
    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
    // A bit is only known if it is known in both the LHS and RHS.
    KnownOne &= KnownOne2;
    KnownZero &= KnownZero2;
    break;
  }
}

//===----------------------------------------------------------------------===//
// Other Lowering Hooks
//===----------------------------------------------------------------------===//

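// Dispatch ISD::SDIV to a width-specific lowering routine based on the
// scalar type; types without a custom routine are returned unchanged.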
SDValue
AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
  EVT OVT = Op.getValueType();
  SDValue DST;
  if (OVT.getScalarType() == MVT::i64) {
    DST = LowerSDIV64(Op, DAG);
  } else if (OVT.getScalarType() == MVT::i32) {
    DST = LowerSDIV32(Op, DAG);
  } else if (OVT.getScalarType() == MVT::i16
      || OVT.getScalarType() == MVT::i8) {
    DST = LowerSDIV24(Op, DAG);
  } else {
    DST = SDValue(Op.getNode(), 0);
  }
  return DST;
}

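// Dispatch ISD::SREM to a width-specific lowering routine, mirroring
// LowerSDIV above.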
SDValue
AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
  EVT OVT = Op.getValueType();
  SDValue DST;
  if (OVT.getScalarType() == MVT::i64) {
    DST = LowerSREM64(Op, DAG);
  } else if (OVT.getScalarType() == MVT::i32) {
    DST = LowerSREM32(Op, DAG);
  } else if (OVT.getScalarType() == MVT::i16) {
    DST = LowerSREM16(Op, DAG);
  } else if (OVT.getScalarType() == MVT::i8) {
    DST = LowerSREM8(Op, DAG);
  } else {
    DST = SDValue(Op.getNode(), 0);
  }
  return DST;
}

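// Lower SIGN_EXTEND_INREG as a shift pair: shifting the value left so the
// sign bit of the narrow type lands in the container's sign bit, then
// arithmetic-shifting it back replicates that sign bit through the upper
// bits. For example, sign-extending an i8 held in an i32:
//   sext_inreg(x, i8) == (x << 24) >> 24   (arithmetic shift right)
// Containers narrower than 32 bits are first widened to 32 bits.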
SDValue
AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDValue Data = Op.getOperand(0);
  VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1));
  DebugLoc DL = Op.getDebugLoc();
  EVT DVT = Data.getValueType();
  EVT BVT = BaseType->getVT();
  unsigned baseBits = BVT.getScalarType().getSizeInBits();
  unsigned srcBits = DVT.isSimple() ? DVT.getScalarType().getSizeInBits() : 1;
  unsigned shiftBits = srcBits - baseBits;
  if (srcBits < 32) {
    // If the op is less than 32 bits, then it needs to be extended to 32 bits
    // so it can properly keep the upper bits valid.
    EVT IVT = genIntType(32, DVT.isVector() ? DVT.getVectorNumElements() : 1);
    Data = DAG.getNode(ISD::ZERO_EXTEND, DL, IVT, Data);
    shiftBits = 32 - baseBits;
    DVT = IVT;
  }
  SDValue Shift = DAG.getConstant(shiftBits, DVT);
  // Shift left by 'Shift' bits.
  Data = DAG.getNode(ISD::SHL, DL, DVT, Data, Shift);
  // Signed shift right by 'Shift' bits.
  Data = DAG.getNode(ISD::SRA, DL, DVT, Data, Shift);
  if (srcBits < 32) {
    // Once the sign extension is done, the op needs to be converted back to
    // its original type.
    Data = DAG.getSExtOrTrunc(Data, DL, Op.getOperand(0).getValueType());
  }
  return Data;
}
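
// Return an integer VT built from 32- or 64-bit elements whose total width
// covers 'size' * 'numEle' bits; e.g. genIntType(32, 2) yields v2i32, while
// genIntType(16, 2) packs into a single i32.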
EVT
AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const {
  int iSize = (size * numEle);
  int vEle = (iSize >> ((size == 64) ? 6 : 5));
  if (!vEle) {
    vEle = 1;
  }
  if (size == 64) {
    if (vEle == 1) {
      return EVT(MVT::i64);
    } else {
      return EVT(MVT::getVectorVT(MVT::i64, vEle));
    }
  } else {
    if (vEle == 1) {
      return EVT(MVT::i32);
    } else {
      return EVT(MVT::getVectorVT(MVT::i32, vEle));
    }
  }
}

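// Lower ISD::BRCOND (chain, cond, target) to the target's BRANCH_COND node,
// which takes its operands in (chain, target, cond) order.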
SDValue
AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Jump = Op.getOperand(2);
  return DAG.getNode(AMDGPUISD::BRANCH_COND,
                     Op.getDebugLoc(),
                     Op.getValueType(),
                     Chain, Jump, Cond);
}

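// Lower signed division of 8- and 16-bit types through f32 arithmetic: the
// operands are sign-extended to i32, converted to float, and divided with the
// fast reciprocal-based DIV_INF. Values this narrow fit exactly in an f32
// mantissa, so the truncated quotient iq is at worst one short of the exact
// result; jq (+/-1, carrying the sign of ia^ib, i.e. of the quotient) is
// added back only when the remainder estimate |fr| >= |fb| indicates that.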
SDValue
AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT OVT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  MVT INTTY;
  MVT FLTTY;
  if (!OVT.isVector()) {
    INTTY = MVT::i32;
    FLTTY = MVT::f32;
  } else if (OVT.getVectorNumElements() == 2) {
    INTTY = MVT::v2i32;
    FLTTY = MVT::v2f32;
  } else if (OVT.getVectorNumElements() == 4) {
    INTTY = MVT::v4i32;
    FLTTY = MVT::v4f32;
  }
  unsigned bitsize = OVT.getScalarType().getSizeInBits();
  // char|short jq = ia ^ ib;
  SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);

  // jq = jq >> (bitsize - 2)
  jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));

  // jq = jq | 0x1
  jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));

  // jq = (int)jq
  jq = DAG.getSExtOrTrunc(jq, DL, INTTY);

  // int ia = (int)LHS;
  SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);

  // int ib = (int)RHS;
  SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);

  // float fq = native_divide(fa, fb);
  SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb);

  // fq = trunc(fq);
  fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);

  // float fr = mad(fqneg, fb, fa);
  SDValue fr = DAG.getNode(AMDGPUISD::MAD, DL, FLTTY, fqneg, fb, fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);

  // fr = fabs(fr);
  fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);

  // fb = fabs(fb);
  fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq,
                   DAG.getConstant(0, OVT));

  // dst = iq + jq;
  iq = DAG.getSExtOrTrunc(iq, DL, OVT);
  iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
  return iq;
}

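// Lower 32-bit signed division in terms of unsigned division, using the
// identity sdiv(a, b) = (udiv(|a|, |b|) + s) ^ s, where s is an all-ones mask
// when the operand signs differ and zero otherwise, and |x| is computed
// branchlessly as (x + m) ^ m with m = (x < 0 ? -1 : 0).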
SDValue
AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT OVT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  // LowerSDIV32 generates the equivalent of the following IL:
  // mov r0, LHS
  // mov r1, RHS
  // ilt r10, r0, 0
  // ilt r11, r1, 0
  // iadd r0, r0, r10
  // iadd r1, r1, r11
  // ixor r0, r0, r10
  // ixor r1, r1, r11
  // udiv r0, r0, r1
  // ixor r10, r10, r11
  // iadd r0, r0, r10
  // ixor DST, r0, r10

  // mov r0, LHS
  SDValue r0 = LHS;

  // mov r1, RHS
  SDValue r1 = RHS;

  // ilt r10, r0, 0
  SDValue r10 = DAG.getSelectCC(DL,
      r0, DAG.getConstant(0, OVT),
      DAG.getConstant(-1, MVT::i32),
      DAG.getConstant(0, MVT::i32),
      ISD::SETLT);

  // ilt r11, r1, 0
  SDValue r11 = DAG.getSelectCC(DL,
      r1, DAG.getConstant(0, OVT),
      DAG.getConstant(-1, MVT::i32),
      DAG.getConstant(0, MVT::i32),
      ISD::SETLT);

  // iadd r0, r0, r10
  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);

  // iadd r1, r1, r11
  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);

  // ixor r0, r0, r10
  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);

  // ixor r1, r1, r11
  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);

  // udiv r0, r0, r1
  r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);

  // ixor r10, r10, r11
  r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);

  // iadd r0, r0, r10
  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);

  // ixor DST, r0, r10
  return DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
}

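// 64-bit signed division has no custom lowering here yet; return the node
// unchanged.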
SDValue
AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
  return SDValue(Op.getNode(), 0);
}

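// Lower 8-bit signed remainder by promoting both operands to 32 bits (or the
// matching i32 vector type), taking SREM there, and truncating the result.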
SDValue
AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT OVT = Op.getValueType();
  MVT INTTY = MVT::i32;
  if (OVT == MVT::v2i8) {
    INTTY = MVT::v2i32;
  } else if (OVT == MVT::v4i8) {
    INTTY = MVT::v4i32;
  }
  SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
  SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
  LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
  LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
  return LHS;
}

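// Same promotion strategy as LowerSREM8, for 16-bit types.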
SDValue
AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT OVT = Op.getValueType();
  MVT INTTY = MVT::i32;
  if (OVT == MVT::v2i16) {
    INTTY = MVT::v2i32;
  } else if (OVT == MVT::v4i16) {
    INTTY = MVT::v4i32;
  }
  SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
  SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
  LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
  LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
  return LHS;
}

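// Lower 32-bit signed remainder via unsigned operations: with both operands
// made non-negative, |a| % |b| is computed as |a| - (|a| / |b|) * |b|, and
// the sign of the dividend (mask r10) is then restored, since the remainder
// takes the sign of the dividend.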
SDValue
AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT OVT = Op.getValueType();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  // LowerSREM32 generates the equivalent of the following IL:
  // mov r0, LHS
  // mov r1, RHS
  // ilt r10, r0, 0
  // ilt r11, r1, 0
  // iadd r0, r0, r10
  // iadd r1, r1, r11
  // ixor r0, r0, r10
  // ixor r1, r1, r11
  // udiv r20, r0, r1
  // umul r20, r20, r1
  // sub r0, r0, r20
  // iadd r0, r0, r10
  // ixor DST, r0, r10

  // mov r0, LHS
  SDValue r0 = LHS;

  // mov r1, RHS
  SDValue r1 = RHS;

  // ilt r10, r0, 0
  SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);

  // ilt r11, r1, 0
  SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);

  // iadd r0, r0, r10
  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);

  // iadd r1, r1, r11
  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);

  // ixor r0, r0, r10
  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);

  // ixor r1, r1, r11
  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);

  // udiv r20, r0, r1
  SDValue r20 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);

  // umul r20, r20, r1
  r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);

  // sub r0, r0, r20
  r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);

  // iadd r0, r0, r10
  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);

  // ixor DST, r0, r10
  return DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
}

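// 64-bit signed remainder has no custom lowering here yet; return the node
// unchanged.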
SDValue
AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
  return SDValue(Op.getNode(), 0);
}