Chandler Carruth | 93dcdc4 | 2015-01-31 11:17:59 +0000 | [diff] [blame] | 1 | //===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===// |
Jingyue Wu | 0c981bd | 2014-11-10 18:38:25 +0000 | [diff] [blame] | 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
| 5 | // This file is distributed under the University of Illinois Open Source |
| 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
Jingyue Wu | 0c981bd | 2014-11-10 18:38:25 +0000 | [diff] [blame] | 9 | |
Chandler Carruth | 93dcdc4 | 2015-01-31 11:17:59 +0000 | [diff] [blame] | 10 | #include "NVPTXTargetTransformInfo.h" |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 11 | #include "NVPTXUtilities.h" |
Jingyue Wu | 0c981bd | 2014-11-10 18:38:25 +0000 | [diff] [blame] | 12 | #include "llvm/Analysis/LoopInfo.h" |
| 13 | #include "llvm/Analysis/TargetTransformInfo.h" |
| 14 | #include "llvm/Analysis/ValueTracking.h" |
Chandler Carruth | 705b185 | 2015-01-31 03:43:40 +0000 | [diff] [blame] | 15 | #include "llvm/CodeGen/BasicTTIImpl.h" |
Jingyue Wu | 0c981bd | 2014-11-10 18:38:25 +0000 | [diff] [blame] | 16 | #include "llvm/Support/Debug.h" |
| 17 | #include "llvm/Target/CostTable.h" |
| 18 | #include "llvm/Target/TargetLowering.h" |
| 19 | using namespace llvm; |
| 20 | |
| 21 | #define DEBUG_TYPE "NVPTXtti" |
| 22 | |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 23 | // Whether the given intrinsic reads threadIdx.x/y/z. |
| 24 | static bool readsThreadIndex(const IntrinsicInst *II) { |
| 25 | switch (II->getIntrinsicID()) { |
| 26 | default: return false; |
| 27 | case Intrinsic::nvvm_read_ptx_sreg_tid_x: |
| 28 | case Intrinsic::nvvm_read_ptx_sreg_tid_y: |
| 29 | case Intrinsic::nvvm_read_ptx_sreg_tid_z: |
| 30 | return true; |
| 31 | } |
| 32 | } |
| 33 | |
| 34 | static bool readsLaneId(const IntrinsicInst *II) { |
| 35 | return II->getIntrinsicID() == Intrinsic::ptx_read_laneid; |
| 36 | } |
| 37 | |
| 38 | // Whether the given intrinsic is an atomic instruction in PTX. |
| 39 | static bool isNVVMAtomic(const IntrinsicInst *II) { |
| 40 | switch (II->getIntrinsicID()) { |
| 41 | default: return false; |
| 42 | case Intrinsic::nvvm_atomic_load_add_f32: |
| 43 | case Intrinsic::nvvm_atomic_load_inc_32: |
| 44 | case Intrinsic::nvvm_atomic_load_dec_32: |
| 45 | return true; |
| 46 | } |
| 47 | } |
| 48 | |
| 49 | bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) { |
| 50 | // Without inter-procedural analysis, we conservatively assume that arguments |
| 51 | // to __device__ functions are divergent. |
| 52 | if (const Argument *Arg = dyn_cast<Argument>(V)) |
| 53 | return !isKernelFunction(*Arg->getParent()); |
| 54 | |
| 55 | if (const Instruction *I = dyn_cast<Instruction>(V)) { |
| 56 | // Without pointer analysis, we conservatively assume values loaded from |
| 57 | // generic or local address space are divergent. |
| 58 | if (const LoadInst *LI = dyn_cast<LoadInst>(I)) { |
| 59 | unsigned AS = LI->getPointerAddressSpace(); |
| 60 | return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL; |
| 61 | } |
| 62 | // Atomic instructions may cause divergence. Atomic instructions are |
| 63 | // executed sequentially across all threads in a warp. Therefore, an earlier |
| 64 | // executed thread may see different memory inputs than a later executed |
| 65 | // thread. For example, suppose *a = 0 initially. |
| 66 | // |
| 67 | // atom.global.add.s32 d, [a], 1 |
| 68 | // |
| 69 | // returns 0 for the first thread that enters the critical region, and 1 for |
| 70 | // the second thread. |
| 71 | if (I->isAtomic()) |
| 72 | return true; |
| 73 | if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { |
| 74 | // Instructions that read threadIdx are obviously divergent. |
| 75 | if (readsThreadIndex(II) || readsLaneId(II)) |
| 76 | return true; |
| 77 | // Handle the NVPTX atomic instrinsics that cannot be represented as an |
| 78 | // atomic IR instruction. |
| 79 | if (isNVVMAtomic(II)) |
| 80 | return true; |
| 81 | } |
| 82 | // Conservatively consider the return value of function calls as divergent. |
| 83 | // We could analyze callees with bodies more precisely using |
| 84 | // inter-procedural analysis. |
| 85 | if (isa<CallInst>(I)) |
| 86 | return true; |
| 87 | } |
| 88 | |
| 89 | return false; |
| 90 | } |
| 91 | |
Chandler Carruth | 93205eb | 2015-08-05 18:08:10 +0000 | [diff] [blame] | 92 | int NVPTXTTIImpl::getArithmeticInstrCost( |
Chandler Carruth | 705b185 | 2015-01-31 03:43:40 +0000 | [diff] [blame] | 93 | unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, |
| 94 | TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, |
| 95 | TTI::OperandValueProperties Opd2PropInfo) { |
Jingyue Wu | 8a12cea | 2014-11-12 18:09:15 +0000 | [diff] [blame] | 96 | // Legalize the type. |
Chandler Carruth | 93205eb | 2015-08-05 18:08:10 +0000 | [diff] [blame] | 97 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); |
Jingyue Wu | 8a12cea | 2014-11-12 18:09:15 +0000 | [diff] [blame] | 98 | |
| 99 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| 100 | |
| 101 | switch (ISD) { |
| 102 | default: |
Chandler Carruth | 705b185 | 2015-01-31 03:43:40 +0000 | [diff] [blame] | 103 | return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, |
| 104 | Opd1PropInfo, Opd2PropInfo); |
Jingyue Wu | 8a12cea | 2014-11-12 18:09:15 +0000 | [diff] [blame] | 105 | case ISD::ADD: |
| 106 | case ISD::MUL: |
| 107 | case ISD::XOR: |
| 108 | case ISD::OR: |
| 109 | case ISD::AND: |
| 110 | // The machine code (SASS) simulates an i64 with two i32. Therefore, we |
| 111 | // estimate that arithmetic operations on i64 are twice as expensive as |
| 112 | // those on types that can fit into one machine register. |
| 113 | if (LT.second.SimpleTy == MVT::i64) |
| 114 | return 2 * LT.first; |
| 115 | // Delegate other cases to the basic TTI. |
Chandler Carruth | 705b185 | 2015-01-31 03:43:40 +0000 | [diff] [blame] | 116 | return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, |
| 117 | Opd1PropInfo, Opd2PropInfo); |
Jingyue Wu | 8a12cea | 2014-11-12 18:09:15 +0000 | [diff] [blame] | 118 | } |
| 119 | } |
Mark Heffernan | 4c8ca53 | 2015-07-13 18:33:21 +0000 | [diff] [blame] | 120 | |
| 121 | void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, |
| 122 | TTI::UnrollingPreferences &UP) { |
| 123 | BaseT::getUnrollingPreferences(L, UP); |
| 124 | |
| 125 | // Enable partial unrolling and runtime unrolling, but reduce the |
| 126 | // threshold. This partially unrolls small loops which are often |
| 127 | // unrolled by the PTX to SASS compiler and unrolling earlier can be |
| 128 | // beneficial. |
| 129 | UP.Partial = UP.Runtime = true; |
| 130 | UP.PartialThreshold = UP.Threshold / 4; |
| 131 | } |