//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

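// Command-line knobs for the unrolling heuristics below. Since these are
// cl::opt flags, they can be tuned without rebuilding, e.g. directly on the
// llc/opt command line or (assuming the usual option plumbing) via clang's
// -mllvm, as in -mllvm -amdgpu-unroll-threshold-private=4000.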
static cl::opt<unsigned> UnrollThresholdPrivate(
  "amdgpu-unroll-threshold-private",
  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
  cl::init(2500), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
  "amdgpu-unroll-threshold-local",
  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
  cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
  "amdgpu-unroll-threshold-if",
  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
  cl::init(150), cl::Hidden);

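// Returns true if the branch condition \p Cond (followed transitively through
// its instruction operands, up to a small depth) depends on a PHI node that is
// defined in loop \p L but not in any of its sub-loops.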
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
               return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}

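// AMDGPU-specific loop unrolling heuristics: on top of the generic defaults,
// boost the unroll threshold for loops whose GEPs into private (scratch) or
// local (LDS) memory depend on loop-defined values, and for conditional
// branches whose condition depends on a loop-local PHI.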
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP) {
  UP.Threshold = 300; // Twice the default.
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  AMDGPUAS ASST = ST->getAMDGPUAS();
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
             return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate
      // the if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each such "if" statement.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          if (L->isLoopExiting(Br->getSuccessor(0)) ||
              L->isLoopExiting(Br->getSuccessor(1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == ASST.PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == ASST.LOCAL_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == ASST.PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == ASST.LOCAL_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unrolling for local memory if we have seen addressing that
        // is not based on a variable; most likely we will be unable to combine
        // it. Also do not unroll too-deep inner loops for local memory, to
        // give an outer loop a chance to be unrolled for a more important
        // reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
                 return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator.  allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs.  If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }
  }
}

unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  // The concept of vector registers doesn't really exist. Some packed vector
  // operations operate on the normal 32-bit registers.
  return 256;
}

unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return getHardwareNumberOfRegisters(Vec) >> 3;
}

unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

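// The vectorizer's requested load/store width is clamped so that a single
// access does not exceed 128 bits; for loads the clamp is only applied to
// sub-32-bit element types (see the TODO below).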
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  AMDGPUAS AS = ST->getAMDGPUAS();
  if (AddrSpace == AS.GLOBAL_ADDRESS ||
      AddrSpace == AS.CONSTANT_ADDRESS ||
      AddrSpace == AS.CONSTANT_ADDRESS_32BIT) {
    return 512;
  }

  if (AddrSpace == AS.FLAT_ADDRESS ||
      AddrSpace == AS.LOCAL_ADDRESS ||
      AddrSpace == AS.REGION_ADDRESS)
    return 128;

  if (AddrSpace == AS.PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  llvm_unreachable("unhandled address space");
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            unsigned Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to
  // decompose them later if they may access private memory. We don't have
  // enough context here, and legalization can handle it.
  if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
      ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

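// Describe the target-specific atomic intrinsics handled below to the
// optimizer as memory operations, extracting the pointer, ordering and
// volatility from the intrinsic's operands.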
bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isNullValue();
    return true;
  }
  default:
    return false;
  }
}

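// Arithmetic cost model: legalize the type, then scale a per-opcode rate
// (full, quarter or 64-bit rate) by the number of legalized parts and vector
// elements. Opcodes not handled here fall back to the base implementation.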
int GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but do have legal
  // vector types, we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
      // Add cost of workaround.
      if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost() * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();

      if (!ST->hasFP32Denormals()) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                           bool IsPairwise) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getFullRateInstrCost();
}

int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
                                       bool IsPairwise,
                                       bool IsUnsigned) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getHalfRateInstrCost();
}

int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                   unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

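// Returns true if argument \p A is known to be passed in an SGPR, i.e. its
// value is uniform across the wavefront.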
static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  CallingConv::ID CC = F->getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return true;
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
    // For non-compute shaders, SGPR inputs are marked with either inreg or
    // byval. Everything else is in VGPRs.
    return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
           F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
  default:
    // TODO: Should calls support inreg for SGPR inputs?
    return false;
  }
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private address space are divergent, because threads
  // can execute the load instruction with the same inputs and get different
  // results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
    return true;

  return false;
}

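// readfirstlane and readlane produce a value that is uniform across the
// wavefront by construction, so report them as always uniform.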
bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
      return true;
    }
  }
  return false;
}

unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                    Type *SubTp) {
  if (ST->hasVOP3PInsts()) {
    VectorType *VT = cast<VectorType>(Tp);
    if (VT->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access the low half or high
      // half of a register, so any swizzle is free.

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

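// Inlining is treated as compatible when the callee's required subtarget
// features (ignoring the bits in InlineFeatureIgnoreList) are a subset of the
// caller's features.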
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
    TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
    TM.getSubtargetImpl(*Callee)->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}

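// R600 implementations of the TTI hooks. These largely mirror the GCN
// versions above, with limits appropriate to the pre-GCN hardware.
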
unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
  return getHardwareNumberOfRegisters(Vec);
}

unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  AMDGPUAS AS = ST->getAMDGPUAS();
  if (AddrSpace == AS.GLOBAL_ADDRESS ||
      AddrSpace == AS.CONSTANT_ADDRESS)
    return 128;
  if (AddrSpace == AS.LOCAL_ADDRESS ||
      AddrSpace == AS.REGION_ADDRESS)
    return 64;
  if (AddrSpace == AS.PRIVATE_ADDRESS)
    return 32;

  if ((AddrSpace == AS.PARAM_D_ADDRESS ||
       AddrSpace == AS.PARAM_I_ADDRESS ||
       (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
        AddrSpace <= AS.CONSTANT_BUFFER_15)))
    return 128;
  llvm_unreachable("unhandled address space");
}

bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to
  // decompose them later if they may access private memory. We don't have
  // enough context here, and legalization can handle it.
  if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS)
    return false;
  return true;
}

bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                               unsigned Alignment,
                                               unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                    unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}