| Tom Stellard | 75aadc2 | 2012-12-11 21:25:42 +0000 | [diff] [blame] | 1 | //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// | 
|  | 2 | // | 
|  | 3 | //                     The LLVM Compiler Infrastructure | 
|  | 4 | // | 
|  | 5 | // This file is distributed under the University of Illinois Open Source | 
|  | 6 | // License. See LICENSE.TXT for details. | 
|  | 7 | // | 
|  | 8 | //===----------------------------------------------------------------------===// | 
|  | 9 | // | 
|  | 10 | /// \file | 
|  | 11 | /// \brief Implements the AMDGPU specific subclass of TargetSubtarget. | 
|  | 12 | // | 
|  | 13 | //===----------------------------------------------------------------------===// | 
|  | 14 |  | 
|  | 15 | #include "AMDGPUSubtarget.h" | 
| Konstantin Zhuravlyov | e03b1d7 | 2017-02-08 13:02:33 +0000 | [diff] [blame] | 16 | #include "SIMachineFunctionInfo.h" | 
| Matt Arsenault | d9a23ab | 2014-07-13 02:08:26 +0000 | [diff] [blame] | 17 | #include "llvm/ADT/SmallString.h" | 
| Tom Stellard | 83f0bce | 2015-01-29 16:55:25 +0000 | [diff] [blame] | 18 | #include "llvm/CodeGen/MachineScheduler.h" | 
| Stanislav Mekhanoshin | c90347d | 2017-04-12 20:48:56 +0000 | [diff] [blame] | 19 | #include "llvm/IR/MDBuilder.h" | 
| Eugene Zelenko | 6a9226d | 2016-12-12 22:23:53 +0000 | [diff] [blame] | 20 | #include "llvm/Target/TargetFrameLowering.h" | 
|  | 21 | #include <algorithm> | 
| Matt Arsenault | d9a23ab | 2014-07-13 02:08:26 +0000 | [diff] [blame] | 22 |  | 
| Tom Stellard | 75aadc2 | 2012-12-11 21:25:42 +0000 | [diff] [blame] | 23 | using namespace llvm; | 
|  | 24 |  | 
| Chandler Carruth | e96dd89 | 2014-04-21 22:55:11 +0000 | [diff] [blame] | 25 | #define DEBUG_TYPE "amdgpu-subtarget" | 
|  | 26 |  | 
| Tom Stellard | 75aadc2 | 2012-12-11 21:25:42 +0000 | [diff] [blame] | 27 | #define GET_SUBTARGETINFO_TARGET_DESC | 
|  | 28 | #define GET_SUBTARGETINFO_CTOR | 
|  | 29 | #include "AMDGPUGenSubtargetInfo.inc" | 
|  | 30 |  | 
// Defaulted out-of-line so the destructor is emitted in this translation
// unit (anchors any vtable here rather than in every includer of the header).
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
| Matt Arsenault | 43e92fe | 2016-06-24 06:30:11 +0000 | [diff] [blame] | 32 |  | 
// Parse the feature string and fix up interdependent subtarget state.
// Called from the constructor; returns *this so it can be used in a member
// initializer chain. NOTE(review): the order of the pieces appended to
// FullFS matters — later entries win when the string is parsed, so the
// baseline defaults are prepended and the user string FS is appended last.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // User-supplied features go last so they override the defaults above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  // (contains() matches either "+flat-for-global" or "-flat-for-global",
  // i.e. any explicit user choice suppresses this override.)
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}
|  | 74 |  | 
// Construct the subtarget for the given target triple, GPU name, and feature
// string. All feature flags start false/zero here; their real values are set
// by initializeSubtargetDependencies() (via ParseSubtargetFeatures) at the
// end of the body. NOTE(review): member-init order must match the
// declaration order in the header — keep in sync when editing either.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples are GCN (SI at minimum); everything else is R600.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasVOP3PInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  // Parses FS and derives the real values for the flags initialized above.
  initializeSubtargetDependencies(TT, GPU, FS);
}
| Tom Stellard | b8fd6ef | 2014-12-02 22:00:07 +0000 | [diff] [blame] | 141 |  | 
| Stanislav Mekhanoshin | 2b913b1 | 2017-02-01 22:59:50 +0000 | [diff] [blame] | 142 | unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, | 
|  | 143 | const Function &F) const { | 
|  | 144 | if (NWaves == 1) | 
| Matt Arsenault | 8a028bf | 2016-05-16 21:19:59 +0000 | [diff] [blame] | 145 | return getLocalMemorySize(); | 
| Stanislav Mekhanoshin | 2b913b1 | 2017-02-01 22:59:50 +0000 | [diff] [blame] | 146 | unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; | 
|  | 147 | unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); | 
|  | 148 | unsigned MaxWaves = getMaxWavesPerEU(); | 
|  | 149 | return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; | 
| Matt Arsenault | 8a028bf | 2016-05-16 21:19:59 +0000 | [diff] [blame] | 150 | } | 
|  | 151 |  | 
| Stanislav Mekhanoshin | 2b913b1 | 2017-02-01 22:59:50 +0000 | [diff] [blame] | 152 | unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, | 
|  | 153 | const Function &F) const { | 
|  | 154 | unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; | 
|  | 155 | unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); | 
|  | 156 | unsigned MaxWaves = getMaxWavesPerEU(); | 
|  | 157 | unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; | 
|  | 158 | unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); | 
|  | 159 | NumWaves = std::min(NumWaves, MaxWaves); | 
|  | 160 | NumWaves = std::max(NumWaves, 1u); | 
|  | 161 | return NumWaves; | 
| Matt Arsenault | 8a028bf | 2016-05-16 21:19:59 +0000 | [diff] [blame] | 162 | } | 
|  | 163 |  | 
| Konstantin Zhuravlyov | 1d65026 | 2016-09-06 20:22:28 +0000 | [diff] [blame] | 164 | std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( | 
|  | 165 | const Function &F) const { | 
| Konstantin Zhuravlyov | 1d65026 | 2016-09-06 20:22:28 +0000 | [diff] [blame] | 166 | // Default minimum/maximum flat work group sizes. | 
|  | 167 | std::pair<unsigned, unsigned> Default = | 
|  | 168 | AMDGPU::isCompute(F.getCallingConv()) ? | 
|  | 169 | std::pair<unsigned, unsigned>(getWavefrontSize() * 2, | 
|  | 170 | getWavefrontSize() * 4) : | 
|  | 171 | std::pair<unsigned, unsigned>(1, getWavefrontSize()); | 
|  | 172 |  | 
|  | 173 | // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa | 
|  | 174 | // starts using "amdgpu-flat-work-group-size" attribute. | 
|  | 175 | Default.second = AMDGPU::getIntegerAttribute( | 
|  | 176 | F, "amdgpu-max-work-group-size", Default.second); | 
|  | 177 | Default.first = std::min(Default.first, Default.second); | 
|  | 178 |  | 
|  | 179 | // Requested minimum/maximum flat work group sizes. | 
|  | 180 | std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( | 
|  | 181 | F, "amdgpu-flat-work-group-size", Default); | 
|  | 182 |  | 
|  | 183 | // Make sure requested minimum is less than requested maximum. | 
|  | 184 | if (Requested.first > Requested.second) | 
|  | 185 | return Default; | 
|  | 186 |  | 
|  | 187 | // Make sure requested values do not violate subtarget's specifications. | 
|  | 188 | if (Requested.first < getMinFlatWorkGroupSize()) | 
|  | 189 | return Default; | 
|  | 190 | if (Requested.second > getMaxFlatWorkGroupSize()) | 
|  | 191 | return Default; | 
|  | 192 |  | 
|  | 193 | return Requested; | 
|  | 194 | } | 
|  | 195 |  | 
|  | 196 | std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( | 
|  | 197 | const Function &F) const { | 
| Konstantin Zhuravlyov | 1d65026 | 2016-09-06 20:22:28 +0000 | [diff] [blame] | 198 | // Default minimum/maximum number of waves per execution unit. | 
| Konstantin Zhuravlyov | fd87137 | 2017-02-09 21:33:23 +0000 | [diff] [blame] | 199 | std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU()); | 
| Konstantin Zhuravlyov | 1d65026 | 2016-09-06 20:22:28 +0000 | [diff] [blame] | 200 |  | 
|  | 201 | // Default/requested minimum/maximum flat work group sizes. | 
|  | 202 | std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); | 
|  | 203 |  | 
|  | 204 | // If minimum/maximum flat work group sizes were explicitly requested using | 
|  | 205 | // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum | 
|  | 206 | // number of waves per execution unit to values implied by requested | 
|  | 207 | // minimum/maximum flat work group sizes. | 
|  | 208 | unsigned MinImpliedByFlatWorkGroupSize = | 
|  | 209 | getMaxWavesPerEU(FlatWorkGroupSizes.second); | 
|  | 210 | bool RequestedFlatWorkGroupSize = false; | 
|  | 211 |  | 
|  | 212 | // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa | 
|  | 213 | // starts using "amdgpu-flat-work-group-size" attribute. | 
|  | 214 | if (F.hasFnAttribute("amdgpu-max-work-group-size") || | 
|  | 215 | F.hasFnAttribute("amdgpu-flat-work-group-size")) { | 
|  | 216 | Default.first = MinImpliedByFlatWorkGroupSize; | 
|  | 217 | RequestedFlatWorkGroupSize = true; | 
|  | 218 | } | 
|  | 219 |  | 
|  | 220 | // Requested minimum/maximum number of waves per execution unit. | 
|  | 221 | std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( | 
|  | 222 | F, "amdgpu-waves-per-eu", Default, true); | 
|  | 223 |  | 
|  | 224 | // Make sure requested minimum is less than requested maximum. | 
|  | 225 | if (Requested.second && Requested.first > Requested.second) | 
|  | 226 | return Default; | 
|  | 227 |  | 
|  | 228 | // Make sure requested values do not violate subtarget's specifications. | 
|  | 229 | if (Requested.first < getMinWavesPerEU() || | 
|  | 230 | Requested.first > getMaxWavesPerEU()) | 
|  | 231 | return Default; | 
|  | 232 | if (Requested.second > getMaxWavesPerEU()) | 
|  | 233 | return Default; | 
|  | 234 |  | 
|  | 235 | // Make sure requested values are compatible with values implied by requested | 
|  | 236 | // minimum/maximum flat work group sizes. | 
|  | 237 | if (RequestedFlatWorkGroupSize && | 
|  | 238 | Requested.first > MinImpliedByFlatWorkGroupSize) | 
|  | 239 | return Default; | 
|  | 240 |  | 
|  | 241 | return Requested; | 
|  | 242 | } | 
|  | 243 |  | 
// Attach !range metadata to a local-id or local-size intrinsic call so later
// passes know the bounds implied by the kernel's work group size. Returns
// true if metadata was attached, false if no upper bound could be found.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        // FALLTHROUGH — id and size queries share the dimension index.
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        // FALLTHROUGH — id and size queries share the dimension index.
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        // FALLTHROUGH — id and size queries share the dimension index.
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      // Dim is 0/1/2 for a recognized intrinsic, UINT_MAX otherwise.
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            // reqd_work_group_size pins the size exactly: Min == Max.
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  // No known upper bound — nothing useful to attach.
  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
|  | 302 |  | 
// R600 (pre-GCN) subtarget: delegates feature parsing to the common base and
// sets up R600-specific instruction info, frame lowering (private stack grows
// upward), and target lowering.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
|  | 309 |  | 
// GCN (SI and newer) subtarget: delegates feature parsing to the common base
// and sets up SI-specific instruction info, frame lowering (private stack
// grows upward), and target lowering.
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
| Matt Arsenault | 43e92fe | 2016-06-24 06:30:11 +0000 | [diff] [blame] | 316 |  | 
| Matt Arsenault | 43e92fe | 2016-06-24 06:30:11 +0000 | [diff] [blame] | 317 | void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, | 
| Matt Arsenault | 55dff27 | 2016-06-28 00:11:26 +0000 | [diff] [blame] | 318 | unsigned NumRegionInstrs) const { | 
| Matt Arsenault | 43e92fe | 2016-06-24 06:30:11 +0000 | [diff] [blame] | 319 | // Track register pressure so the scheduler can try to decrease | 
|  | 320 | // pressure once register usage is above the threshold defined by | 
|  | 321 | // SIRegisterInfo::getRegPressureSetLimit() | 
|  | 322 | Policy.ShouldTrackPressure = true; | 
| Tom Stellard | 83f0bce | 2015-01-29 16:55:25 +0000 | [diff] [blame] | 323 |  | 
| Matt Arsenault | 43e92fe | 2016-06-24 06:30:11 +0000 | [diff] [blame] | 324 | // Enabling both top down and bottom up scheduling seems to give us less | 
|  | 325 | // register spills than just using one of these approaches on its own. | 
|  | 326 | Policy.OnlyTopDown = false; | 
|  | 327 | Policy.OnlyBottomUp = false; | 
| Tom Stellard | 83f0bce | 2015-01-29 16:55:25 +0000 | [diff] [blame] | 328 |  | 
| Alexander Timofeev | 9f61fea | 2017-02-14 14:29:05 +0000 | [diff] [blame] | 329 | // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. | 
|  | 330 | if (!enableSIScheduler()) | 
|  | 331 | Policy.ShouldTrackLaneMasks = true; | 
| Matt Arsenault | 43e92fe | 2016-06-24 06:30:11 +0000 | [diff] [blame] | 332 | } | 
| Tom Stellard | 0bc954e | 2016-03-30 16:35:09 +0000 | [diff] [blame] | 333 |  | 
| Matt Arsenault | 43e92fe | 2016-06-24 06:30:11 +0000 | [diff] [blame] | 334 | bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { | 
|  | 335 | return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); | 
|  | 336 | } | 
| Tom Stellard | 0d23ebe | 2016-08-29 19:42:52 +0000 | [diff] [blame] | 337 |  | 
| Tom Stellard | 2f3f985 | 2017-01-25 01:25:13 +0000 | [diff] [blame] | 338 | unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF, | 
| Konstantin Zhuravlyov | 27d64c3 | 2017-02-08 13:29:23 +0000 | [diff] [blame] | 339 | unsigned ExplicitArgBytes) const { | 
| Tom Stellard | 2f3f985 | 2017-01-25 01:25:13 +0000 | [diff] [blame] | 340 | unsigned ImplicitBytes = getImplicitArgNumBytes(MF); | 
| Tom Stellard | e88bbc3 | 2016-09-23 01:33:26 +0000 | [diff] [blame] | 341 | if (ImplicitBytes == 0) | 
|  | 342 | return ExplicitArgBytes; | 
|  | 343 |  | 
|  | 344 | unsigned Alignment = getAlignmentForImplicitArgPtr(); | 
|  | 345 | return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; | 
|  | 346 | } | 
|  | 347 |  | 
| Tom Stellard | 0d23ebe | 2016-08-29 19:42:52 +0000 | [diff] [blame] | 348 | unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { | 
|  | 349 | if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { | 
|  | 350 | if (SGPRs <= 80) | 
|  | 351 | return 10; | 
|  | 352 | if (SGPRs <= 88) | 
|  | 353 | return 9; | 
|  | 354 | if (SGPRs <= 100) | 
|  | 355 | return 8; | 
|  | 356 | return 7; | 
|  | 357 | } | 
|  | 358 | if (SGPRs <= 48) | 
|  | 359 | return 10; | 
|  | 360 | if (SGPRs <= 56) | 
|  | 361 | return 9; | 
|  | 362 | if (SGPRs <= 64) | 
|  | 363 | return 8; | 
|  | 364 | if (SGPRs <= 72) | 
|  | 365 | return 7; | 
|  | 366 | if (SGPRs <= 80) | 
|  | 367 | return 6; | 
|  | 368 | return 5; | 
|  | 369 | } | 
|  | 370 |  | 
|  | 371 | unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { | 
|  | 372 | if (VGPRs <= 24) | 
|  | 373 | return 10; | 
|  | 374 | if (VGPRs <= 28) | 
|  | 375 | return 9; | 
|  | 376 | if (VGPRs <= 32) | 
|  | 377 | return 8; | 
|  | 378 | if (VGPRs <= 36) | 
|  | 379 | return 7; | 
|  | 380 | if (VGPRs <= 40) | 
|  | 381 | return 6; | 
|  | 382 | if (VGPRs <= 48) | 
|  | 383 | return 5; | 
|  | 384 | if (VGPRs <= 64) | 
|  | 385 | return 4; | 
|  | 386 | if (VGPRs <= 84) | 
|  | 387 | return 3; | 
|  | 388 | if (VGPRs <= 128) | 
|  | 389 | return 2; | 
|  | 390 | return 1; | 
|  | 391 | } | 
| Matt Arsenault | 4eae301 | 2016-10-28 20:31:47 +0000 | [diff] [blame] | 392 |  | 
| Konstantin Zhuravlyov | e03b1d7 | 2017-02-08 13:02:33 +0000 | [diff] [blame] | 393 | unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { | 
|  | 394 | const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); | 
|  | 395 | if (MFI.hasFlatScratchInit()) { | 
|  | 396 | if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) | 
|  | 397 | return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). | 
|  | 398 | if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) | 
|  | 399 | return 4; // FLAT_SCRATCH, VCC (in that order). | 
|  | 400 | } | 
|  | 401 |  | 
|  | 402 | if (isXNACKEnabled()) | 
|  | 403 | return 4; // XNACK, VCC (in that order). | 
|  | 404 | return 2; // VCC. | 
|  | 405 | } | 
|  | 406 |  | 
// Compute the maximum number of SGPRs available to this function, after
// honoring the "amdgpu-num-sgpr" attribute, waves-per-EU constraints, the
// SGPR-init hardware bug, and the SGPRs reserved for special registers.
// NOTE(review): Requested is adjusted between validation steps, so the
// statement order below is load-bearing.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 means "ignore the attribute" from here on.)
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware with the SGPR-init bug must assume a fixed SGPR allocation.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  // Leave room for the reserved special registers, and never exceed the
  // addressable SGPR count.
  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
| Matt Arsenault | 4eae301 | 2016-10-28 20:31:47 +0000 | [diff] [blame] | 456 |  | 
// Compute the maximum number of VGPRs available to this function, after
// honoring the "amdgpu-num-vgpr" attribute and the default/requested
// waves-per-EU constraints, minus any reserved VGPRs. Mirrors the SGPR
// logic in getMaxNumSGPRs above.
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 means "ignore the attribute" from here on.)
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  // Leave room for any reserved VGPRs.
  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}