| Stanislav Mekhanoshin | 3b7925f | 2019-05-01 16:49:31 +0000 | [diff] [blame] | 1 | //===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===// | 
|  | 2 | // | 
|  | 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|  | 4 | // See https://llvm.org/LICENSE.txt for license information. | 
|  | 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|  | 6 | // | 
|  | 7 | //===----------------------------------------------------------------------===// | 
|  | 8 | // | 
|  | 9 | /// \file | 
|  | 10 | /// \brief Try to reassign registers on GFX10+ to reduce register bank | 
|  | 11 | /// conflicts. | 
|  | 12 | /// | 
|  | 13 | /// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in | 
|  | 14 | /// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to | 
|  | 15 | /// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1, | 
|  | 16 | /// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc. | 
|  | 17 | /// | 
|  | 18 | /// The shader can read one dword from each of these banks once per cycle. | 
|  | 19 | /// If an instruction has to read more register operands from the same bank | 
|  | 20 | /// an additional cycle is needed. HW attempts to pre-load registers through | 
|  | 21 | /// input operand gathering, but a stall cycle may occur if that fails. For | 
|  | 22 | /// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands, | 
|  | 23 | /// potentially incuring 2 stall cycles. | 
|  | 24 | /// | 
|  | 25 | /// The pass tries to reassign registers to reduce bank conflicts. | 
|  | 26 | /// | 
|  | 27 | /// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so | 
|  | 28 | /// that 4 has to be subtracted from an SGPR bank number to get the real value. | 
|  | 29 | /// This also corresponds to bit numbers in bank masks used in the pass. | 
|  | 30 | /// | 
|  | 31 | //===----------------------------------------------------------------------===// | 
|  | 32 |  | 
|  | 33 | #include "AMDGPU.h" | 
|  | 34 | #include "AMDGPUSubtarget.h" | 
|  | 35 | #include "SIInstrInfo.h" | 
|  | 36 | #include "SIMachineFunctionInfo.h" | 
|  | 37 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" | 
|  | 38 | #include "llvm/ADT/SmallSet.h" | 
|  | 39 | #include "llvm/ADT/Statistic.h" | 
|  | 40 | #include "llvm/CodeGen/LiveInterval.h" | 
|  | 41 | #include "llvm/CodeGen/LiveIntervals.h" | 
|  | 42 | #include "llvm/CodeGen/LiveRegMatrix.h" | 
|  | 43 | #include "llvm/CodeGen/MachineFunctionPass.h" | 
|  | 44 | #include "llvm/CodeGen/MachineLoopInfo.h" | 
|  | 45 | #include "llvm/CodeGen/VirtRegMap.h" | 
|  | 46 | #include "llvm/Support/MathExtras.h" | 
|  | 47 |  | 
|  | 48 | using namespace llvm; | 
|  | 49 |  | 
|  | 50 | static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign", | 
|  | 51 | cl::desc("Verify stall cycles in the regbanks reassign pass"), | 
|  | 52 | cl::value_desc("0|1|2"), | 
|  | 53 | cl::init(0), cl::Hidden); | 
|  | 54 |  | 
|  | 55 | #define DEBUG_TYPE "amdgpu-regbanks-reassign" | 
|  | 56 |  | 
|  | 57 | #define NUM_VGPR_BANKS 4 | 
|  | 58 | #define NUM_SGPR_BANKS 8 | 
|  | 59 | #define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS) | 
|  | 60 | #define SGPR_BANK_OFFSET NUM_VGPR_BANKS | 
|  | 61 | #define VGPR_BANK_MASK 0xf | 
|  | 62 | #define SGPR_BANK_MASK 0xff0 | 
|  | 63 | #define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET) | 
|  | 64 |  | 
|  | 65 | STATISTIC(NumStallsDetected, | 
|  | 66 | "Number of operand read stalls detected"); | 
|  | 67 | STATISTIC(NumStallsRecovered, | 
|  | 68 | "Number of operand read stalls recovered"); | 
|  | 69 |  | 
|  | 70 | namespace { | 
|  | 71 |  | 
|  | 72 | class GCNRegBankReassign : public MachineFunctionPass { | 
|  | 73 |  | 
|  | 74 | class OperandMask { | 
|  | 75 | public: | 
|  | 76 | OperandMask(unsigned r, unsigned s, unsigned m) | 
|  | 77 | : Reg(r), SubReg(s), Mask(m) {} | 
|  | 78 | unsigned Reg; | 
|  | 79 | unsigned SubReg; | 
|  | 80 | unsigned Mask; | 
|  | 81 | }; | 
|  | 82 |  | 
|  | 83 | class Candidate { | 
|  | 84 | public: | 
|  | 85 | Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks, | 
|  | 86 | unsigned weight) | 
|  | 87 | : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {} | 
|  | 88 |  | 
|  | 89 | bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; } | 
|  | 90 |  | 
|  | 91 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) | 
|  | 92 | void dump(const GCNRegBankReassign *P) const { | 
|  | 93 | MI->dump(); | 
|  | 94 | dbgs() << P->printReg(Reg) << " to banks "; | 
|  | 95 | dumpFreeBanks(FreeBanks); | 
|  | 96 | dbgs() << " weight " << Weight << '\n'; | 
|  | 97 | } | 
|  | 98 | #endif | 
|  | 99 |  | 
|  | 100 | MachineInstr *MI; | 
|  | 101 | unsigned Reg; | 
|  | 102 | unsigned FreeBanks; | 
|  | 103 | unsigned Weight; | 
|  | 104 | }; | 
|  | 105 |  | 
|  | 106 | class CandidateList : public std::list<Candidate> { | 
|  | 107 | public: | 
|  | 108 | // Speedup subsequent sort. | 
|  | 109 | void push(const Candidate&& C) { | 
|  | 110 | if (C.Weight) push_back(C); | 
|  | 111 | else push_front(C); | 
|  | 112 | } | 
|  | 113 | }; | 
|  | 114 |  | 
|  | 115 | public: | 
|  | 116 | static char ID; | 
|  | 117 |  | 
|  | 118 | public: | 
|  | 119 | GCNRegBankReassign() : MachineFunctionPass(ID) { | 
|  | 120 | initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry()); | 
|  | 121 | } | 
|  | 122 |  | 
|  | 123 | bool runOnMachineFunction(MachineFunction &MF) override; | 
|  | 124 |  | 
|  | 125 | StringRef getPassName() const override { return "GCN RegBank Reassign"; } | 
|  | 126 |  | 
|  | 127 | void getAnalysisUsage(AnalysisUsage &AU) const override { | 
|  | 128 | AU.addRequired<MachineLoopInfo>(); | 
|  | 129 | AU.addRequired<LiveIntervals>(); | 
|  | 130 | AU.addRequired<VirtRegMap>(); | 
|  | 131 | AU.addRequired<LiveRegMatrix>(); | 
|  | 132 | AU.setPreservesAll(); | 
|  | 133 | MachineFunctionPass::getAnalysisUsage(AU); | 
|  | 134 | } | 
|  | 135 |  | 
|  | 136 | private: | 
|  | 137 | const GCNSubtarget *ST; | 
|  | 138 |  | 
|  | 139 | const MachineRegisterInfo *MRI; | 
|  | 140 |  | 
|  | 141 | const SIRegisterInfo *TRI; | 
|  | 142 |  | 
|  | 143 | MachineLoopInfo *MLI; | 
|  | 144 |  | 
|  | 145 | VirtRegMap *VRM; | 
|  | 146 |  | 
|  | 147 | LiveRegMatrix *LRM; | 
|  | 148 |  | 
|  | 149 | LiveIntervals *LIS; | 
|  | 150 |  | 
|  | 151 | unsigned MaxNumVGPRs; | 
|  | 152 |  | 
|  | 153 | unsigned MaxNumSGPRs; | 
|  | 154 |  | 
|  | 155 | BitVector RegsUsed; | 
|  | 156 |  | 
|  | 157 | SmallVector<OperandMask, 8> OperandMasks; | 
|  | 158 |  | 
|  | 159 | CandidateList Candidates; | 
|  | 160 |  | 
|  | 161 | const MCPhysReg *CSRegs; | 
|  | 162 |  | 
|  | 163 | // Returns bank for a phys reg. | 
|  | 164 | unsigned getPhysRegBank(unsigned Reg) const; | 
|  | 165 |  | 
|  | 166 | // Return a bit set for each register bank used. 4 banks for VGPRs and | 
|  | 167 | // 8 banks for SGPRs. | 
|  | 168 | // Registers already processed and recorded in RegsUsed are excluded. | 
|  | 169 | // If Bank is not -1 assume Reg:SubReg to belong to that Bank. | 
|  | 170 | unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank); | 
|  | 171 |  | 
|  | 172 | // Return number of stalls in the instructions. | 
|  | 173 | // UsedBanks has bits set for the banks used by all operands. | 
|  | 174 | // If Reg and Bank provided substitute the Reg with the Bank. | 
|  | 175 | unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks, | 
|  | 176 | unsigned Reg = AMDGPU::NoRegister, int Bank = -1); | 
|  | 177 |  | 
|  | 178 | // Return true if register is regular VGPR or SGPR or their tuples. | 
|  | 179 | // Returns false for special registers like m0, vcc etc. | 
|  | 180 | bool isReassignable(unsigned Reg) const; | 
|  | 181 |  | 
|  | 182 | // Check if registers' defs are old and may be pre-loaded. | 
|  | 183 | // Returns 0 if both registers are old enough, 1 or 2 if one or both | 
|  | 184 | // registers will not likely be pre-loaded. | 
|  | 185 | unsigned getOperandGatherWeight(const MachineInstr& MI, | 
|  | 186 | unsigned Reg1, | 
|  | 187 | unsigned Reg2, | 
|  | 188 | unsigned StallCycles) const; | 
|  | 189 |  | 
|  | 190 |  | 
|  | 191 | // Find all bank bits in UsedBanks where Mask can be relocated to. | 
|  | 192 | unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const; | 
|  | 193 |  | 
|  | 194 | // Find all bank bits in UsedBanks where Mask can be relocated to. | 
|  | 195 | // Bank is relative to the register and not its subregister component. | 
|  | 196 | // Returns 0 is a register is not reassignable. | 
|  | 197 | unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask, | 
|  | 198 | unsigned UsedBanks) const; | 
|  | 199 |  | 
|  | 200 | // Add cadidate instruction to the work list. | 
|  | 201 | void collectCandidates(MachineInstr& MI, unsigned UsedBanks, | 
|  | 202 | unsigned StallCycles); | 
|  | 203 |  | 
|  | 204 | // Collect cadidate instructions across function. Returns a number stall | 
|  | 205 | // cycles detected. Only counts stalls if Collect is false. | 
|  | 206 | unsigned collectCandidates(MachineFunction &MF, bool Collect = true); | 
|  | 207 |  | 
|  | 208 | // Remove all candidates that read specified register. | 
|  | 209 | void removeCandidates(unsigned Reg); | 
|  | 210 |  | 
|  | 211 | // Compute stalls within the uses of SrcReg replaced by a register from | 
|  | 212 | // Bank. If Bank is -1 does not perform substitution. If Collect is set | 
|  | 213 | // candidates are collected and added to work list. | 
|  | 214 | unsigned computeStallCycles(unsigned SrcReg, | 
|  | 215 | unsigned Reg = AMDGPU::NoRegister, | 
|  | 216 | int Bank = -1, bool Collect = false); | 
|  | 217 |  | 
|  | 218 | // Search for a register in Bank unused within LI. | 
|  | 219 | // Returns phys reg or NoRegister. | 
|  | 220 | unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const; | 
|  | 221 |  | 
|  | 222 | // Try to reassign candidate. Returns number or stall cycles saved. | 
|  | 223 | unsigned tryReassign(Candidate &C); | 
|  | 224 |  | 
|  | 225 | bool verifyCycles(MachineFunction &MF, | 
|  | 226 | unsigned OriginalCycles, unsigned CyclesSaved); | 
|  | 227 |  | 
|  | 228 |  | 
|  | 229 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) | 
|  | 230 | public: | 
|  | 231 | Printable printReg(unsigned Reg, unsigned SubReg = 0) const { | 
|  | 232 | return Printable([Reg, SubReg, this](raw_ostream &OS) { | 
|  | 233 | if (TargetRegisterInfo::isPhysicalRegister(Reg)) { | 
|  | 234 | OS << llvm::printReg(Reg, TRI); | 
|  | 235 | return; | 
|  | 236 | } | 
|  | 237 | if (!VRM->isAssignedReg(Reg)) | 
|  | 238 | OS << "<unassigned> " << llvm::printReg(Reg, TRI); | 
|  | 239 | else | 
|  | 240 | OS << llvm::printReg(Reg, TRI) << '(' | 
|  | 241 | << llvm::printReg(VRM->getPhys(Reg), TRI) << ')'; | 
|  | 242 | if (SubReg) | 
|  | 243 | OS << ':' << TRI->getSubRegIndexName(SubReg); | 
|  | 244 | }); | 
|  | 245 | } | 
|  | 246 |  | 
|  | 247 | static Printable printBank(unsigned Bank) { | 
|  | 248 | return Printable([Bank](raw_ostream &OS) { | 
|  | 249 | OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank); | 
|  | 250 | }); | 
|  | 251 | } | 
|  | 252 |  | 
|  | 253 | static void dumpFreeBanks(unsigned FreeBanks) { | 
|  | 254 | for (unsigned L = 0; L < NUM_BANKS; ++L) | 
|  | 255 | if (FreeBanks & (1 << L)) | 
|  | 256 | dbgs() << printBank(L) << ' '; | 
|  | 257 | } | 
|  | 258 | #endif | 
|  | 259 | }; | 
|  | 260 |  | 
|  | 261 | } // End anonymous namespace. | 
|  | 262 |  | 
|  | 263 | INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", | 
|  | 264 | false, false) | 
|  | 265 | INITIALIZE_PASS_DEPENDENCY(LiveIntervals) | 
|  | 266 | INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) | 
|  | 267 | INITIALIZE_PASS_DEPENDENCY(VirtRegMap) | 
|  | 268 | INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) | 
|  | 269 | INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", | 
|  | 270 | false, false) | 
|  | 271 |  | 
|  | 272 |  | 
|  | 273 | char GCNRegBankReassign::ID = 0; | 
|  | 274 |  | 
|  | 275 | char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; | 
|  | 276 |  | 
|  | 277 | unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { | 
|  | 278 | assert (TargetRegisterInfo::isPhysicalRegister(Reg)); | 
|  | 279 |  | 
|  | 280 | const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); | 
|  | 281 | unsigned Size = TRI->getRegSizeInBits(*RC); | 
|  | 282 | if (Size > 32) | 
|  | 283 | Reg = TRI->getSubReg(Reg, AMDGPU::sub0); | 
|  | 284 |  | 
|  | 285 | if (TRI->hasVGPRs(RC)) { | 
|  | 286 | Reg -= AMDGPU::VGPR0; | 
|  | 287 | return Reg % NUM_VGPR_BANKS; | 
|  | 288 | } | 
|  | 289 |  | 
|  | 290 | Reg = TRI->getEncodingValue(Reg) / 2; | 
|  | 291 | return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET; | 
|  | 292 | } | 
|  | 293 |  | 
|  | 294 | unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg, | 
|  | 295 | int Bank) { | 
|  | 296 | if (TargetRegisterInfo::isVirtualRegister(Reg)) { | 
|  | 297 | if (!VRM->isAssignedReg(Reg)) | 
|  | 298 | return 0; | 
|  | 299 |  | 
|  | 300 | Reg = VRM->getPhys(Reg); | 
|  | 301 | if (!Reg) | 
|  | 302 | return 0; | 
|  | 303 | if (SubReg) | 
|  | 304 | Reg = TRI->getSubReg(Reg, SubReg); | 
|  | 305 | } | 
|  | 306 |  | 
|  | 307 | const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); | 
|  | 308 | unsigned Size = TRI->getRegSizeInBits(*RC) / 32; | 
|  | 309 | if (Size > 1) | 
|  | 310 | Reg = TRI->getSubReg(Reg, AMDGPU::sub0); | 
|  | 311 |  | 
|  | 312 | if (TRI->hasVGPRs(RC)) { | 
|  | 313 | // VGPRs have 4 banks assigned in a round-robin fashion. | 
|  | 314 | Reg -= AMDGPU::VGPR0; | 
|  | 315 | unsigned Mask = (1 << Size) - 1; | 
|  | 316 | unsigned Used = 0; | 
|  | 317 | // Bitmask lacks an extract method | 
|  | 318 | for (unsigned I = 0; I < Size; ++I) | 
|  | 319 | if (RegsUsed.test(Reg + I)) | 
|  | 320 | Used |= 1 << I; | 
|  | 321 | RegsUsed.set(Reg, Reg + Size); | 
|  | 322 | Mask &= ~Used; | 
|  | 323 | Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank); | 
|  | 324 | return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; | 
|  | 325 | } | 
|  | 326 |  | 
|  | 327 | // SGPRs have 8 banks holding 2 consequitive registers each. | 
|  | 328 | Reg = TRI->getEncodingValue(Reg) / 2; | 
|  | 329 | unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs(); | 
|  | 330 | if (Reg + StartBit >= RegsUsed.size()) | 
|  | 331 | return 0; | 
|  | 332 |  | 
|  | 333 | if (Size > 1) | 
|  | 334 | Size /= 2; | 
|  | 335 | unsigned Mask = (1 << Size) - 1; | 
|  | 336 | unsigned Used = 0; | 
|  | 337 | for (unsigned I = 0; I < Size; ++I) | 
|  | 338 | if (RegsUsed.test(StartBit + Reg + I)) | 
|  | 339 | Used |= 1 << I; | 
|  | 340 | RegsUsed.set(StartBit + Reg, StartBit + Reg + Size); | 
|  | 341 | Mask &= ~Used; | 
|  | 342 | Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS | 
|  | 343 | : unsigned(Bank - SGPR_BANK_OFFSET); | 
|  | 344 | Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; | 
|  | 345 | // Reserve 4 bank ids for VGPRs. | 
|  | 346 | return Mask << SGPR_BANK_OFFSET; | 
|  | 347 | } | 
|  | 348 |  | 
|  | 349 | unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI, | 
|  | 350 | unsigned& UsedBanks, | 
|  | 351 | unsigned Reg, | 
|  | 352 | int Bank) { | 
|  | 353 | unsigned StallCycles = 0; | 
|  | 354 | UsedBanks = 0; | 
|  | 355 |  | 
|  | 356 | if (MI.isDebugValue()) | 
|  | 357 | return 0; | 
|  | 358 |  | 
|  | 359 | RegsUsed.reset(); | 
|  | 360 | OperandMasks.clear(); | 
|  | 361 | for (const auto& Op : MI.explicit_uses()) { | 
|  | 362 | // Undef can be assigned to any register, so two vregs can be assigned | 
|  | 363 | // the same phys reg within the same instruction. | 
|  | 364 | if (!Op.isReg() || Op.isUndef()) | 
|  | 365 | continue; | 
|  | 366 |  | 
|  | 367 | unsigned R = Op.getReg(); | 
|  | 368 | unsigned ShiftedBank = Bank; | 
|  | 369 |  | 
|  | 370 | if (Bank != -1 && R == Reg && Op.getSubReg()) { | 
|  | 371 | unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger(); | 
|  | 372 | if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) { | 
|  | 373 | // If a register spans all banks we cannot shift it to avoid conflict. | 
|  | 374 | if (countPopulation(LM) >= NUM_VGPR_BANKS) | 
|  | 375 | continue; | 
|  | 376 | ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS; | 
|  | 377 | } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) { | 
|  | 378 | // If a register spans all banks we cannot shift it to avoid conflict. | 
|  | 379 | if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS) | 
|  | 380 | continue; | 
|  | 381 | ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET + | 
|  | 382 | (countTrailingZeros(LM) >> 1)) % | 
|  | 383 | NUM_SGPR_BANKS; | 
|  | 384 | } | 
|  | 385 | } | 
|  | 386 |  | 
|  | 387 | unsigned Mask = getRegBankMask(R, Op.getSubReg(), | 
|  | 388 | (Reg == R) ? ShiftedBank : -1); | 
|  | 389 | StallCycles += countPopulation(UsedBanks & Mask); | 
|  | 390 | UsedBanks |= Mask; | 
|  | 391 | OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask)); | 
|  | 392 | } | 
|  | 393 |  | 
|  | 394 | return StallCycles; | 
|  | 395 | } | 
|  | 396 |  | 
|  | 397 | unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI, | 
|  | 398 | unsigned Reg1, | 
|  | 399 | unsigned Reg2, | 
|  | 400 | unsigned StallCycles) const | 
|  | 401 | { | 
|  | 402 | unsigned Defs = 0; | 
|  | 403 | MachineBasicBlock::const_instr_iterator Def(MI.getIterator()); | 
|  | 404 | MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin()); | 
|  | 405 | for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) { | 
|  | 406 | if (MI.isDebugInstr()) | 
|  | 407 | continue; | 
|  | 408 | --Def; | 
|  | 409 | if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF) | 
|  | 410 | continue; | 
|  | 411 | if (Def->modifiesRegister(Reg1, TRI)) | 
|  | 412 | Defs |= 1; | 
|  | 413 | if (Def->modifiesRegister(Reg2, TRI)) | 
|  | 414 | Defs |= 2; | 
|  | 415 | } | 
|  | 416 | return countPopulation(Defs); | 
|  | 417 | } | 
|  | 418 |  | 
|  | 419 | bool GCNRegBankReassign::isReassignable(unsigned Reg) const { | 
|  | 420 | if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) | 
|  | 421 | return false; | 
|  | 422 |  | 
|  | 423 | const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); | 
|  | 424 |  | 
|  | 425 | unsigned PhysReg = VRM->getPhys(Reg); | 
|  | 426 |  | 
|  | 427 | if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) | 
|  | 428 | return false; | 
|  | 429 |  | 
|  | 430 | for (auto U : MRI->use_nodbg_operands(Reg)) { | 
|  | 431 | if (U.isImplicit()) | 
|  | 432 | return false; | 
|  | 433 | const MachineInstr *UseInst = U.getParent(); | 
|  | 434 | if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) | 
|  | 435 | return false; | 
|  | 436 | } | 
|  | 437 |  | 
|  | 438 | const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg); | 
|  | 439 | if (TRI->hasVGPRs(RC)) | 
|  | 440 | return true; | 
|  | 441 |  | 
|  | 442 | unsigned Size = TRI->getRegSizeInBits(*RC); | 
|  | 443 | if (Size > 32) | 
|  | 444 | PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0); | 
|  | 445 |  | 
|  | 446 | return AMDGPU::SGPR_32RegClass.contains(PhysReg); | 
|  | 447 | } | 
|  | 448 |  | 
|  | 449 | unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask, | 
|  | 450 | unsigned UsedBanks) const { | 
|  | 451 | unsigned Size = countPopulation(Mask); | 
|  | 452 | unsigned FreeBanks = 0; | 
|  | 453 | unsigned Bank = findFirstSet(Mask); | 
|  | 454 |  | 
|  | 455 | UsedBanks &= ~Mask; | 
|  | 456 |  | 
|  | 457 | // Find free VGPR banks | 
|  | 458 | if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) { | 
|  | 459 | for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) { | 
|  | 460 | if (Bank == I) | 
|  | 461 | continue; | 
|  | 462 | unsigned NewMask = ((1 << Size) - 1) << I; | 
|  | 463 | NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK; | 
|  | 464 | if (!(UsedBanks & NewMask)) | 
|  | 465 | FreeBanks |= 1 << I; | 
|  | 466 | } | 
|  | 467 | return FreeBanks; | 
|  | 468 | } | 
|  | 469 |  | 
|  | 470 | // Find free SGPR banks | 
|  | 471 | // SGPR tuples must be aligned, so step is size in banks it | 
|  | 472 | // crosses. | 
|  | 473 | Bank -= SGPR_BANK_OFFSET; | 
|  | 474 | for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) { | 
|  | 475 | if (Bank == I) | 
|  | 476 | continue; | 
|  | 477 | unsigned NewMask = ((1 << Size) - 1) << I; | 
|  | 478 | NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK; | 
|  | 479 | if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET))) | 
|  | 480 | FreeBanks |= (1 << SGPR_BANK_OFFSET) << I; | 
|  | 481 | } | 
|  | 482 |  | 
|  | 483 | return FreeBanks; | 
|  | 484 | } | 
|  | 485 |  | 
|  | 486 | unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg, | 
|  | 487 | unsigned SubReg, | 
|  | 488 | unsigned Mask, | 
|  | 489 | unsigned UsedBanks) const { | 
|  | 490 | if (!isReassignable(Reg)) | 
|  | 491 | return 0; | 
|  | 492 |  | 
|  | 493 | unsigned FreeBanks = getFreeBanks(Mask, UsedBanks); | 
|  | 494 |  | 
|  | 495 | unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger(); | 
|  | 496 | if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) { | 
|  | 497 | unsigned Shift = countTrailingZeros(LM); | 
|  | 498 | if (Shift >= NUM_VGPR_BANKS) | 
|  | 499 | return 0; | 
|  | 500 | unsigned VB = FreeBanks & VGPR_BANK_MASK; | 
|  | 501 | FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) & | 
|  | 502 | VGPR_BANK_MASK; | 
|  | 503 | } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) { | 
|  | 504 | unsigned Shift = countTrailingZeros(LM) >> 1; | 
|  | 505 | if (Shift >= NUM_SGPR_BANKS) | 
|  | 506 | return 0; | 
|  | 507 | unsigned SB = FreeBanks >> SGPR_BANK_OFFSET; | 
|  | 508 | FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) & | 
|  | 509 | SGPR_BANK_SHIFTED_MASK; | 
|  | 510 | FreeBanks <<= SGPR_BANK_OFFSET; | 
|  | 511 | } | 
|  | 512 |  | 
|  | 513 | LLVM_DEBUG(if (FreeBanks) { | 
|  | 514 | dbgs() << "Potential reassignments of " << printReg(Reg, SubReg) | 
|  | 515 | << " to banks: "; dumpFreeBanks(FreeBanks); | 
|  | 516 | dbgs() << '\n'; }); | 
|  | 517 |  | 
|  | 518 | return FreeBanks; | 
|  | 519 | } | 
|  | 520 |  | 
|  | 521 | void GCNRegBankReassign::collectCandidates(MachineInstr& MI, | 
|  | 522 | unsigned UsedBanks, | 
|  | 523 | unsigned StallCycles) { | 
|  | 524 | LLVM_DEBUG(MI.dump()); | 
|  | 525 |  | 
|  | 526 | if (!StallCycles) | 
|  | 527 | return; | 
|  | 528 |  | 
|  | 529 | LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n'); | 
|  | 530 |  | 
|  | 531 | for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) { | 
|  | 532 | for (unsigned J = I + 1; J != E; ++J) { | 
|  | 533 | if (!(OperandMasks[I].Mask & OperandMasks[J].Mask)) | 
|  | 534 | continue; | 
|  | 535 |  | 
|  | 536 | unsigned Reg1 = OperandMasks[I].Reg; | 
|  | 537 | unsigned Reg2 = OperandMasks[J].Reg; | 
|  | 538 | unsigned SubReg1 = OperandMasks[I].SubReg; | 
|  | 539 | unsigned SubReg2 = OperandMasks[J].SubReg; | 
|  | 540 | unsigned Mask1 = OperandMasks[I].Mask; | 
|  | 541 | unsigned Mask2 = OperandMasks[J].Mask; | 
|  | 542 | unsigned Size1 = countPopulation(Mask1); | 
|  | 543 | unsigned Size2 = countPopulation(Mask2); | 
|  | 544 |  | 
|  | 545 | LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) << | 
|  | 546 | " and " << printReg(Reg2, SubReg2) << '\n'); | 
|  | 547 |  | 
|  | 548 | unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles); | 
|  | 549 | Weight += MLI->getLoopDepth(MI.getParent()) * 10; | 
|  | 550 |  | 
|  | 551 | LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n'); | 
|  | 552 |  | 
|  | 553 | unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); | 
|  | 554 | unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); | 
|  | 555 | if (FreeBanks1) | 
|  | 556 | Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight | 
|  | 557 | + ((Size2 > Size1) ? 1 : 0))); | 
|  | 558 | if (FreeBanks2) | 
|  | 559 | Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight | 
|  | 560 | + ((Size1 > Size2) ? 1 : 0))); | 
|  | 561 | } | 
|  | 562 | } | 
|  | 563 | } | 
|  | 564 |  | 
|  | 565 | unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, | 
|  | 566 | unsigned Reg, int Bank, | 
|  | 567 | bool Collect) { | 
|  | 568 | unsigned TotalStallCycles = 0; | 
|  | 569 | unsigned UsedBanks = 0; | 
|  | 570 | SmallSet<const MachineInstr *, 16> Visited; | 
|  | 571 |  | 
|  | 572 | for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) { | 
|  | 573 | if (MI.isBundle()) | 
|  | 574 | continue; | 
|  | 575 | if (!Visited.insert(&MI).second) | 
|  | 576 | continue; | 
|  | 577 | unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank); | 
|  | 578 | TotalStallCycles += StallCycles; | 
|  | 579 | if (Collect) | 
|  | 580 | collectCandidates(MI, UsedBanks, StallCycles); | 
|  | 581 | } | 
|  | 582 |  | 
|  | 583 | return TotalStallCycles; | 
|  | 584 | } | 
|  | 585 |  | 
|  | 586 | unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI, | 
|  | 587 | unsigned Bank) const { | 
|  | 588 | const TargetRegisterClass *RC = MRI->getRegClass(LI.reg); | 
|  | 589 | unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs | 
|  | 590 | : MaxNumSGPRs; | 
|  | 591 | unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0 | 
|  | 592 | : AMDGPU::SGPR0); | 
|  | 593 |  | 
|  | 594 | for (unsigned Reg : RC->getRegisters()) { | 
|  | 595 | // Check occupancy limit. | 
|  | 596 | if (TRI->isSubRegisterEq(Reg, MaxReg)) | 
|  | 597 | break; | 
|  | 598 |  | 
|  | 599 | if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank) | 
|  | 600 | continue; | 
|  | 601 |  | 
|  | 602 | for (unsigned I = 0; CSRegs[I]; ++I) | 
|  | 603 | if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && | 
|  | 604 | !LRM->isPhysRegUsed(CSRegs[I])) | 
|  | 605 | return AMDGPU::NoRegister; | 
|  | 606 |  | 
|  | 607 | LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n'); | 
|  | 608 |  | 
|  | 609 | if (!LRM->checkInterference(LI, Reg)) | 
|  | 610 | return Reg; | 
|  | 611 | } | 
|  | 612 |  | 
|  | 613 | return AMDGPU::NoRegister; | 
|  | 614 | } | 
|  | 615 |  | 
|  | 616 | unsigned GCNRegBankReassign::tryReassign(Candidate &C) { | 
|  | 617 | if (!LIS->hasInterval(C.Reg)) | 
|  | 618 | return 0; | 
|  | 619 |  | 
|  | 620 | LiveInterval &LI = LIS->getInterval(C.Reg); | 
|  | 621 | LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump(); | 
|  | 622 | LI.dump()); | 
|  | 623 |  | 
|  | 624 | // For each candidate bank walk all instructions in the range of live | 
|  | 625 | // interval and check if replacing the register with one belonging to | 
|  | 626 | // the candidate bank reduces conflicts. | 
|  | 627 |  | 
|  | 628 | unsigned OrigStalls = computeStallCycles(C.Reg); | 
|  | 629 | LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n'); | 
|  | 630 | if (!OrigStalls) | 
|  | 631 | return 0; | 
|  | 632 |  | 
|  | 633 | struct BankStall { | 
|  | 634 | BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {}; | 
|  | 635 | bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; } | 
|  | 636 | unsigned Bank; | 
|  | 637 | unsigned Stalls; | 
|  | 638 | }; | 
|  | 639 | SmallVector<BankStall, 8> BankStalls; | 
|  | 640 |  | 
|  | 641 | for (int Bank = 0; Bank < NUM_BANKS; ++Bank) { | 
|  | 642 | if (C.FreeBanks & (1 << Bank)) { | 
|  | 643 | LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n'); | 
|  | 644 | unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank); | 
|  | 645 | if (Stalls < OrigStalls) { | 
|  | 646 | LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> " | 
|  | 647 | << Stalls << '\n'); | 
|  | 648 | BankStalls.push_back(BankStall((unsigned)Bank, Stalls)); | 
|  | 649 | } | 
|  | 650 | } | 
|  | 651 | } | 
| Galina Kistanova | ed49f6d | 2019-05-22 20:42:56 +0000 | [diff] [blame] | 652 | std::sort(BankStalls.begin(), BankStalls.end()); | 
| Stanislav Mekhanoshin | 3b7925f | 2019-05-01 16:49:31 +0000 | [diff] [blame] | 653 |  | 
|  | 654 | unsigned OrigReg = VRM->getPhys(C.Reg); | 
|  | 655 | LRM->unassign(LI); | 
|  | 656 | while (!BankStalls.empty()) { | 
|  | 657 | BankStall BS = BankStalls.pop_back_val(); | 
|  | 658 | unsigned Reg = scavengeReg(LI, BS.Bank); | 
|  | 659 | if (Reg == AMDGPU::NoRegister) { | 
|  | 660 | LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank) | 
|  | 661 | << '\n'); | 
|  | 662 | continue; | 
|  | 663 | } | 
|  | 664 | LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg) | 
|  | 665 | << (LRM->isPhysRegUsed(Reg) ? "" : " (new)") | 
|  | 666 | << " in bank " << printBank(BS.Bank) << '\n'); | 
|  | 667 |  | 
|  | 668 | LRM->assign(LI, Reg); | 
|  | 669 |  | 
|  | 670 | LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n'); | 
|  | 671 |  | 
|  | 672 | return OrigStalls - BS.Stalls; | 
|  | 673 | } | 
|  | 674 | LRM->assign(LI, OrigReg); | 
|  | 675 |  | 
|  | 676 | return 0; | 
|  | 677 | } | 
|  | 678 |  | 
|  | 679 | unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF, | 
|  | 680 | bool Collect) { | 
|  | 681 | unsigned TotalStallCycles = 0; | 
|  | 682 |  | 
|  | 683 | for (MachineBasicBlock &MBB : MF) { | 
|  | 684 |  | 
|  | 685 | LLVM_DEBUG(if (Collect) { | 
|  | 686 | if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber(); | 
|  | 687 | else dbgs() << MBB.getName(); dbgs() << ":\n"; | 
|  | 688 | }); | 
|  | 689 |  | 
|  | 690 | for (MachineInstr &MI : MBB.instrs()) { | 
|  | 691 | if (MI.isBundle()) | 
|  | 692 | continue; // we analyze the instructions inside the bundle individually | 
|  | 693 |  | 
|  | 694 | unsigned UsedBanks = 0; | 
|  | 695 | unsigned StallCycles = analyzeInst(MI, UsedBanks); | 
|  | 696 |  | 
|  | 697 | if (Collect) | 
|  | 698 | collectCandidates(MI, UsedBanks, StallCycles); | 
|  | 699 |  | 
|  | 700 | TotalStallCycles += StallCycles; | 
|  | 701 | } | 
|  | 702 |  | 
|  | 703 | LLVM_DEBUG(if (Collect) { dbgs() << '\n'; }); | 
|  | 704 | } | 
|  | 705 |  | 
|  | 706 | return TotalStallCycles; | 
|  | 707 | } | 
|  | 708 |  | 
|  | 709 | void GCNRegBankReassign::removeCandidates(unsigned Reg) { | 
|  | 710 | Candidates.remove_if([Reg, this](const Candidate& C) { | 
|  | 711 | return C.MI->readsRegister(Reg, TRI); | 
|  | 712 | }); | 
|  | 713 | } | 
|  | 714 |  | 
|  | 715 | bool GCNRegBankReassign::verifyCycles(MachineFunction &MF, | 
|  | 716 | unsigned OriginalCycles, | 
|  | 717 | unsigned CyclesSaved) { | 
|  | 718 | unsigned StallCycles = collectCandidates(MF, false); | 
|  | 719 | LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles | 
|  | 720 | << " stall cycles left\n"); | 
|  | 721 | return StallCycles + CyclesSaved == OriginalCycles; | 
|  | 722 | } | 
|  | 723 |  | 
|  | 724 | bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { | 
|  | 725 | ST = &MF.getSubtarget<GCNSubtarget>(); | 
|  | 726 | if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction())) | 
|  | 727 | return false; | 
|  | 728 |  | 
|  | 729 | MRI = &MF.getRegInfo(); | 
|  | 730 | TRI = ST->getRegisterInfo(); | 
|  | 731 | MLI = &getAnalysis<MachineLoopInfo>(); | 
|  | 732 | VRM = &getAnalysis<VirtRegMap>(); | 
|  | 733 | LRM = &getAnalysis<LiveRegMatrix>(); | 
|  | 734 | LIS = &getAnalysis<LiveIntervals>(); | 
|  | 735 |  | 
|  | 736 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | 
|  | 737 | unsigned Occupancy = MFI->getOccupancy(); | 
|  | 738 | MaxNumVGPRs = ST->getMaxNumVGPRs(MF); | 
|  | 739 | MaxNumSGPRs = ST->getMaxNumSGPRs(MF); | 
|  | 740 | MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs); | 
|  | 741 | MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs); | 
|  | 742 |  | 
| Matt Arsenault | e0b8443 | 2019-06-26 13:39:29 +0000 | [diff] [blame] | 743 | CSRegs = MRI->getCalleeSavedRegs(); | 
| Stanislav Mekhanoshin | 3b7925f | 2019-05-01 16:49:31 +0000 | [diff] [blame] | 744 |  | 
|  | 745 | RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() + | 
|  | 746 | TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1); | 
|  | 747 |  | 
|  | 748 | LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName() | 
|  | 749 | << '\n'); | 
|  | 750 |  | 
|  | 751 | unsigned StallCycles = collectCandidates(MF); | 
|  | 752 | NumStallsDetected += StallCycles; | 
|  | 753 |  | 
|  | 754 | LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in " | 
|  | 755 | "function " << MF.getName() << '\n'); | 
|  | 756 |  | 
|  | 757 | Candidates.sort(); | 
|  | 758 |  | 
|  | 759 | LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; | 
|  | 760 | for (auto C : Candidates) C.dump(this); | 
|  | 761 | dbgs() << "\n\n"); | 
|  | 762 |  | 
|  | 763 | unsigned CyclesSaved = 0; | 
|  | 764 | while (!Candidates.empty()) { | 
|  | 765 | Candidate C = Candidates.back(); | 
|  | 766 | unsigned LocalCyclesSaved = tryReassign(C); | 
|  | 767 | CyclesSaved += LocalCyclesSaved; | 
|  | 768 |  | 
|  | 769 | if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) | 
|  | 770 | report_fatal_error("RegBank reassign stall cycles verification failed."); | 
|  | 771 |  | 
|  | 772 | Candidates.pop_back(); | 
|  | 773 | if (LocalCyclesSaved) { | 
|  | 774 | removeCandidates(C.Reg); | 
|  | 775 | computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true); | 
|  | 776 | Candidates.sort(); | 
|  | 777 |  | 
|  | 778 | LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; | 
|  | 779 | for (auto C : Candidates) | 
|  | 780 | C.dump(this); | 
|  | 781 | dbgs() << "\n\n"); | 
|  | 782 | } | 
|  | 783 | } | 
|  | 784 | NumStallsRecovered += CyclesSaved; | 
|  | 785 |  | 
|  | 786 | LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved | 
|  | 787 | << " cycles saved in function " << MF.getName() << '\n'); | 
|  | 788 |  | 
|  | 789 | Candidates.clear(); | 
|  | 790 |  | 
|  | 791 | if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved)) | 
|  | 792 | report_fatal_error("RegBank reassign stall cycles verification failed."); | 
|  | 793 |  | 
|  | 794 | RegsUsed.clear(); | 
|  | 795 |  | 
|  | 796 | return CyclesSaved > 0; | 
|  | 797 | } |