blob: f0d47eaa4ed1f288e6d9874f355a3b937312b9e5 [file] [log] [blame]
Stanislav Mekhanoshin3b7925f2019-05-01 16:49:31 +00001//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// \brief Try to reassign registers on GFX10+ to reduce register bank
11/// conflicts.
12///
13/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
14/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
15/// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1,
16/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc.
17///
18/// The shader can read one dword from each of these banks once per cycle.
19/// If an instruction has to read more register operands from the same bank
20/// an additional cycle is needed. HW attempts to pre-load registers through
21/// input operand gathering, but a stall cycle may occur if that fails. For
22/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
23/// potentially incuring 2 stall cycles.
24///
25/// The pass tries to reassign registers to reduce bank conflicts.
26///
27/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
28/// that 4 has to be subtracted from an SGPR bank number to get the real value.
29/// This also corresponds to bit numbers in bank masks used in the pass.
30///
31//===----------------------------------------------------------------------===//
32
33#include "AMDGPU.h"
34#include "AMDGPUSubtarget.h"
35#include "SIInstrInfo.h"
36#include "SIMachineFunctionInfo.h"
37#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
38#include "llvm/ADT/SmallSet.h"
39#include "llvm/ADT/Statistic.h"
40#include "llvm/CodeGen/LiveInterval.h"
41#include "llvm/CodeGen/LiveIntervals.h"
42#include "llvm/CodeGen/LiveRegMatrix.h"
43#include "llvm/CodeGen/MachineFunctionPass.h"
44#include "llvm/CodeGen/MachineLoopInfo.h"
45#include "llvm/CodeGen/VirtRegMap.h"
46#include "llvm/Support/MathExtras.h"
47
48using namespace llvm;
49
50static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
51 cl::desc("Verify stall cycles in the regbanks reassign pass"),
52 cl::value_desc("0|1|2"),
53 cl::init(0), cl::Hidden);
54
55#define DEBUG_TYPE "amdgpu-regbanks-reassign"
56
57#define NUM_VGPR_BANKS 4
58#define NUM_SGPR_BANKS 8
59#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
60#define SGPR_BANK_OFFSET NUM_VGPR_BANKS
61#define VGPR_BANK_MASK 0xf
62#define SGPR_BANK_MASK 0xff0
63#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)
64
65STATISTIC(NumStallsDetected,
66 "Number of operand read stalls detected");
67STATISTIC(NumStallsRecovered,
68 "Number of operand read stalls recovered");
69
70namespace {
71
72class GCNRegBankReassign : public MachineFunctionPass {
73
74 class OperandMask {
75 public:
76 OperandMask(unsigned r, unsigned s, unsigned m)
77 : Reg(r), SubReg(s), Mask(m) {}
78 unsigned Reg;
79 unsigned SubReg;
80 unsigned Mask;
81 };
82
83 class Candidate {
84 public:
85 Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks,
86 unsigned weight)
87 : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {}
88
89 bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
90
91#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
92 void dump(const GCNRegBankReassign *P) const {
93 MI->dump();
94 dbgs() << P->printReg(Reg) << " to banks ";
95 dumpFreeBanks(FreeBanks);
96 dbgs() << " weight " << Weight << '\n';
97 }
98#endif
99
100 MachineInstr *MI;
101 unsigned Reg;
102 unsigned FreeBanks;
103 unsigned Weight;
104 };
105
106 class CandidateList : public std::list<Candidate> {
107 public:
108 // Speedup subsequent sort.
109 void push(const Candidate&& C) {
110 if (C.Weight) push_back(C);
111 else push_front(C);
112 }
113 };
114
115public:
116 static char ID;
117
118public:
119 GCNRegBankReassign() : MachineFunctionPass(ID) {
120 initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry());
121 }
122
123 bool runOnMachineFunction(MachineFunction &MF) override;
124
125 StringRef getPassName() const override { return "GCN RegBank Reassign"; }
126
127 void getAnalysisUsage(AnalysisUsage &AU) const override {
128 AU.addRequired<MachineLoopInfo>();
129 AU.addRequired<LiveIntervals>();
130 AU.addRequired<VirtRegMap>();
131 AU.addRequired<LiveRegMatrix>();
132 AU.setPreservesAll();
133 MachineFunctionPass::getAnalysisUsage(AU);
134 }
135
136private:
137 const GCNSubtarget *ST;
138
139 const MachineRegisterInfo *MRI;
140
141 const SIRegisterInfo *TRI;
142
143 MachineLoopInfo *MLI;
144
145 VirtRegMap *VRM;
146
147 LiveRegMatrix *LRM;
148
149 LiveIntervals *LIS;
150
151 unsigned MaxNumVGPRs;
152
153 unsigned MaxNumSGPRs;
154
155 BitVector RegsUsed;
156
157 SmallVector<OperandMask, 8> OperandMasks;
158
159 CandidateList Candidates;
160
161 const MCPhysReg *CSRegs;
162
163 // Returns bank for a phys reg.
164 unsigned getPhysRegBank(unsigned Reg) const;
165
166 // Return a bit set for each register bank used. 4 banks for VGPRs and
167 // 8 banks for SGPRs.
168 // Registers already processed and recorded in RegsUsed are excluded.
169 // If Bank is not -1 assume Reg:SubReg to belong to that Bank.
170 unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank);
171
172 // Return number of stalls in the instructions.
173 // UsedBanks has bits set for the banks used by all operands.
174 // If Reg and Bank provided substitute the Reg with the Bank.
175 unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks,
176 unsigned Reg = AMDGPU::NoRegister, int Bank = -1);
177
178 // Return true if register is regular VGPR or SGPR or their tuples.
179 // Returns false for special registers like m0, vcc etc.
180 bool isReassignable(unsigned Reg) const;
181
182 // Check if registers' defs are old and may be pre-loaded.
183 // Returns 0 if both registers are old enough, 1 or 2 if one or both
184 // registers will not likely be pre-loaded.
185 unsigned getOperandGatherWeight(const MachineInstr& MI,
186 unsigned Reg1,
187 unsigned Reg2,
188 unsigned StallCycles) const;
189
190
191 // Find all bank bits in UsedBanks where Mask can be relocated to.
192 unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;
193
194 // Find all bank bits in UsedBanks where Mask can be relocated to.
195 // Bank is relative to the register and not its subregister component.
196 // Returns 0 is a register is not reassignable.
197 unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask,
198 unsigned UsedBanks) const;
199
200 // Add cadidate instruction to the work list.
201 void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
202 unsigned StallCycles);
203
204 // Collect cadidate instructions across function. Returns a number stall
205 // cycles detected. Only counts stalls if Collect is false.
206 unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
207
208 // Remove all candidates that read specified register.
209 void removeCandidates(unsigned Reg);
210
211 // Compute stalls within the uses of SrcReg replaced by a register from
212 // Bank. If Bank is -1 does not perform substitution. If Collect is set
213 // candidates are collected and added to work list.
214 unsigned computeStallCycles(unsigned SrcReg,
215 unsigned Reg = AMDGPU::NoRegister,
216 int Bank = -1, bool Collect = false);
217
218 // Search for a register in Bank unused within LI.
219 // Returns phys reg or NoRegister.
220 unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const;
221
222 // Try to reassign candidate. Returns number or stall cycles saved.
223 unsigned tryReassign(Candidate &C);
224
225 bool verifyCycles(MachineFunction &MF,
226 unsigned OriginalCycles, unsigned CyclesSaved);
227
228
229#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
230public:
231 Printable printReg(unsigned Reg, unsigned SubReg = 0) const {
232 return Printable([Reg, SubReg, this](raw_ostream &OS) {
233 if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
234 OS << llvm::printReg(Reg, TRI);
235 return;
236 }
237 if (!VRM->isAssignedReg(Reg))
238 OS << "<unassigned> " << llvm::printReg(Reg, TRI);
239 else
240 OS << llvm::printReg(Reg, TRI) << '('
241 << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
242 if (SubReg)
243 OS << ':' << TRI->getSubRegIndexName(SubReg);
244 });
245 }
246
247 static Printable printBank(unsigned Bank) {
248 return Printable([Bank](raw_ostream &OS) {
249 OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
250 });
251 }
252
253 static void dumpFreeBanks(unsigned FreeBanks) {
254 for (unsigned L = 0; L < NUM_BANKS; ++L)
255 if (FreeBanks & (1 << L))
256 dbgs() << printBank(L) << ' ';
257 }
258#endif
259};
260
261} // End anonymous namespace.
262
263INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
264 false, false)
265INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
266INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
267INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
268INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
269INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
270 false, false)
271
272
273char GCNRegBankReassign::ID = 0;
274
275char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
276
277unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
278 assert (TargetRegisterInfo::isPhysicalRegister(Reg));
279
280 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
281 unsigned Size = TRI->getRegSizeInBits(*RC);
282 if (Size > 32)
283 Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
284
285 if (TRI->hasVGPRs(RC)) {
286 Reg -= AMDGPU::VGPR0;
287 return Reg % NUM_VGPR_BANKS;
288 }
289
290 Reg = TRI->getEncodingValue(Reg) / 2;
291 return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
292}
293
294unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
295 int Bank) {
296 if (TargetRegisterInfo::isVirtualRegister(Reg)) {
297 if (!VRM->isAssignedReg(Reg))
298 return 0;
299
300 Reg = VRM->getPhys(Reg);
301 if (!Reg)
302 return 0;
303 if (SubReg)
304 Reg = TRI->getSubReg(Reg, SubReg);
305 }
306
307 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
308 unsigned Size = TRI->getRegSizeInBits(*RC) / 32;
309 if (Size > 1)
310 Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
311
312 if (TRI->hasVGPRs(RC)) {
313 // VGPRs have 4 banks assigned in a round-robin fashion.
314 Reg -= AMDGPU::VGPR0;
315 unsigned Mask = (1 << Size) - 1;
316 unsigned Used = 0;
317 // Bitmask lacks an extract method
318 for (unsigned I = 0; I < Size; ++I)
319 if (RegsUsed.test(Reg + I))
320 Used |= 1 << I;
321 RegsUsed.set(Reg, Reg + Size);
322 Mask &= ~Used;
323 Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank);
324 return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
325 }
326
327 // SGPRs have 8 banks holding 2 consequitive registers each.
328 Reg = TRI->getEncodingValue(Reg) / 2;
329 unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
330 if (Reg + StartBit >= RegsUsed.size())
331 return 0;
332
333 if (Size > 1)
334 Size /= 2;
335 unsigned Mask = (1 << Size) - 1;
336 unsigned Used = 0;
337 for (unsigned I = 0; I < Size; ++I)
338 if (RegsUsed.test(StartBit + Reg + I))
339 Used |= 1 << I;
340 RegsUsed.set(StartBit + Reg, StartBit + Reg + Size);
341 Mask &= ~Used;
342 Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS
343 : unsigned(Bank - SGPR_BANK_OFFSET);
344 Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
345 // Reserve 4 bank ids for VGPRs.
346 return Mask << SGPR_BANK_OFFSET;
347}
348
349unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI,
350 unsigned& UsedBanks,
351 unsigned Reg,
352 int Bank) {
353 unsigned StallCycles = 0;
354 UsedBanks = 0;
355
356 if (MI.isDebugValue())
357 return 0;
358
359 RegsUsed.reset();
360 OperandMasks.clear();
361 for (const auto& Op : MI.explicit_uses()) {
362 // Undef can be assigned to any register, so two vregs can be assigned
363 // the same phys reg within the same instruction.
364 if (!Op.isReg() || Op.isUndef())
365 continue;
366
367 unsigned R = Op.getReg();
Stanislav Mekhanoshine67cc382019-07-11 21:19:33 +0000368 if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
369 continue;
370
Stanislav Mekhanoshin3b7925f2019-05-01 16:49:31 +0000371 unsigned ShiftedBank = Bank;
372
373 if (Bank != -1 && R == Reg && Op.getSubReg()) {
374 unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger();
375 if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) {
376 // If a register spans all banks we cannot shift it to avoid conflict.
377 if (countPopulation(LM) >= NUM_VGPR_BANKS)
378 continue;
379 ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS;
380 } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) {
381 // If a register spans all banks we cannot shift it to avoid conflict.
382 if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS)
383 continue;
384 ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET +
385 (countTrailingZeros(LM) >> 1)) %
386 NUM_SGPR_BANKS;
387 }
388 }
389
390 unsigned Mask = getRegBankMask(R, Op.getSubReg(),
391 (Reg == R) ? ShiftedBank : -1);
392 StallCycles += countPopulation(UsedBanks & Mask);
393 UsedBanks |= Mask;
394 OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
395 }
396
397 return StallCycles;
398}
399
400unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
401 unsigned Reg1,
402 unsigned Reg2,
403 unsigned StallCycles) const
404{
405 unsigned Defs = 0;
406 MachineBasicBlock::const_instr_iterator Def(MI.getIterator());
407 MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin());
408 for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) {
409 if (MI.isDebugInstr())
410 continue;
411 --Def;
412 if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
413 continue;
414 if (Def->modifiesRegister(Reg1, TRI))
415 Defs |= 1;
416 if (Def->modifiesRegister(Reg2, TRI))
417 Defs |= 2;
418 }
419 return countPopulation(Defs);
420}
421
422bool GCNRegBankReassign::isReassignable(unsigned Reg) const {
423 if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
424 return false;
425
426 const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
427
428 unsigned PhysReg = VRM->getPhys(Reg);
429
430 if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
431 return false;
432
433 for (auto U : MRI->use_nodbg_operands(Reg)) {
434 if (U.isImplicit())
435 return false;
436 const MachineInstr *UseInst = U.getParent();
437 if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
438 return false;
439 }
440
441 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
442 if (TRI->hasVGPRs(RC))
443 return true;
444
445 unsigned Size = TRI->getRegSizeInBits(*RC);
446 if (Size > 32)
447 PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
448
449 return AMDGPU::SGPR_32RegClass.contains(PhysReg);
450}
451
452unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
453 unsigned UsedBanks) const {
454 unsigned Size = countPopulation(Mask);
455 unsigned FreeBanks = 0;
456 unsigned Bank = findFirstSet(Mask);
457
458 UsedBanks &= ~Mask;
459
460 // Find free VGPR banks
461 if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) {
462 for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) {
463 if (Bank == I)
464 continue;
465 unsigned NewMask = ((1 << Size) - 1) << I;
466 NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
467 if (!(UsedBanks & NewMask))
468 FreeBanks |= 1 << I;
469 }
470 return FreeBanks;
471 }
472
473 // Find free SGPR banks
474 // SGPR tuples must be aligned, so step is size in banks it
475 // crosses.
476 Bank -= SGPR_BANK_OFFSET;
477 for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) {
478 if (Bank == I)
479 continue;
480 unsigned NewMask = ((1 << Size) - 1) << I;
481 NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
482 if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET)))
483 FreeBanks |= (1 << SGPR_BANK_OFFSET) << I;
484 }
485
486 return FreeBanks;
487}
488
489unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg,
490 unsigned SubReg,
491 unsigned Mask,
492 unsigned UsedBanks) const {
493 if (!isReassignable(Reg))
494 return 0;
495
496 unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
497
498 unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger();
499 if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) {
500 unsigned Shift = countTrailingZeros(LM);
501 if (Shift >= NUM_VGPR_BANKS)
502 return 0;
503 unsigned VB = FreeBanks & VGPR_BANK_MASK;
504 FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
505 VGPR_BANK_MASK;
506 } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) {
507 unsigned Shift = countTrailingZeros(LM) >> 1;
508 if (Shift >= NUM_SGPR_BANKS)
509 return 0;
510 unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
511 FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) &
512 SGPR_BANK_SHIFTED_MASK;
513 FreeBanks <<= SGPR_BANK_OFFSET;
514 }
515
516 LLVM_DEBUG(if (FreeBanks) {
517 dbgs() << "Potential reassignments of " << printReg(Reg, SubReg)
518 << " to banks: "; dumpFreeBanks(FreeBanks);
519 dbgs() << '\n'; });
520
521 return FreeBanks;
522}
523
524void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
525 unsigned UsedBanks,
526 unsigned StallCycles) {
527 LLVM_DEBUG(MI.dump());
528
529 if (!StallCycles)
530 return;
531
532 LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n');
533
534 for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) {
535 for (unsigned J = I + 1; J != E; ++J) {
536 if (!(OperandMasks[I].Mask & OperandMasks[J].Mask))
537 continue;
538
539 unsigned Reg1 = OperandMasks[I].Reg;
540 unsigned Reg2 = OperandMasks[J].Reg;
541 unsigned SubReg1 = OperandMasks[I].SubReg;
542 unsigned SubReg2 = OperandMasks[J].SubReg;
543 unsigned Mask1 = OperandMasks[I].Mask;
544 unsigned Mask2 = OperandMasks[J].Mask;
545 unsigned Size1 = countPopulation(Mask1);
546 unsigned Size2 = countPopulation(Mask2);
547
548 LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) <<
549 " and " << printReg(Reg2, SubReg2) << '\n');
550
551 unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles);
552 Weight += MLI->getLoopDepth(MI.getParent()) * 10;
553
554 LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n');
555
556 unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
557 unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
558 if (FreeBanks1)
559 Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight
560 + ((Size2 > Size1) ? 1 : 0)));
561 if (FreeBanks2)
562 Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight
563 + ((Size1 > Size2) ? 1 : 0)));
564 }
565 }
566}
567
568unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
569 unsigned Reg, int Bank,
570 bool Collect) {
571 unsigned TotalStallCycles = 0;
572 unsigned UsedBanks = 0;
573 SmallSet<const MachineInstr *, 16> Visited;
574
575 for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) {
576 if (MI.isBundle())
577 continue;
578 if (!Visited.insert(&MI).second)
579 continue;
580 unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank);
581 TotalStallCycles += StallCycles;
582 if (Collect)
583 collectCandidates(MI, UsedBanks, StallCycles);
584 }
585
586 return TotalStallCycles;
587}
588
589unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI,
590 unsigned Bank) const {
591 const TargetRegisterClass *RC = MRI->getRegClass(LI.reg);
592 unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
593 : MaxNumSGPRs;
594 unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0
595 : AMDGPU::SGPR0);
596
597 for (unsigned Reg : RC->getRegisters()) {
598 // Check occupancy limit.
599 if (TRI->isSubRegisterEq(Reg, MaxReg))
600 break;
601
602 if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank)
603 continue;
604
605 for (unsigned I = 0; CSRegs[I]; ++I)
606 if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
607 !LRM->isPhysRegUsed(CSRegs[I]))
608 return AMDGPU::NoRegister;
609
610 LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n');
611
612 if (!LRM->checkInterference(LI, Reg))
613 return Reg;
614 }
615
616 return AMDGPU::NoRegister;
617}
618
619unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
620 if (!LIS->hasInterval(C.Reg))
621 return 0;
622
623 LiveInterval &LI = LIS->getInterval(C.Reg);
624 LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump();
625 LI.dump());
626
627 // For each candidate bank walk all instructions in the range of live
628 // interval and check if replacing the register with one belonging to
629 // the candidate bank reduces conflicts.
630
631 unsigned OrigStalls = computeStallCycles(C.Reg);
632 LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n');
633 if (!OrigStalls)
634 return 0;
635
636 struct BankStall {
637 BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
638 bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; }
639 unsigned Bank;
640 unsigned Stalls;
641 };
642 SmallVector<BankStall, 8> BankStalls;
643
644 for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
645 if (C.FreeBanks & (1 << Bank)) {
646 LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
647 unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank);
648 if (Stalls < OrigStalls) {
649 LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
650 << Stalls << '\n');
651 BankStalls.push_back(BankStall((unsigned)Bank, Stalls));
652 }
653 }
654 }
Galina Kistanovaed49f6d2019-05-22 20:42:56 +0000655 std::sort(BankStalls.begin(), BankStalls.end());
Stanislav Mekhanoshin3b7925f2019-05-01 16:49:31 +0000656
657 unsigned OrigReg = VRM->getPhys(C.Reg);
658 LRM->unassign(LI);
659 while (!BankStalls.empty()) {
660 BankStall BS = BankStalls.pop_back_val();
661 unsigned Reg = scavengeReg(LI, BS.Bank);
662 if (Reg == AMDGPU::NoRegister) {
663 LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
664 << '\n');
665 continue;
666 }
667 LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg)
668 << (LRM->isPhysRegUsed(Reg) ? "" : " (new)")
669 << " in bank " << printBank(BS.Bank) << '\n');
670
671 LRM->assign(LI, Reg);
672
673 LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n');
674
675 return OrigStalls - BS.Stalls;
676 }
677 LRM->assign(LI, OrigReg);
678
679 return 0;
680}
681
682unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
683 bool Collect) {
684 unsigned TotalStallCycles = 0;
685
686 for (MachineBasicBlock &MBB : MF) {
687
688 LLVM_DEBUG(if (Collect) {
689 if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber();
690 else dbgs() << MBB.getName(); dbgs() << ":\n";
691 });
692
693 for (MachineInstr &MI : MBB.instrs()) {
694 if (MI.isBundle())
695 continue; // we analyze the instructions inside the bundle individually
696
697 unsigned UsedBanks = 0;
698 unsigned StallCycles = analyzeInst(MI, UsedBanks);
699
700 if (Collect)
701 collectCandidates(MI, UsedBanks, StallCycles);
702
703 TotalStallCycles += StallCycles;
704 }
705
706 LLVM_DEBUG(if (Collect) { dbgs() << '\n'; });
707 }
708
709 return TotalStallCycles;
710}
711
712void GCNRegBankReassign::removeCandidates(unsigned Reg) {
713 Candidates.remove_if([Reg, this](const Candidate& C) {
714 return C.MI->readsRegister(Reg, TRI);
715 });
716}
717
718bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
719 unsigned OriginalCycles,
720 unsigned CyclesSaved) {
721 unsigned StallCycles = collectCandidates(MF, false);
722 LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles
723 << " stall cycles left\n");
724 return StallCycles + CyclesSaved == OriginalCycles;
725}
726
727bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
728 ST = &MF.getSubtarget<GCNSubtarget>();
729 if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction()))
730 return false;
731
732 MRI = &MF.getRegInfo();
733 TRI = ST->getRegisterInfo();
734 MLI = &getAnalysis<MachineLoopInfo>();
735 VRM = &getAnalysis<VirtRegMap>();
736 LRM = &getAnalysis<LiveRegMatrix>();
737 LIS = &getAnalysis<LiveIntervals>();
738
739 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
740 unsigned Occupancy = MFI->getOccupancy();
741 MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
742 MaxNumSGPRs = ST->getMaxNumSGPRs(MF);
743 MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs);
744 MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
745
Matt Arsenaulte0b84432019-06-26 13:39:29 +0000746 CSRegs = MRI->getCalleeSavedRegs();
Stanislav Mekhanoshin3b7925f2019-05-01 16:49:31 +0000747
748 RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() +
749 TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1);
750
751 LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName()
752 << '\n');
753
754 unsigned StallCycles = collectCandidates(MF);
755 NumStallsDetected += StallCycles;
756
757 LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
758 "function " << MF.getName() << '\n');
759
760 Candidates.sort();
761
762 LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
763 for (auto C : Candidates) C.dump(this);
764 dbgs() << "\n\n");
765
766 unsigned CyclesSaved = 0;
767 while (!Candidates.empty()) {
768 Candidate C = Candidates.back();
769 unsigned LocalCyclesSaved = tryReassign(C);
770 CyclesSaved += LocalCyclesSaved;
771
772 if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
773 report_fatal_error("RegBank reassign stall cycles verification failed.");
774
775 Candidates.pop_back();
776 if (LocalCyclesSaved) {
777 removeCandidates(C.Reg);
778 computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true);
779 Candidates.sort();
780
781 LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
782 for (auto C : Candidates)
783 C.dump(this);
784 dbgs() << "\n\n");
785 }
786 }
787 NumStallsRecovered += CyclesSaved;
788
789 LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
790 << " cycles saved in function " << MF.getName() << '\n');
791
792 Candidates.clear();
793
794 if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
795 report_fatal_error("RegBank reassign stall cycles verification failed.");
796
797 RegsUsed.clear();
798
799 return CyclesSaved > 0;
800}