Stanislav Mekhanoshin | c29d491 | 2019-05-01 16:40:49 +0000 | [diff] [blame] | 1 | //===-- GCNNSAReassign.cpp - Reassign registers in NSA unstructions -------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | /// \file |
| 10 | /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential |
| 11 | /// in NSA image instructions. Later SIShrinkInstructions pass will relace NSA |
| 12 | /// with sequential versions where possible. |
| 13 | /// |
| 14 | //===----------------------------------------------------------------------===// |
| 15 | |
| 16 | #include "AMDGPU.h" |
| 17 | #include "AMDGPUSubtarget.h" |
| 18 | #include "SIInstrInfo.h" |
| 19 | #include "SIMachineFunctionInfo.h" |
| 20 | #include "llvm/ADT/Statistic.h" |
| 21 | #include "llvm/CodeGen/LiveInterval.h" |
| 22 | #include "llvm/CodeGen/LiveIntervals.h" |
| 23 | #include "llvm/CodeGen/LiveRegMatrix.h" |
| 24 | #include "llvm/CodeGen/MachineFunctionPass.h" |
| 25 | #include "llvm/CodeGen/VirtRegMap.h" |
Reid Kleckner | 05da2fe | 2019-11-13 13:15:01 -0800 | [diff] [blame^] | 26 | #include "llvm/InitializePasses.h" |
Stanislav Mekhanoshin | c29d491 | 2019-05-01 16:40:49 +0000 | [diff] [blame] | 27 | #include "llvm/Support/MathExtras.h" |
| 28 | #include <algorithm> |
| 29 | |
| 30 | using namespace llvm; |
| 31 | |
| 32 | #define DEBUG_TYPE "amdgpu-nsa-reassign" |
| 33 | |
| 34 | STATISTIC(NumNSAInstructions, |
| 35 | "Number of NSA instructions with non-sequential address found"); |
| 36 | STATISTIC(NumNSAConverted, |
| 37 | "Number of NSA instructions changed to sequential"); |
| 38 | |
| 39 | namespace { |
| 40 | |
| 41 | class GCNNSAReassign : public MachineFunctionPass { |
| 42 | public: |
| 43 | static char ID; |
| 44 | |
| 45 | GCNNSAReassign() : MachineFunctionPass(ID) { |
| 46 | initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry()); |
| 47 | } |
| 48 | |
| 49 | bool runOnMachineFunction(MachineFunction &MF) override; |
| 50 | |
| 51 | StringRef getPassName() const override { return "GCN NSA Reassign"; } |
| 52 | |
| 53 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
| 54 | AU.addRequired<LiveIntervals>(); |
| 55 | AU.addRequired<VirtRegMap>(); |
| 56 | AU.addRequired<LiveRegMatrix>(); |
| 57 | AU.setPreservesAll(); |
| 58 | MachineFunctionPass::getAnalysisUsage(AU); |
| 59 | } |
| 60 | |
| 61 | private: |
| 62 | typedef enum { |
| 63 | NOT_NSA, // Not an NSA instruction |
| 64 | FIXED, // NSA which we cannot modify |
| 65 | NON_CONTIGUOUS, // NSA with non-sequential address which we can try |
| 66 | // to optimize. |
| 67 | CONTIGUOUS // NSA with all sequential address registers |
| 68 | } NSA_Status; |
| 69 | |
| 70 | const GCNSubtarget *ST; |
| 71 | |
| 72 | const MachineRegisterInfo *MRI; |
| 73 | |
| 74 | const SIRegisterInfo *TRI; |
| 75 | |
| 76 | VirtRegMap *VRM; |
| 77 | |
| 78 | LiveRegMatrix *LRM; |
| 79 | |
| 80 | LiveIntervals *LIS; |
| 81 | |
| 82 | unsigned MaxNumVGPRs; |
| 83 | |
| 84 | const MCPhysReg *CSRegs; |
| 85 | |
| 86 | NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const; |
| 87 | |
| 88 | bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals, |
| 89 | unsigned StartReg) const; |
| 90 | |
| 91 | bool canAssign(unsigned StartReg, unsigned NumRegs) const; |
| 92 | |
| 93 | bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const; |
| 94 | }; |
| 95 | |
| 96 | } // End anonymous namespace. |
| 97 | |
| 98 | INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign", |
| 99 | false, false) |
| 100 | INITIALIZE_PASS_DEPENDENCY(LiveIntervals) |
| 101 | INITIALIZE_PASS_DEPENDENCY(VirtRegMap) |
| 102 | INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) |
| 103 | INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign", |
| 104 | false, false) |
| 105 | |
| 106 | |
| 107 | char GCNNSAReassign::ID = 0; |
| 108 | |
| 109 | char &llvm::GCNNSAReassignID = GCNNSAReassign::ID; |
| 110 | |
| 111 | bool |
| 112 | GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals, |
| 113 | unsigned StartReg) const { |
| 114 | unsigned NumRegs = Intervals.size(); |
| 115 | |
| 116 | for (unsigned N = 0; N < NumRegs; ++N) |
| 117 | if (VRM->hasPhys(Intervals[N]->reg)) |
| 118 | LRM->unassign(*Intervals[N]); |
| 119 | |
| 120 | for (unsigned N = 0; N < NumRegs; ++N) |
| 121 | if (LRM->checkInterference(*Intervals[N], StartReg + N)) |
| 122 | return false; |
| 123 | |
| 124 | for (unsigned N = 0; N < NumRegs; ++N) |
| 125 | LRM->assign(*Intervals[N], StartReg + N); |
| 126 | |
| 127 | return true; |
| 128 | } |
| 129 | |
| 130 | bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const { |
| 131 | for (unsigned N = 0; N < NumRegs; ++N) { |
| 132 | unsigned Reg = StartReg + N; |
| 133 | if (!MRI->isAllocatable(Reg)) |
| 134 | return false; |
| 135 | |
| 136 | for (unsigned I = 0; CSRegs[I]; ++I) |
| 137 | if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && |
| 138 | !LRM->isPhysRegUsed(CSRegs[I])) |
| 139 | return false; |
| 140 | } |
| 141 | |
| 142 | return true; |
| 143 | } |
| 144 | |
| 145 | bool |
| 146 | GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const { |
| 147 | unsigned NumRegs = Intervals.size(); |
| 148 | |
| 149 | if (NumRegs > MaxNumVGPRs) |
| 150 | return false; |
| 151 | unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0; |
| 152 | |
| 153 | for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) { |
| 154 | if (!canAssign(Reg, NumRegs)) |
| 155 | continue; |
| 156 | |
| 157 | if (tryAssignRegisters(Intervals, Reg)) |
| 158 | return true; |
| 159 | } |
| 160 | |
| 161 | return false; |
| 162 | } |
| 163 | |
| 164 | GCNNSAReassign::NSA_Status |
| 165 | GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { |
| 166 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); |
| 167 | if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) |
| 168 | return NSA_Status::NOT_NSA; |
| 169 | |
| 170 | int VAddr0Idx = |
| 171 | AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); |
| 172 | |
| 173 | unsigned VgprBase = 0; |
| 174 | bool NSA = false; |
| 175 | for (unsigned I = 0; I < Info->VAddrDwords; ++I) { |
| 176 | const MachineOperand &Op = MI.getOperand(VAddr0Idx + I); |
Daniel Sanders | 0c47611 | 2019-08-15 19:22:08 +0000 | [diff] [blame] | 177 | Register Reg = Op.getReg(); |
Daniel Sanders | 2bea69b | 2019-08-01 23:27:28 +0000 | [diff] [blame] | 178 | if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) |
Stanislav Mekhanoshin | c29d491 | 2019-05-01 16:40:49 +0000 | [diff] [blame] | 179 | return NSA_Status::FIXED; |
| 180 | |
Daniel Sanders | 0c47611 | 2019-08-15 19:22:08 +0000 | [diff] [blame] | 181 | Register PhysReg = VRM->getPhys(Reg); |
Stanislav Mekhanoshin | c29d491 | 2019-05-01 16:40:49 +0000 | [diff] [blame] | 182 | |
| 183 | if (!Fast) { |
| 184 | if (!PhysReg) |
| 185 | return NSA_Status::FIXED; |
| 186 | |
| 187 | // Bail if address is not a VGPR32. That should be possible to extend the |
| 188 | // optimization to work with subregs of a wider register tuples, but the |
| 189 | // logic to find free registers will be much more complicated with much |
| 190 | // less chances for success. That seems reasonable to assume that in most |
| 191 | // cases a tuple is used because a vector variable contains different |
| 192 | // parts of an address and it is either already consequitive or cannot |
| 193 | // be reassigned if not. If needed it is better to rely on register |
| 194 | // coalescer to process such address tuples. |
| 195 | if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg()) |
| 196 | return NSA_Status::FIXED; |
| 197 | |
| 198 | const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); |
| 199 | |
| 200 | if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) |
| 201 | return NSA_Status::FIXED; |
| 202 | |
| 203 | for (auto U : MRI->use_nodbg_operands(Reg)) { |
| 204 | if (U.isImplicit()) |
| 205 | return NSA_Status::FIXED; |
| 206 | const MachineInstr *UseInst = U.getParent(); |
| 207 | if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) |
| 208 | return NSA_Status::FIXED; |
| 209 | } |
| 210 | |
| 211 | if (!LIS->hasInterval(Reg)) |
| 212 | return NSA_Status::FIXED; |
| 213 | } |
| 214 | |
| 215 | if (I == 0) |
| 216 | VgprBase = PhysReg; |
| 217 | else if (VgprBase + I != PhysReg) |
| 218 | NSA = true; |
| 219 | } |
| 220 | |
| 221 | return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS; |
| 222 | } |
| 223 | |
| 224 | bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { |
| 225 | ST = &MF.getSubtarget<GCNSubtarget>(); |
| 226 | if (ST->getGeneration() < GCNSubtarget::GFX10) |
| 227 | return false; |
| 228 | |
| 229 | MRI = &MF.getRegInfo(); |
| 230 | TRI = ST->getRegisterInfo(); |
| 231 | VRM = &getAnalysis<VirtRegMap>(); |
| 232 | LRM = &getAnalysis<LiveRegMatrix>(); |
| 233 | LIS = &getAnalysis<LiveIntervals>(); |
| 234 | |
| 235 | const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| 236 | MaxNumVGPRs = ST->getMaxNumVGPRs(MF); |
| 237 | MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs); |
Matt Arsenault | e0b8443 | 2019-06-26 13:39:29 +0000 | [diff] [blame] | 238 | CSRegs = MRI->getCalleeSavedRegs(); |
Stanislav Mekhanoshin | c29d491 | 2019-05-01 16:40:49 +0000 | [diff] [blame] | 239 | |
| 240 | using Candidate = std::pair<const MachineInstr*, bool>; |
| 241 | SmallVector<Candidate, 32> Candidates; |
| 242 | for (const MachineBasicBlock &MBB : MF) { |
| 243 | for (const MachineInstr &MI : MBB) { |
| 244 | switch (CheckNSA(MI)) { |
| 245 | default: |
| 246 | continue; |
| 247 | case NSA_Status::CONTIGUOUS: |
| 248 | Candidates.push_back(std::make_pair(&MI, true)); |
| 249 | break; |
| 250 | case NSA_Status::NON_CONTIGUOUS: |
| 251 | Candidates.push_back(std::make_pair(&MI, false)); |
| 252 | ++NumNSAInstructions; |
| 253 | break; |
| 254 | } |
| 255 | } |
| 256 | } |
| 257 | |
| 258 | bool Changed = false; |
| 259 | for (auto &C : Candidates) { |
| 260 | if (C.second) |
| 261 | continue; |
| 262 | |
| 263 | const MachineInstr *MI = C.first; |
| 264 | if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) { |
| 265 | // Already happen to be fixed. |
| 266 | C.second = true; |
| 267 | ++NumNSAConverted; |
| 268 | continue; |
| 269 | } |
| 270 | |
| 271 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode()); |
| 272 | int VAddr0Idx = |
| 273 | AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0); |
| 274 | |
| 275 | SmallVector<LiveInterval *, 16> Intervals; |
| 276 | SmallVector<unsigned, 16> OrigRegs; |
| 277 | SlotIndex MinInd, MaxInd; |
| 278 | for (unsigned I = 0; I < Info->VAddrDwords; ++I) { |
| 279 | const MachineOperand &Op = MI->getOperand(VAddr0Idx + I); |
Daniel Sanders | 0c47611 | 2019-08-15 19:22:08 +0000 | [diff] [blame] | 280 | Register Reg = Op.getReg(); |
Stanislav Mekhanoshin | c29d491 | 2019-05-01 16:40:49 +0000 | [diff] [blame] | 281 | LiveInterval *LI = &LIS->getInterval(Reg); |
| 282 | if (llvm::find(Intervals, LI) != Intervals.end()) { |
| 283 | // Same register used, unable to make sequential |
| 284 | Intervals.clear(); |
| 285 | break; |
| 286 | } |
| 287 | Intervals.push_back(LI); |
| 288 | OrigRegs.push_back(VRM->getPhys(Reg)); |
| 289 | MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex(); |
| 290 | MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex(); |
| 291 | } |
| 292 | |
| 293 | if (Intervals.empty()) |
| 294 | continue; |
| 295 | |
| 296 | LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI |
| 297 | << "\tOriginal allocation:\t"; |
| 298 | for(auto *LI : Intervals) |
| 299 | dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI); |
| 300 | dbgs() << '\n'); |
| 301 | |
| 302 | bool Success = scavengeRegs(Intervals); |
| 303 | if (!Success) { |
| 304 | LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n"); |
| 305 | if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation. |
| 306 | continue; |
| 307 | } else { |
| 308 | // Check we did not make it worse for other instructions. |
| 309 | auto I = std::lower_bound(Candidates.begin(), &C, MinInd, |
| 310 | [this](const Candidate &C, SlotIndex I) { |
| 311 | return LIS->getInstructionIndex(*C.first) < I; |
| 312 | }); |
| 313 | for (auto E = Candidates.end(); Success && I != E && |
| 314 | LIS->getInstructionIndex(*I->first) < MaxInd; ++I) { |
| 315 | if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) { |
| 316 | Success = false; |
| 317 | LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first); |
| 318 | } |
| 319 | } |
| 320 | } |
| 321 | |
| 322 | if (!Success) { |
| 323 | for (unsigned I = 0; I < Info->VAddrDwords; ++I) |
| 324 | if (VRM->hasPhys(Intervals[I]->reg)) |
| 325 | LRM->unassign(*Intervals[I]); |
| 326 | |
| 327 | for (unsigned I = 0; I < Info->VAddrDwords; ++I) |
| 328 | LRM->assign(*Intervals[I], OrigRegs[I]); |
| 329 | |
| 330 | continue; |
| 331 | } |
| 332 | |
| 333 | C.second = true; |
| 334 | ++NumNSAConverted; |
| 335 | LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t [" |
| 336 | << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI) |
| 337 | << " : " |
| 338 | << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI) |
| 339 | << "]\n"); |
| 340 | Changed = true; |
| 341 | } |
| 342 | |
| 343 | return Changed; |
| 344 | } |