//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
/// in NSA image instructions. The SIShrinkInstructions pass will later replace
/// NSA encodings with sequential ones where possible.
///
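/// For illustration (hypothetical registers, operands abbreviated):
///
///   NSA form:        image_sample v[0:3], [v4, v9, v6], s[0:7], s[8:11]
///   sequential form: image_sample v[0:3], v[4:6],       s[0:7], s[8:11]
///
/// If the three address operands can be reassigned to v4, v5 and v6, the
/// shorter non-NSA encoding becomes available.
///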
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-nsa-reassign"

STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");

namespace {

class GCNNSAReassign : public MachineFunctionPass {
public:
  static char ID;

  GCNNSAReassign() : MachineFunctionPass(ID) {
    initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN NSA Reassign"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addRequired<VirtRegMap>();
    AU.addRequired<LiveRegMatrix>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

private:
  typedef enum {
    NOT_NSA,        // Not an NSA instruction.
    FIXED,          // NSA which we cannot modify.
    NON_CONTIGUOUS, // NSA with non-sequential address which we can try
                    // to optimize.
    CONTIGUOUS      // NSA with all sequential address registers.
  } NSA_Status;

  const GCNSubtarget *ST;

  const MachineRegisterInfo *MRI;

  const SIRegisterInfo *TRI;

  VirtRegMap *VRM;

  LiveRegMatrix *LRM;

  LiveIntervals *LIS;

  unsigned MaxNumVGPRs;

  const MCPhysReg *CSRegs;

  NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;

  bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                          unsigned StartReg) const;

  bool canAssign(unsigned StartReg, unsigned NumRegs) const;

  bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                    false, false)


char GCNNSAReassign::ID = 0;

char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;

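// Attempt to assign each interval in \p Intervals to consecutive physical
// registers starting at \p StartReg. Any current assignments are undone
// first; if some interval would interfere with its target register, return
// false and leave the intervals unassigned (the caller is responsible for
// restoring the original allocation).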
bool
GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
                                   unsigned StartReg) const {
  unsigned NumRegs = Intervals.size();

  for (unsigned N = 0; N < NumRegs; ++N)
    if (VRM->hasPhys(Intervals[N]->reg))
      LRM->unassign(*Intervals[N]);

  for (unsigned N = 0; N < NumRegs; ++N)
    if (LRM->checkInterference(*Intervals[N], StartReg + N))
      return false;

  for (unsigned N = 0; N < NumRegs; ++N)
    LRM->assign(*Intervals[N], StartReg + N);

  return true;
}

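// Check that the \p NumRegs consecutive physical registers starting at
// \p StartReg are all allocatable and do not overlap an otherwise unused
// callee-saved register, which would have to be spilled if clobbered.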
bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
  for (unsigned N = 0; N < NumRegs; ++N) {
    unsigned Reg = StartReg + N;
    if (!MRI->isAllocatable(Reg))
      return false;

    for (unsigned I = 0; CSRegs[I]; ++I)
      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
          !LRM->isPhysRegUsed(CSRegs[I]))
        return false;
  }

  return true;
}

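// Scan the VGPR file for a contiguous range of Intervals.size() registers
// that every interval can be reassigned to, returning true on the first
// successful assignment.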
bool
GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
  unsigned NumRegs = Intervals.size();

  if (NumRegs > MaxNumVGPRs)
    return false;
  unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;

  for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
    if (!canAssign(Reg, NumRegs))
      continue;

    if (tryAssignRegisters(Intervals, Reg))
      return true;
  }

  return false;
}

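// Classify \p MI: not an NSA instruction at all, an NSA instruction that
// must be left as is, or an NSA instruction whose address registers are
// contiguous or non-contiguous. With \p Fast set, only the physical register
// assignment is inspected and the more expensive safety checks on the
// virtual registers are skipped.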
GCNNSAReassign::NSA_Status
GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return NSA_Status::NOT_NSA;

  int VAddr0Idx =
    AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);

  unsigned VgprBase = 0;
  bool NSA = false;
  for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
    Register Reg = Op.getReg();
    if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
      return NSA_Status::FIXED;

    Register PhysReg = VRM->getPhys(Reg);

    if (!Fast) {
      if (!PhysReg)
        return NSA_Status::FIXED;

      // Bail if the address is not a VGPR32. It should be possible to extend
      // the optimization to work with subregs of wider register tuples, but
      // the logic to find free registers would be much more complicated, with
      // far fewer chances of success. It seems reasonable to assume that in
      // most cases a tuple is used because a vector variable contains
      // different parts of an address, which is either already consecutive or
      // cannot be reassigned if not. If needed, it is better to rely on the
      // register coalescer to process such address tuples.
      if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
        return NSA_Status::FIXED;

      const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);

      if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
        return NSA_Status::FIXED;

      for (auto U : MRI->use_nodbg_operands(Reg)) {
        if (U.isImplicit())
          return NSA_Status::FIXED;
        const MachineInstr *UseInst = U.getParent();
        if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
          return NSA_Status::FIXED;
      }

      if (!LIS->hasInterval(Reg))
        return NSA_Status::FIXED;
    }

    if (I == 0)
      VgprBase = PhysReg;
    else if (VgprBase + I != PhysReg)
      NSA = true;
  }

  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
}

bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (ST->getGeneration() < GCNSubtarget::GFX10)
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();
  VRM = &getAnalysis<VirtRegMap>();
  LRM = &getAnalysis<LiveRegMatrix>();
  LIS = &getAnalysis<LiveIntervals>();

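  // Cap the search at the VGPR budget implied by the function's occupancy so
  // that reassignment cannot raise register pressure past it.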
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();

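  // First pass: collect all NSA instructions, recording for each one whether
  // its address registers are already contiguous.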
  using Candidate = std::pair<const MachineInstr*, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      switch (CheckNSA(MI)) {
      default:
        continue;
      case NSA_Status::CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, true));
        break;
      case NSA_Status::NON_CONTIGUOUS:
        Candidates.push_back(std::make_pair(&MI, false));
        ++NumNSAInstructions;
        break;
      }
    }
  }

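  // Second pass: for every non-contiguous candidate, try to move its address
  // registers into a contiguous VGPR range.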
  bool Changed = false;
  for (auto &C : Candidates) {
    if (C.second)
      continue;

    const MachineInstr *MI = C.first;
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // The address already happens to be contiguous; nothing to do.
      C.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<unsigned, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
    for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      Register Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::find(Intervals, LI) != Intervals.end()) {
        // The same register is used more than once; such an address cannot
        // be made sequential.
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));
      MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
      MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
    }

    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for (auto *LI : Intervals)
                 dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI);
               dbgs() << '\n');

    bool Success = scavengeRegs(Intervals);
    if (!Success) {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation.
        continue;
    } else {
      // Check we did not make it worse for other instructions.
      auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
                                [this](const Candidate &C, SlotIndex I) {
                                  return LIS->getInstructionIndex(*C.first) < I;
                                });
      for (auto E = Candidates.end(); Success && I != E &&
           LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
        }
      }
    }
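    // The reassignment failed or would have made an already-contiguous
    // candidate non-contiguous; restore the original allocation.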
    if (!Success) {
      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        if (VRM->hasPhys(Intervals[I]->reg))
          LRM->unassign(*Intervals[I]);

      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    C.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t ["
                 << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI)
                 << " : "
                 << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI)
                 << "]\n");
    Changed = true;
  }

  return Changed;
}