//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass adds instructions to enable whole quad mode for pixel
/// shaders.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineDominanceFrontier.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

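// Execution mask states: an instruction or block may require whole quad mode
// (WQM), the exact mask of live lanes (Exact), or neither. The flags are
// combined as a bitmask during the analysis below.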
enum {
  StateWQM = 0x1,
  StateExact = 0x2,
};

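// Per-instruction analysis results: Needs is the state this instruction itself
// must run in; OutNeeds is the union of states required by instructions that
// execute after it.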
struct InstrInfo {
  char Needs = 0;
  char OutNeeds = 0;
};

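// Per-block analysis results: Needs is the union of the states required by the
// block's instructions; InNeeds collects the states that must be available
// when the block is entered, OutNeeds the states that must be available when
// it is left.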
struct BlockInfo {
  char Needs = 0;
  char InNeeds = 0;
  char OutNeeds = 0;
};

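// A worklist entry for the propagation: holds either a basic block or a single
// instruction whose flags may still need to be propagated.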
struct WorkItem {
  const MachineBasicBlock *MBB = nullptr;
  const MachineInstr *MI = nullptr;

  WorkItem() {}
  WorkItem(const MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(const MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  DenseMap<const MachineBasicBlock *, BlockInfo> Blocks;
  SmallVector<const MachineInstr *, 2> ExecExports;
  SmallVector<MachineInstr *, 1> LiveMaskQueries;

  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(const MachineInstr &MI,
                            std::vector<WorkItem> &Worklist);
  void propagateBlock(const MachineBasicBlock &MBB,
                      std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Whole Quad Mode";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE,
                      "SI Whole Quad Mode", false, false)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE,
                    "SI Whole Quad Mode", false, false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;

  for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      unsigned Opcode = MI.getOpcode();
      char Flags;

      if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
        Flags = StateWQM;
      } else if (TII->get(Opcode).mayStore() &&
                 (MI.getDesc().TSFlags & SIInstrFlags::VM_CNT)) {
        Flags = StateExact;
      } else {
        // Handle export instructions with the exec mask valid flag set
        if (Opcode == AMDGPU::EXP) {
          if (MI.getOperand(4).getImm() != 0)
            ExecExports.push_back(&MI);
        } else if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        }

        continue;
      }

      Instructions[&MI].Needs = Flags;
      Worklist.push_back(&MI);
      GlobalFlags |= Flags;
    }
  }

  return GlobalFlags;
}

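// Propagate the requirements of a single instruction: mark control flow that
// feeds WQM computations as WQM itself, merge the flags into the containing
// block, push them to the previous instruction in the block, and flag the
// definitions of all virtual register inputs when the instruction needs WQM.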
void SIWholeQuadMode::propagateInstruction(const MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  const MachineBasicBlock &MBB = *MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[&MBB];

  // Control flow-type instructions that are followed by WQM computations
  // must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) &&
      (MI.isBranch() || MI.isTerminator() ||
       MI.getOpcode() == AMDGPU::SI_KILL)) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  BI.Needs |= II.Needs;
  if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
    BI.InNeeds |= II.Needs;
    Worklist.push_back(&MBB);
  }

  // Propagate backwards within block
  if (const MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = II.Needs | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(II.Needs != (StateWQM | StateExact));
  if (II.Needs != StateWQM)
    return;

  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    // At this point, physical registers appear as inputs or outputs
    // and following them makes no sense (and would in fact be incorrect
    // when the same VGPR is used as both an output and an input that leads
    // to a NeedsWQM instruction).
    //
    // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we
    // have to trace this, in practice it happens for 64-bit computations like
    // pointers where both dwords are followed already anyway.
    if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
      continue;

    for (const MachineOperand &Def : MRI->def_operands(Use.getReg())) {
      const MachineInstr *DefMI = Def.getParent();
      InstrInfo &DefII = Instructions[DefMI];

      // Obviously skip if DefMI is already flagged as NeedWQM.
      //
      // The instruction might also be flagged as NeedExact. This happens when
      // the result of an atomic is used in a WQM computation. In this case,
      // the atomic must not run for helper pixels and the WQM result is
      // undefined.
      if (DefII.Needs != 0)
        continue;

      DefII.Needs = StateWQM;
      Worklist.push_back(DefMI);
    }
  }
}

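// Propagate a block's requirements: push OutNeeds into its last instruction,
// require predecessors to provide the block's InNeeds on exit, and require
// successors to accept its OutNeeds on entry.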
void SIWholeQuadMode::propagateBlock(const MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
  BlockInfo BI = Blocks[&MBB]; // take a copy to prevent dangling references

  // Propagate through instructions
  if (!MBB.empty()) {
    const MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (const MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact
  // data.
  for (const MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

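// Seed the worklist by scanning all instructions, then iterate the propagation
// steps to a fixed point. Returns the union of states required anywhere in the
// function.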
char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

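// Switch EXEC to Exact mode by ANDing it with the live mask. If SaveWQM is a
// valid register, the previous (WQM) exec mask is saved into it so that it can
// be restored later.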
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SaveWQM, unsigned LiveMaskReg) {
  if (SaveWQM) {
    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
            SaveWQM)
        .addReg(LiveMaskReg);
  } else {
    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
            AMDGPU::EXEC)
        .addReg(AMDGPU::EXEC)
        .addReg(LiveMaskReg);
  }
}

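// Switch EXEC back to WQM: either restore the exec mask saved by toExact, or
// recompute whole quad mode from the current exec mask.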
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SavedWQM) {
  if (SavedWQM) {
    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
        .addReg(SavedWQM);
  } else {
    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
            AMDGPU::EXEC)
        .addReg(AMDGPU::EXEC);
  }
}

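// Insert the state switches needed within a single basic block, based on the
// per-instruction requirements collected by the analysis.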
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  if (!(BI.InNeeds & StateWQM))
    return;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
    return;

  unsigned SavedWQMReg = 0;
  bool WQMFromExec = isEntry;
  char State = isEntry ? StateExact : StateWQM;

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  while (II != IE) {
    MachineInstr &MI = *II;
    ++II;

    // Skip instructions that are not affected by EXEC
    if (MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD) &&
        !MI.isBranch() && !MI.isTerminator())
      continue;

    // Generic instructions such as COPY will either disappear by register
    // coalescing or be lowered to SALU or VALU instructions.
    if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) {
      if (MI.getNumExplicitOperands() >= 1) {
        const MachineOperand &Op = MI.getOperand(0);
        if (Op.isReg()) {
          if (TRI->isSGPRReg(*MRI, Op.getReg())) {
            // SGPR instructions are not affected by EXEC
            continue;
          }
        }
      }
    }

    char Needs = 0;
    char OutNeeds = 0;
    auto InstrInfoIt = Instructions.find(&MI);
    if (InstrInfoIt != Instructions.end()) {
      Needs = InstrInfoIt->second.Needs;
      OutNeeds = InstrInfoIt->second.OutNeeds;

      // Make sure to switch to Exact mode before the end of the block when
      // Exact and only Exact is needed further downstream.
      if (OutNeeds == StateExact && (MI.isBranch() || MI.isTerminator())) {
        assert(Needs == 0);
        Needs = StateExact;
      }
    }

    // State switching
    if (Needs && State != Needs) {
      if (Needs == StateExact) {
        assert(!SavedWQMReg);

        if (!WQMFromExec && (OutNeeds & StateWQM))
          SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

        toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
      } else {
        assert(WQMFromExec == (SavedWQMReg == 0));
        toWQM(MBB, &MI, SavedWQMReg);
        SavedWQMReg = 0;
      }

      State = Needs;
    }

    if (MI.getOpcode() == AMDGPU::SI_KILL)
      WQMFromExec = false;
  }

  if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
    assert(WQMFromExec == (SavedWQMReg == 0));
    toWQM(MBB, MBB.end(), SavedWQMReg);
  } else if (BI.OutNeeds == StateExact && State != StateExact) {
    toExact(MBB, MBB.end(), 0, LiveMaskReg);
  }
}

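// Replace every SI_PS_LIVE pseudo with a copy of the live mask, i.e. the exec
// mask as it was on function entry.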
void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    DebugLoc DL = MI->getDebugLoc();
    unsigned Dest = MI->getOperand(0).getReg();
    BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
        .addReg(LiveMaskReg);
    MI->eraseFromParent();
  }
}

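// Pass entry point. Only pixel shaders can require whole quad mode; all other
// calling conventions are left untouched.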
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
    return false;

  Instructions.clear();
  Blocks.clear();
  ExecExports.clear();
  LiveMaskQueries.clear();

  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI = static_cast<const SIRegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  MRI = &MF.getRegInfo();

  char GlobalFlags = analyzeFunction(MF);
  if (!(GlobalFlags & StateWQM)) {
    lowerLiveMaskQueries(AMDGPU::EXEC);
    return !LiveMaskQueries.empty();
  }

  // Store a copy of the original live mask when required
  MachineBasicBlock &Entry = MF.front();
  MachineInstr *EntryMI = Entry.getFirstNonPHI();
  unsigned LiveMaskReg = 0;

  if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
    LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
    BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
        .addReg(AMDGPU::EXEC);
  }

  if (GlobalFlags == StateWQM) {
    // For a shader that needs only WQM, we can just set it once.
    BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
            AMDGPU::EXEC).addReg(AMDGPU::EXEC);

    lowerLiveMaskQueries(LiveMaskReg);
    // EntryMI may become invalid here
    return true;
  }

  lowerLiveMaskQueries(LiveMaskReg);
  EntryMI = nullptr;

  // Handle the general case
  for (const auto &BII : Blocks)
    processBlock(const_cast<MachineBasicBlock &>(*BII.first), LiveMaskReg,
                 BII.first == &*MF.begin());

  return true;
}