//===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass removes redundant S_OR_B64 instructions enabling lanes in
/// the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any
/// vector instructions between them we can only keep the outer SI_END_CF, given
/// that the CFG is structured and the exec bits of the outer end statement are
/// always not less than the exec bits of the inner one.
///
/// This needs to be done before the RA to eliminate saved exec bits registers
/// but after the register coalescer to have no vector register copies in
/// between the different end cf statements.
///
//===----------------------------------------------------------------------===//
21
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

// Name used for pass registration and -debug-only filtering.
#define DEBUG_TYPE "si-optimize-exec-masking-pre-ra"
32
33namespace {
34
35class SIOptimizeExecMaskingPreRA : public MachineFunctionPass {
36public:
37 static char ID;
38
39public:
40 SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) {
41 initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry());
42 }
43
44 bool runOnMachineFunction(MachineFunction &MF) override;
45
46 StringRef getPassName() const override {
47 return "SI optimize exec mask operations pre-RA";
48 }
49
50 void getAnalysisUsage(AnalysisUsage &AU) const override {
51 AU.addRequired<LiveIntervals>();
52 AU.setPreservesAll();
53 MachineFunctionPass::getAnalysisUsage(AU);
54 }
55};
56
57} // End anonymous namespace.
58
// Register the pass and its LiveIntervals dependency with the pass registry.
INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
                      "SI optimize exec mask operations pre-RA", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
                    "SI optimize exec mask operations pre-RA", false, false)

char SIOptimizeExecMaskingPreRA::ID = 0;

char &llvm::SIOptimizeExecMaskingPreRAID = SIOptimizeExecMaskingPreRA::ID;

// Factory used by the AMDGPU target pass pipeline.
FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
  return new SIOptimizeExecMaskingPreRA();
}
72
73static bool isEndCF(const MachineInstr& MI, const SIRegisterInfo* TRI) {
74 return MI.getOpcode() == AMDGPU::S_OR_B64 &&
75 MI.modifiesRegister(AMDGPU::EXEC, TRI);
76}
77
78static bool isFullExecCopy(const MachineInstr& MI) {
Matt Arsenault77bf2e32019-03-25 21:28:53 +000079 return MI.getOperand(1).getReg() == AMDGPU::EXEC;
Stanislav Mekhanoshin37e7f952017-08-01 23:14:32 +000080}
81
82static unsigned getOrNonExecReg(const MachineInstr &MI,
83 const SIInstrInfo &TII) {
84 auto Op = TII.getNamedOperand(MI, AMDGPU::OpName::src1);
85 if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
86 return Op->getReg();
87 Op = TII.getNamedOperand(MI, AMDGPU::OpName::src0);
88 if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
89 return Op->getReg();
90 return AMDGPU::NoRegister;
91}
92
93static MachineInstr* getOrExecSource(const MachineInstr &MI,
94 const SIInstrInfo &TII,
95 const MachineRegisterInfo &MRI) {
96 auto SavedExec = getOrNonExecReg(MI, TII);
97 if (SavedExec == AMDGPU::NoRegister)
98 return nullptr;
99 auto SaveExecInst = MRI.getUniqueVRegDef(SavedExec);
100 if (!SaveExecInst || !isFullExecCopy(*SaveExecInst))
101 return nullptr;
102 return SaveExecInst;
103}
104
Stanislav Mekhanoshin6071e1a2018-12-13 03:17:40 +0000105// Optimize sequence
106// %sel = V_CNDMASK_B32_e64 0, 1, %cc
107// %cmp = V_CMP_NE_U32 1, %1
108// $vcc = S_AND_B64 $exec, %cmp
109// S_CBRANCH_VCC[N]Z
110// =>
111// $vcc = S_ANDN2_B64 $exec, %cc
112// S_CBRANCH_VCC[N]Z
113//
114// It is the negation pattern inserted by DAGCombiner::visitBRCOND() in the
115// rebuildSetCC(). We start with S_CBRANCH to avoid exhaustive search, but
116// only 3 first instructions are really needed. S_AND_B64 with exec is a
117// required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive
118// lanes.
119//
120// Returns %cc register on success.
121static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
122 const GCNSubtarget &ST,
123 MachineRegisterInfo &MRI,
124 LiveIntervals *LIS) {
125 const SIRegisterInfo *TRI = ST.getRegisterInfo();
126 const SIInstrInfo *TII = ST.getInstrInfo();
127 const unsigned AndOpc = AMDGPU::S_AND_B64;
128 const unsigned Andn2Opc = AMDGPU::S_ANDN2_B64;
129 const unsigned CondReg = AMDGPU::VCC;
130 const unsigned ExecReg = AMDGPU::EXEC;
131
132 auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) {
133 unsigned Opc = MI.getOpcode();
134 return Opc == AMDGPU::S_CBRANCH_VCCZ ||
135 Opc == AMDGPU::S_CBRANCH_VCCNZ; });
136 if (I == MBB.terminators().end())
137 return AMDGPU::NoRegister;
138
139 auto *And = TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister,
140 *I, MRI, LIS);
141 if (!And || And->getOpcode() != AndOpc ||
142 !And->getOperand(1).isReg() || !And->getOperand(2).isReg())
143 return AMDGPU::NoRegister;
144
145 MachineOperand *AndCC = &And->getOperand(1);
146 unsigned CmpReg = AndCC->getReg();
147 unsigned CmpSubReg = AndCC->getSubReg();
148 if (CmpReg == ExecReg) {
149 AndCC = &And->getOperand(2);
150 CmpReg = AndCC->getReg();
151 CmpSubReg = AndCC->getSubReg();
152 } else if (And->getOperand(2).getReg() != ExecReg) {
153 return AMDGPU::NoRegister;
154 }
155
156 auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, MRI, LIS);
157 if (!Cmp || !(Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 ||
158 Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) ||
159 Cmp->getParent() != And->getParent())
160 return AMDGPU::NoRegister;
161
162 MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0);
163 MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1);
164 if (Op1->isImm() && Op2->isReg())
165 std::swap(Op1, Op2);
166 if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1)
167 return AMDGPU::NoRegister;
168
169 unsigned SelReg = Op1->getReg();
170 auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, MRI, LIS);
171 if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
172 return AMDGPU::NoRegister;
173
Tim Renouf2e94f6e2019-03-18 19:25:39 +0000174 if (TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) ||
175 TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers))
176 return AMDGPU::NoRegister;
177
Stanislav Mekhanoshin6071e1a2018-12-13 03:17:40 +0000178 Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
179 Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
180 MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
181 if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() ||
182 Op1->getImm() != 0 || Op2->getImm() != 1)
183 return AMDGPU::NoRegister;
184
185 LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t'
186 << *Cmp << '\t' << *And);
187
188 unsigned CCReg = CC->getReg();
189 LIS->RemoveMachineInstrFromMaps(*And);
190 MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(),
191 TII->get(Andn2Opc), And->getOperand(0).getReg())
192 .addReg(ExecReg)
193 .addReg(CCReg, CC->getSubReg());
194 And->eraseFromParent();
195 LIS->InsertMachineInstrInMaps(*Andn2);
196
197 LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n');
198
199 // Try to remove compare. Cmp value should not used in between of cmp
200 // and s_and_b64 if VCC or just unused if any other register.
201 if ((TargetRegisterInfo::isVirtualRegister(CmpReg) &&
202 MRI.use_nodbg_empty(CmpReg)) ||
203 (CmpReg == CondReg &&
204 std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
Stanislav Mekhanoshind933c2c2018-12-13 05:52:11 +0000205 [&](const MachineInstr &MI) {
Stanislav Mekhanoshin6071e1a2018-12-13 03:17:40 +0000206 return MI.readsRegister(CondReg, TRI); }))) {
207 LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');
208
209 LIS->RemoveMachineInstrFromMaps(*Cmp);
210 Cmp->eraseFromParent();
211
212 // Try to remove v_cndmask_b32.
213 if (TargetRegisterInfo::isVirtualRegister(SelReg) &&
214 MRI.use_nodbg_empty(SelReg)) {
215 LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
216
217 LIS->RemoveMachineInstrFromMaps(*Sel);
218 Sel->eraseFromParent();
219 }
220 }
221
222 return CCReg;
223}
224
Stanislav Mekhanoshin37e7f952017-08-01 23:14:32 +0000225bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
Matthias Braunf1caa282017-12-15 22:22:58 +0000226 if (skipFunction(MF.getFunction()))
Stanislav Mekhanoshin37e7f952017-08-01 23:14:32 +0000227 return false;
228
Tom Stellard5bfbae52018-07-11 20:59:01 +0000229 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Stanislav Mekhanoshin37e7f952017-08-01 23:14:32 +0000230 const SIRegisterInfo *TRI = ST.getRegisterInfo();
231 const SIInstrInfo *TII = ST.getInstrInfo();
232 MachineRegisterInfo &MRI = MF.getRegInfo();
233 LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
Stanislav Mekhanoshina9487d92017-08-16 04:43:49 +0000234 DenseSet<unsigned> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
Stanislav Mekhanoshin37e7f952017-08-01 23:14:32 +0000235 bool Changed = false;
236
237 for (MachineBasicBlock &MBB : MF) {
Stanislav Mekhanoshina9487d92017-08-16 04:43:49 +0000238
Stanislav Mekhanoshin6071e1a2018-12-13 03:17:40 +0000239 if (unsigned Reg = optimizeVcndVcmpPair(MBB, ST, MRI, LIS)) {
240 RecalcRegs.insert(Reg);
241 RecalcRegs.insert(AMDGPU::VCC_LO);
242 RecalcRegs.insert(AMDGPU::VCC_HI);
243 RecalcRegs.insert(AMDGPU::SCC);
244 Changed = true;
245 }
246
Stanislav Mekhanoshina9487d92017-08-16 04:43:49 +0000247 // Try to remove unneeded instructions before s_endpgm.
248 if (MBB.succ_empty()) {
Matt Arsenault755f41f2018-08-28 18:55:55 +0000249 if (MBB.empty())
250 continue;
251
252 // Skip this if the endpgm has any implicit uses, otherwise we would need
253 // to be careful to update / remove them.
David Stuttard20ea21c2019-03-12 09:52:58 +0000254 // S_ENDPGM always has a single imm operand that is not used other than to
255 // end up in the encoding
Matt Arsenault755f41f2018-08-28 18:55:55 +0000256 MachineInstr &Term = MBB.back();
David Stuttard20ea21c2019-03-12 09:52:58 +0000257 if (Term.getOpcode() != AMDGPU::S_ENDPGM || Term.getNumOperands() != 1)
Stanislav Mekhanoshina9487d92017-08-16 04:43:49 +0000258 continue;
259
260 SmallVector<MachineBasicBlock*, 4> Blocks({&MBB});
261
262 while (!Blocks.empty()) {
263 auto CurBB = Blocks.pop_back_val();
264 auto I = CurBB->rbegin(), E = CurBB->rend();
265 if (I != E) {
266 if (I->isUnconditionalBranch() || I->getOpcode() == AMDGPU::S_ENDPGM)
267 ++I;
268 else if (I->isBranch())
269 continue;
270 }
271
272 while (I != E) {
Shiva Chen801bf7e2018-05-09 02:42:00 +0000273 if (I->isDebugInstr()) {
Matt Arsenault7f0a5272017-12-05 18:23:17 +0000274 I = std::next(I);
Stanislav Mekhanoshina9487d92017-08-16 04:43:49 +0000275 continue;
Matt Arsenault7f0a5272017-12-05 18:23:17 +0000276 }
277
Stanislav Mekhanoshina9487d92017-08-16 04:43:49 +0000278 if (I->mayStore() || I->isBarrier() || I->isCall() ||
279 I->hasUnmodeledSideEffects() || I->hasOrderedMemoryRef())
280 break;
281
Nicola Zaghend34e60c2018-05-14 12:53:11 +0000282 LLVM_DEBUG(dbgs()
283 << "Removing no effect instruction: " << *I << '\n');
Stanislav Mekhanoshina9487d92017-08-16 04:43:49 +0000284
Matt Arsenault2f4df7e2017-09-08 18:51:26 +0000285 for (auto &Op : I->operands()) {
Stanislav Mekhanoshina9487d92017-08-16 04:43:49 +0000286 if (Op.isReg())
287 RecalcRegs.insert(Op.getReg());
Matt Arsenault2f4df7e2017-09-08 18:51:26 +0000288 }
Stanislav Mekhanoshina9487d92017-08-16 04:43:49 +0000289
290 auto Next = std::next(I);
291 LIS->RemoveMachineInstrFromMaps(*I);
292 I->eraseFromParent();
293 I = Next;
294
295 Changed = true;
296 }
297
298 if (I != E)
299 continue;
300
301 // Try to ascend predecessors.
302 for (auto *Pred : CurBB->predecessors()) {
303 if (Pred->succ_size() == 1)
304 Blocks.push_back(Pred);
305 }
306 }
307 continue;
308 }
309
310 // Try to collapse adjacent endifs.
Stanislav Mekhanoshin37e7f952017-08-01 23:14:32 +0000311 auto Lead = MBB.begin(), E = MBB.end();
312 if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI))
313 continue;
314
315 const MachineBasicBlock* Succ = *MBB.succ_begin();
316 if (!MBB.isLayoutSuccessor(Succ))
317 continue;
318
319 auto I = std::next(Lead);
320
321 for ( ; I != E; ++I)
322 if (!TII->isSALU(*I) || I->readsRegister(AMDGPU::EXEC, TRI))
323 break;
324
325 if (I != E)
326 continue;
327
328 const auto NextLead = Succ->begin();
329 if (NextLead == Succ->end() || !isEndCF(*NextLead, TRI) ||
330 !getOrExecSource(*NextLead, *TII, MRI))
331 continue;
332
Nicola Zaghend34e60c2018-05-14 12:53:11 +0000333 LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');
Stanislav Mekhanoshin37e7f952017-08-01 23:14:32 +0000334
Stanislav Mekhanoshinf23ae4f2017-08-02 01:18:57 +0000335 auto SaveExec = getOrExecSource(*Lead, *TII, MRI);
Stanislav Mekhanoshin37e7f952017-08-01 23:14:32 +0000336 unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII);
Matt Arsenault2f4df7e2017-09-08 18:51:26 +0000337 for (auto &Op : Lead->operands()) {
338 if (Op.isReg())
339 RecalcRegs.insert(Op.getReg());
340 }
341
Stanislav Mekhanoshin37e7f952017-08-01 23:14:32 +0000342 LIS->RemoveMachineInstrFromMaps(*Lead);
343 Lead->eraseFromParent();
344 if (SaveExecReg) {
345 LIS->removeInterval(SaveExecReg);
346 LIS->createAndComputeVirtRegInterval(SaveExecReg);
347 }
348
349 Changed = true;
Stanislav Mekhanoshinda0edef2017-08-01 23:44:35 +0000350
351 // If the only use of saved exec in the removed instruction is S_AND_B64
352 // fold the copy now.
Stanislav Mekhanoshinda0edef2017-08-01 23:44:35 +0000353 if (!SaveExec || !SaveExec->isFullCopy())
354 continue;
355
356 unsigned SavedExec = SaveExec->getOperand(0).getReg();
357 bool SafeToReplace = true;
358 for (auto& U : MRI.use_nodbg_instructions(SavedExec)) {
359 if (U.getParent() != SaveExec->getParent()) {
360 SafeToReplace = false;
361 break;
362 }
363
Nicola Zaghend34e60c2018-05-14 12:53:11 +0000364 LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n');
Stanislav Mekhanoshinda0edef2017-08-01 23:44:35 +0000365 }
366
367 if (SafeToReplace) {
368 LIS->RemoveMachineInstrFromMaps(*SaveExec);
369 SaveExec->eraseFromParent();
370 MRI.replaceRegWith(SavedExec, AMDGPU::EXEC);
371 LIS->removeInterval(SavedExec);
372 }
Stanislav Mekhanoshin37e7f952017-08-01 23:14:32 +0000373 }
374
375 if (Changed) {
Stanislav Mekhanoshina9487d92017-08-16 04:43:49 +0000376 for (auto Reg : RecalcRegs) {
377 if (TargetRegisterInfo::isVirtualRegister(Reg)) {
378 LIS->removeInterval(Reg);
379 if (!MRI.reg_empty(Reg))
380 LIS->createAndComputeVirtRegInterval(Reg);
381 } else {
Matt Arsenault476e26b2019-02-22 19:03:36 +0000382 LIS->removeAllRegUnitsForPhysReg(Reg);
Stanislav Mekhanoshina9487d92017-08-16 04:43:49 +0000383 }
384 }
Stanislav Mekhanoshin37e7f952017-08-01 23:14:32 +0000385 }
386
387 return Changed;
388}