//===- GCNDPPCombine.cpp - optimization for DPP instructions -------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
// The pass combines V_MOV_B32_dpp instructions with their VALU uses as a DPP
// src0 operand. If any of the use instructions cannot be combined with the mov
// the whole sequence is reverted.
12//
13// $old = ...
14// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
15// dpp_controls..., $bound_ctrl
16// $res = VALU $dpp_value, ...
17//
18// to
19//
20// $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ...,
21// dpp_controls..., $folded_bound_ctrl
22//
// Combining rules:
//
// if $bound_ctrl is DPP_BOUND_ZERO, $old is any
// or $bound_ctrl is DPP_BOUND_OFF, $old is 0
// -> $folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO
//
// if $bound_ctrl is DPP_BOUND_OFF, $old is undef
// -> $folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF
//
// if $bound_ctrl is DPP_BOUND_OFF, $old is foldable
// -> $folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF
//===----------------------------------------------------------------------===//
35//===----------------------------------------------------------------------===//
36
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Pass.h"
#include <cassert>
#include <cstdint>
#include <limits>
53
54using namespace llvm;
55
56#define DEBUG_TYPE "gcn-dpp-combine"
57
58STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
59
60namespace {
61
// Machine pass that folds V_MOV_B32_dpp into the VALU instructions that
// consume its result, producing DPP-encoded VALU instructions.
class GCNDPPCombine : public MachineFunctionPass {
  MachineRegisterInfo *MRI; // Set per function in runOnMachineFunction.
  const SIInstrInfo *TII;   // Set per function in runOnMachineFunction.

  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;

  // Tracks the definition of OldOpnd's register and returns the immediate
  // that initializes it, nullptr if it is undef, or the operand itself.
  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;

  // Tries to fold an immediate old value for OrigMI; returns the register
  // pair to use as the folded old operand, or an empty pair on failure.
  RegSubRegPair foldOldOpnd(MachineInstr &OrigMI,
                            RegSubRegPair OldOpndVGPR,
                            MachineOperand &OldOpndValue) const;

  // Builds the combined DPP instruction, folding OldOpnd first when it holds
  // an immediate value; returns nullptr if the combine is not possible.
  MachineInstr *createDPPInst(MachineInstr &OrigMI,
                              MachineInstr &MovMI,
                              RegSubRegPair OldOpndVGPR,
                              MachineOperand *OldOpnd,
                              bool BoundCtrlZero) const;

  // Builds the combined DPP instruction from OrigMI (the VALU use) and MovMI
  // (the V_MOV_B32_dpp); returns nullptr if the combine is not possible.
  MachineInstr *createDPPInst(MachineInstr &OrigMI,
                              MachineInstr &MovMI,
                              RegSubRegPair OldOpndVGPR,
                              bool BoundCtrlZero) const;

  // Returns true if MI lacks the named immediate operand or its (masked)
  // value equals Value.
  bool hasNoImmOrEqual(MachineInstr &MI,
                       unsigned OpndName,
                       int64_t Value,
                       int64_t Mask = -1) const;

  // Attempts the combine rooted at a single V_MOV_B32_dpp; returns true if
  // the mov and all of its uses were successfully replaced.
  bool combineDPPMov(MachineInstr &MI) const;

public:
  static char ID;

  GCNDPPCombine() : MachineFunctionPass(ID) {
    initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN DPP Combine"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The pass only rewrites instructions inside blocks; CFG is untouched.
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
108
109} // end anonymous namespace
110
// Register the pass with LLVM's pass registry under DEBUG_TYPE.
INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)

// Pass identity: the address of ID uniquely identifies this pass.
char GCNDPPCombine::ID = 0;

// External handle to the pass ID (presumably declared in AMDGPU.h).
char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;

// Factory used by the target to add this pass to the pipeline.
FunctionPass *llvm::createGCNDPPCombinePass() {
  return new GCNDPPCombine();
}
120
121static int getDPPOp(unsigned Op) {
122 auto DPP32 = AMDGPU::getDPPOp32(Op);
123 if (DPP32 != -1)
124 return DPP32;
125
126 auto E32 = AMDGPU::getVOPe32(Op);
127 return E32 != -1 ? AMDGPU::getDPPOp32(E32) : -1;
128}
129
130// tracks the register operand definition and returns:
131// 1. immediate operand used to initialize the register if found
132// 2. nullptr if the register operand is undef
133// 3. the operand itself otherwise
134MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
135 auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
136 if (!Def)
137 return nullptr;
138
139 switch(Def->getOpcode()) {
140 default: break;
141 case AMDGPU::IMPLICIT_DEF:
142 return nullptr;
143 case AMDGPU::COPY:
144 case AMDGPU::V_MOV_B32_e32: {
145 auto &Op1 = Def->getOperand(1);
146 if (Op1.isImm())
147 return &Op1;
148 break;
149 }
150 }
151 return &OldOpnd;
152}
153
// Builds the DPP-encoded replacement for OrigMI, taking the DPP controls from
// MovMI and OldOpndVGPR as the old operand. Returns the new instruction, or
// nullptr (after erasing the partial build) if any operand is illegal or no
// DPP opcode exists.
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair OldOpndVGPR,
                                           bool BoundCtrlZero) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
  // OrigMI must consume the mov's result as src0.
  assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
         TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());

  auto OrigOp = OrigMI.getOpcode();
  auto DPPOp = getDPPOp(OrigOp);
  if (DPPOp == -1) {
    LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
    return nullptr;
  }

  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
                         OrigMI.getDebugLoc(), TII->get(DPPOp));
  bool Fail = false;
  // do/while(false): single-exit construct so failures can 'break' to the
  // common cleanup below instead of duplicating it.
  do {
    // Operand order must match the DPP instruction's definition exactly;
    // NumOperands tracks the index of the next operand to append.
    auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
    assert(Dst);
    DPPInst.add(*Dst);
    int NumOperands = 1;

    // 'old' operand: value written to lanes with disabled/invalid sources.
    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
    if (OldIdx != -1) {
      assert(OldIdx == NumOperands);
      assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI));
      DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg);
      ++NumOperands;
    }

    // src0 modifiers (only abs/neg allowed; checked by the caller's filter).
    if (auto *Mod0 = TII->getNamedOperand(OrigMI,
                                          AMDGPU::OpName::src0_modifiers)) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                          AMDGPU::OpName::src0_modifiers));
      assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
      DPPInst.addImm(Mod0->getImm());
      ++NumOperands;
    }
    // src0 of the combined instruction is the mov's src0 (the DPP-read VGPR).
    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
    assert(Src0);
    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
      Fail = true;
      break;
    }
    DPPInst.add(*Src0);
    ++NumOperands;

    // src1 modifiers, if the DPP opcode has them.
    if (auto *Mod1 = TII->getNamedOperand(OrigMI,
                                          AMDGPU::OpName::src1_modifiers)) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                          AMDGPU::OpName::src1_modifiers));
      assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
      DPPInst.addImm(Mod1->getImm());
      ++NumOperands;
    }
    // src1 carries over unchanged from OrigMI.
    if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src1);
      ++NumOperands;
    }

    // src2 carries over unchanged from OrigMI (e.g. carry-in style operands).
    if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src2);
    }

    // DPP control operands come from the mov being folded away.
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
    DPPInst.addImm(BoundCtrlZero ? 1 : 0);
  } while (false);

  if (Fail) {
    // Drop the partially-built instruction so the MBB is left unchanged.
    DPPInst.getInstr()->eraseFromParent();
    return nullptr;
  }
  LLVM_DEBUG(dbgs() << "  combined: " << *DPPInst.getInstr());
  return DPPInst.getInstr();
}
244
245GCNDPPCombine::RegSubRegPair
246GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI,
247 RegSubRegPair OldOpndVGPR,
248 MachineOperand &OldOpndValue) const {
249 assert(OldOpndValue.isImm());
250 switch (OrigMI.getOpcode()) {
251 default: break;
252 case AMDGPU::V_MAX_U32_e32:
253 if (OldOpndValue.getImm() == std::numeric_limits<uint32_t>::max())
254 return OldOpndVGPR;
255 break;
256 case AMDGPU::V_MAX_I32_e32:
257 if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::max())
258 return OldOpndVGPR;
259 break;
260 case AMDGPU::V_MIN_I32_e32:
261 if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::min())
262 return OldOpndVGPR;
263 break;
264
265 case AMDGPU::V_MUL_I32_I24_e32:
266 case AMDGPU::V_MUL_U32_U24_e32:
267 if (OldOpndValue.getImm() == 1) {
268 auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
269 assert(Src1 && Src1->isReg());
270 return getRegSubRegPair(*Src1);
271 }
272 break;
273 }
274 return RegSubRegPair();
275}
276
277// Cases to combine:
278// $bound_ctrl is DPP_BOUND_ZERO, $old is any
279// $bound_ctrl is DPP_BOUND_OFF, $old is 0
280// -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO
281
282// $bound_ctrl is DPP_BOUND_OFF, $old is undef
283// -> $old = undef, $bound_ctrl = DPP_BOUND_OFF
284
285// $bound_ctrl is DPP_BOUND_OFF, $old is foldable
286// -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF
287
288MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
289 MachineInstr &MovMI,
290 RegSubRegPair OldOpndVGPR,
291 MachineOperand *OldOpndValue,
292 bool BoundCtrlZero) const {
293 assert(OldOpndVGPR.Reg);
294 if (!BoundCtrlZero && OldOpndValue) {
295 assert(OldOpndValue->isImm());
296 OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue);
297 if (!OldOpndVGPR.Reg) {
298 LLVM_DEBUG(dbgs() << " failed: old immediate cannot be folded\n");
299 return nullptr;
300 }
301 }
302 return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero);
303}
304
305// returns true if MI doesn't have OpndName immediate operand or the
306// operand has Value
307bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
308 int64_t Value, int64_t Mask) const {
309 auto *Imm = TII->getNamedOperand(MI, OpndName);
310 if (!Imm)
311 return true;
312
313 assert(Imm->isImm());
314 return (Imm->getImm() & Mask) == Value;
315}
316
// Attempts to combine MovMI (a V_MOV_B32_dpp) into every VALU instruction
// that uses its result. All uses must combine for the transform to stick;
// otherwise every newly created instruction is erased and the function
// returns false (the original sequence is reverted).
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
  assert(BCZOpnd && BCZOpnd->isImm());
  bool BoundCtrlZero = 0 != BCZOpnd->getImm();

  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);

  // Classify the old operand: immediate, undef (nullptr), or opaque register.
  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
  assert(OldOpnd && OldOpnd->isReg());
  auto OldOpndVGPR = getRegSubRegPair(*OldOpnd);
  auto *OldOpndValue = getOldOpndValue(*OldOpnd);
  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);
  if (OldOpndValue) {
    if (BoundCtrlZero) {
      // With bound_ctrl zero the old value is never read; treat as undef.
      OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd
      OldOpndValue = nullptr;
    } else {
      if (!OldOpndValue->isImm()) {
        LLVM_DEBUG(dbgs() << "  failed: old operand isn't an imm or undef\n");
        return false;
      }
      if (OldOpndValue->getImm() == 0) {
        // old == 0 with bound_ctrl off is equivalent to bound_ctrl zero
        // (see the combining rules in the file header).
        OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef
        OldOpndValue = nullptr;
        BoundCtrlZero = true;
      }
    }
  }

  LLVM_DEBUG(dbgs() << "  old=";
    if (!OldOpndValue)
      dbgs() << "undef";
    else
      dbgs() << OldOpndValue->getImm();
    dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n');

  // OrigMIs: instructions to erase on success; DPPMIs: to erase on rollback.
  std::vector<MachineInstr*> OrigMIs, DPPMIs;
  if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef
    // Materialize an undef VGPR to serve as the folded old operand.
    OldOpndVGPR = RegSubRegPair(
      MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
                             TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg);
    DPPMIs.push_back(UndefInst.getInstr());
  }

  OrigMIs.push_back(&MovMI);
  bool Rollback = true;
  // Try to combine every (non-debug) use of the mov's result.
  for (auto &Use : MRI->use_nodbg_operands(
         TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) {
    Rollback = true;

    auto &OrigMI = *Use.getParent();
    auto OrigOp = OrigMI.getOpcode();
    if (TII->isVOP3(OrigOp)) {
      // VOP3 is only combinable if it shrinks to e32 and carries no
      // modifiers beyond abs/neg.
      if (!TII->hasVALU32BitEncoding(OrigOp)) {
        LLVM_DEBUG(dbgs() << "  failed: VOP3 hasn't e32 equivalent\n");
        break;
      }
      // check if other than abs|neg modifiers are set (opsel for example)
      const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
      if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
        LLVM_DEBUG(dbgs() << "  failed: VOP3 has non-default modifiers\n");
        break;
      }
    } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
      LLVM_DEBUG(dbgs() << "  failed: not VOP1/2/3\n");
      break;
    }

    LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
    if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
      // Direct case: the mov feeds src0, where DPP controls apply.
      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR,
                                        OldOpndValue, BoundCtrlZero)) {
        DPPMIs.push_back(DPPInst);
        Rollback = false;
      }
    } else if (OrigMI.isCommutable() &&
               &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
      // The mov feeds src1: clone, commute the clone to move the use to
      // src0, then try to combine the commuted copy.
      auto *BB = OrigMI.getParent();
      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
      BB->insert(OrigMI, NewMI);
      if (TII->commuteInstruction(*NewMI)) {
        LLVM_DEBUG(dbgs() << "  commuted: " << *NewMI);
        if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR,
                                          OldOpndValue, BoundCtrlZero)) {
          DPPMIs.push_back(DPPInst);
          Rollback = false;
        }
      } else
        LLVM_DEBUG(dbgs() << "  failed: cannot be commuted\n");
      // The clone was only a commuting scratchpad; always erase it.
      NewMI->eraseFromParent();
    } else
      LLVM_DEBUG(dbgs() << "  failed: no suitable operands\n");
    if (Rollback)
      break;
    OrigMIs.push_back(&OrigMI);
  }

  // On rollback erase all new DPP instructions; on success erase the mov and
  // every combined original use.
  for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
    MI->eraseFromParent();

  return !Rollback;
}
424
425bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
426 auto &ST = MF.getSubtarget<GCNSubtarget>();
427 if (!ST.hasDPP() || skipFunction(MF.getFunction()))
428 return false;
429
430 MRI = &MF.getRegInfo();
431 TII = ST.getInstrInfo();
432
433 assert(MRI->isSSA() && "Must be run on SSA");
434
435 bool Changed = false;
436 for (auto &MBB : MF) {
437 for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
438 auto &MI = *I++;
439 if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
440 Changed = true;
441 ++NumDPPMovsCombined;
442 }
443 }
444 }
445 return Changed;
446}