blob: 67c86c3b8b91ed111af1625b50bec0e87787f5d8 [file] [log] [blame]
//===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %vreg0, 16, %vreg1
///   V_ADD_I32_e32 %vreg2, %vreg0, %vreg3
///   V_LSHLREV_B32_e32 %vreg4, 16, %vreg2
///
/// Replace:
///   V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//
22
23
24#include "AMDGPU.h"
25#include "AMDGPUSubtarget.h"
26#include "SIDefines.h"
27#include "SIInstrInfo.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/CodeGen/MachineFunctionPass.h"
31#include "llvm/CodeGen/MachineInstrBuilder.h"
32#include <unordered_map>
33
34using namespace llvm;
35
36#define DEBUG_TYPE "si-peephole-sdwa"
37
38STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
39STATISTIC(NumSDWAInstructionsPeepholed,
40 "Number of instruction converted to SDWA.");
41
42namespace {
43
44class SDWAOperand;
45
class SIPeepholeSDWA : public MachineFunctionPass {
private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  // Maps each matched shift/bfe/and instruction to the SDWA operand pattern
  // extracted from it by matchSDWAOperands().
  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;

  // Try to reduce Op to a compile-time immediate: either Op is itself an
  // immediate, or it is a register defined by a foldable copy of one.
  // Returns None when no immediate can be derived.
  Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  // Scan MBB and record SDWA src/dst operand patterns into SDWAOperands.
  void matchSDWAOperands(MachineBasicBlock &MBB);
  // Rewrite MI into its SDWA form, applying the matched operand patterns.
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
76
77class SDWAOperand {
78private:
79 MachineOperand *Target; // Operand that would be used in converted instruction
80 MachineOperand *Replaced; // Operand that would be replace by Target
81
82public:
83 SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
84 : Target(TargetOp), Replaced(ReplacedOp) {
85 assert(Target->isReg());
86 assert(Replaced->isReg());
87 }
88
89 virtual ~SDWAOperand() {}
90
91 virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
92 virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
93
94 MachineOperand *getTargetOperand() const { return Target; }
95 MachineOperand *getReplacedOperand() const { return Replaced; }
96 MachineInstr *getParentInst() const { return Target->getParent(); }
97 MachineRegisterInfo *getMRI() const {
98 return &getParentInst()->getParent()->getParent()->getRegInfo();
99 }
100};
101
102using namespace AMDGPU::SDWA;
103
104class SDWASrcOperand : public SDWAOperand {
105private:
106 SdwaSel SrcSel;
107 bool Abs;
108 bool Neg;
109 bool Sext;
110
111public:
112 SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
113 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
114 bool Sext_ = false)
115 : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
116 Neg(Neg_), Sext(Sext_) {}
117
118 virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
119 virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
120
121 SdwaSel getSrcSel() const { return SrcSel; }
122 bool getAbs() const { return Abs; }
123 bool getNeg() const { return Neg; }
124 bool getSext() const { return Sext; }
125
126 uint64_t getSrcMods() const;
127};
128
129class SDWADstOperand : public SDWAOperand {
130private:
131 SdwaSel DstSel;
132 DstUnused DstUn;
133
134public:
135 SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
136 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
137 : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
138
139 virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
140 virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
141
142 SdwaSel getDstSel() const { return DstSel; }
143 DstUnused getDstUnused() const { return DstUn; }
144};
145
146} // End anonymous namespace.
147
INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

// The address of ID serves as the unique identifier for this pass.
char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

// Factory called when the AMDGPU target builds its pass pipeline.
FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}
157
158#ifndef NDEBUG
159
160static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
161 switch(Sel) {
162 case BYTE_0: OS << "BYTE_0"; break;
163 case BYTE_1: OS << "BYTE_1"; break;
164 case BYTE_2: OS << "BYTE_2"; break;
165 case BYTE_3: OS << "BYTE_3"; break;
166 case WORD_0: OS << "WORD_0"; break;
167 case WORD_1: OS << "WORD_1"; break;
168 case DWORD: OS << "DWORD"; break;
169 }
170 return OS;
171}
172
173static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
174 switch(Un) {
175 case UNUSED_PAD: OS << "UNUSED_PAD"; break;
176 case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
177 case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
178 }
179 return OS;
180}
181
182static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) {
183 OS << "SDWA src: " << *Src.getTargetOperand()
184 << " src_sel:" << Src.getSrcSel()
185 << " abs:" << Src.getAbs() << " neg:" << Src.getNeg()
186 << " sext:" << Src.getSext() << '\n';
187 return OS;
188}
189
190static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) {
191 OS << "SDWA dst: " << *Dst.getTargetOperand()
192 << " dst_sel:" << Dst.getDstSel()
193 << " dst_unused:" << Dst.getDstUnused() << '\n';
194 return OS;
195}
196
197#endif
198
199static bool isSameBB(const MachineInstr *FirstMI, const MachineInstr *SecondMI) {
200 assert(FirstMI && SecondMI);
201 return FirstMI->getParent() == SecondMI->getParent();
202}
203
204static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
205 assert(To.isReg() && From.isReg());
206 To.setReg(From.getReg());
207 To.setSubReg(From.getSubReg());
208 To.setIsUndef(From.isUndef());
209 if (To.isUse()) {
210 To.setIsKill(From.isKill());
211 } else {
212 To.setIsDead(From.isDead());
213 }
214}
215
216static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
217 return LHS.isReg() &&
218 RHS.isReg() &&
219 LHS.getReg() == RHS.getReg() &&
220 LHS.getSubReg() == RHS.getSubReg();
221}
222
223static bool isSubregOf(const MachineOperand &SubReg,
224 const MachineOperand &SuperReg,
225 const TargetRegisterInfo *TRI) {
226
227 if (!SuperReg.isReg() || !SubReg.isReg())
228 return false;
229
230 if (isSameReg(SuperReg, SubReg))
231 return true;
232
233 if (SuperReg.getReg() != SubReg.getReg())
234 return false;
235
236 LaneBitmask::Type SuperMask =
237 TRI->getSubRegIndexLaneMask(SuperReg.getSubReg()).getAsInteger();
238 LaneBitmask::Type SubMask =
239 TRI->getSubRegIndexLaneMask(SubReg.getSubReg()).getAsInteger();
240 return TRI->regmaskSubsetEqual(&SubMask, &SuperMask);
241}
242
243uint64_t SDWASrcOperand::getSrcMods() const {
244 uint64_t Mods = 0;
245 if (Abs || Neg) {
246 assert(!Sext &&
247 "Float and integer src modifiers can't be set simulteniously");
248 Mods |= Abs ? SISrcMods::ABS : 0;
249 Mods |= Neg ? SISrcMods::NEG : 0;
250 } else if (Sext) {
251 Mods |= SISrcMods::SEXT;
252 }
253
254 return Mods;
255}
256
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA src operand the potential instruction is the one that uses
  // the register defined by the parent instruction.
  MachineRegisterInfo *MRI = getMRI();
  MachineOperand *Replaced = getReplacedOperand();
  assert(Replaced->isReg());

  MachineInstr *PotentialMI = nullptr;
  for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) {
    // If this is use of another subreg of dst reg then do nothing
    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
      continue;

    // If there exist use of dst in another basic block or use of superreg of
    // dst then we should not combine this operand
    if (!isSameBB(PotentialMO.getParent(), getParentInst()) ||
        !isSameReg(PotentialMO, *Replaced))
      return nullptr;

    // Check that PotentialMI is the only instruction that uses dst reg
    if (PotentialMI == nullptr) {
      PotentialMI = PotentialMO.getParent();
    } else if (PotentialMI != PotentialMO.getParent()) {
      return nullptr;
    }
  }

  // nullptr when dst has no uses, multiple users, cross-block uses, or
  // superreg uses; otherwise the unique user.
  return PotentialMI;
}
286
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Find operand in instruction that matches source operand and replace it
  // with target operand. Set corresponding src_sel and src_modifiers.

  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && Src->isReg());
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it should be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand
      // to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
  }
  copyRegOperand(*Src, *getTargetOperand());
  SrcSel->setImm(getSrcSel());
  SrcMods->setImm(getSrcMods());
  // The target register now has a use beyond its original last use, so it
  // must not carry a kill flag anymore.
  getTargetOperand()->setIsKill(false);
  return true;
}
320
MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA dst operand the potential instruction is the one that
  // defines the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();
  MachineOperand *Replaced = getReplacedOperand();
  assert(Replaced->isReg());

  for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) {
    // Skip defs of other subregisters of the same register.
    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
      continue;

    // Bail out on defs in another basic block or defs of a super/subreg.
    if (!isSameBB(getParentInst(), PotentialMO.getParent()) ||
        !isSameReg(*Replaced, PotentialMO))
      return nullptr;

    // Check that ParentMI is the only instruction that uses replaced register
    for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) {
      if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) &&
          UseMO.getParent() != ParentMI) {
        return nullptr;
      }
    }

    // Due to SSA this should be the only def of the replaced register, so
    // return it.
    return PotentialMO.getParent();
  }

  return nullptr;
}
351
bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace vdst operand in MI with target operand. Set dst_sel and
  // dst_unused accordingly.

  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove original instruction because it would conflict with our new
  // instruction by register definition
  getParentInst()->eraseFromParent();
  return true;
}
379
// Try to evaluate Op as a compile-time immediate: either Op is itself an
// immediate operand, or it is a register whose (SSA) definition is a
// foldable copy of an immediate in the same basic block.
Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not immediate then it can be copy of immediate value, e.g.:
  // %vreg1<def> = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst) || !isSameBB(Op.getParent(), DefInst))
        return None;

      // Operand 1 of a foldable copy is the copied source value.
      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return None;

      return Copied.getImm();
    }
  }

  return None;
}
406
Sam Koltonf60ad582017-03-21 12:51:34 +0000407void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
408 for (MachineInstr &MI : MBB) {
409 unsigned Opcode = MI.getOpcode();
410 switch (Opcode) {
411 case AMDGPU::V_LSHRREV_B32_e32:
412 case AMDGPU::V_ASHRREV_I32_e32:
413 case AMDGPU::V_LSHLREV_B32_e32: {
414 // from: v_lshrrev_b32_e32 v1, 16/24, v0
415 // to SDWA src:v0 src_sel:WORD_1/BYTE_3
416
417 // from: v_ashrrev_i32_e32 v1, 16/24, v0
418 // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
419
420 // from: v_lshlrev_b32_e32 v1, 16/24, v0
421 // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
422 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000423 auto Imm = foldToImm(*Src0);
424 if (!Imm)
Sam Koltonf60ad582017-03-21 12:51:34 +0000425 break;
426
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000427 if (*Imm != 16 && *Imm != 24)
Sam Koltonf60ad582017-03-21 12:51:34 +0000428 break;
429
430 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
431 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
432 if (TRI->isPhysicalRegister(Src1->getReg()) ||
433 TRI->isPhysicalRegister(Dst->getReg()))
434 break;
435
436 if (Opcode == AMDGPU::V_LSHLREV_B32_e32) {
437 auto SDWADst = make_unique<SDWADstOperand>(
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000438 Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
Sam Koltonf60ad582017-03-21 12:51:34 +0000439 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
440 SDWAOperands[&MI] = std::move(SDWADst);
441 ++NumSDWAPatternsFound;
442 } else {
443 auto SDWASrc = make_unique<SDWASrcOperand>(
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000444 Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
Sam Koltonf60ad582017-03-21 12:51:34 +0000445 Opcode == AMDGPU::V_LSHRREV_B32_e32 ? false : true);
446 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
447 SDWAOperands[&MI] = std::move(SDWASrc);
448 ++NumSDWAPatternsFound;
449 }
450 break;
451 }
452
453 case AMDGPU::V_LSHRREV_B16_e32:
454 case AMDGPU::V_ASHRREV_I16_e32:
455 case AMDGPU::V_LSHLREV_B16_e32: {
456 // from: v_lshrrev_b16_e32 v1, 8, v0
457 // to SDWA src:v0 src_sel:BYTE_1
458
459 // from: v_ashrrev_i16_e32 v1, 8, v0
460 // to SDWA src:v0 src_sel:BYTE_1 sext:1
461
462 // from: v_lshlrev_b16_e32 v1, 8, v0
463 // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
464 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000465 auto Imm = foldToImm(*Src0);
466 if (!Imm || *Imm != 8)
Sam Koltonf60ad582017-03-21 12:51:34 +0000467 break;
468
469 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
470 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
471
472 if (TRI->isPhysicalRegister(Src1->getReg()) ||
473 TRI->isPhysicalRegister(Dst->getReg()))
474 break;
475
476 if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
477 auto SDWADst =
478 make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
479 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
480 SDWAOperands[&MI] = std::move(SDWADst);
481 ++NumSDWAPatternsFound;
482 } else {
483 auto SDWASrc = make_unique<SDWASrcOperand>(
484 Src1, Dst, BYTE_1, false, false,
485 Opcode == AMDGPU::V_LSHRREV_B16_e32 ? false : true);
486 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
487 SDWAOperands[&MI] = std::move(SDWASrc);
488 ++NumSDWAPatternsFound;
489 }
490 break;
491 }
492
493 case AMDGPU::V_BFE_I32:
494 case AMDGPU::V_BFE_U32: {
495 // e.g.:
496 // from: v_bfe_u32 v1, v0, 8, 8
497 // to SDWA src:v0 src_sel:BYTE_1
498
499 // offset | width | src_sel
500 // ------------------------
501 // 0 | 8 | BYTE_0
502 // 0 | 16 | WORD_0
503 // 0 | 32 | DWORD ?
504 // 8 | 8 | BYTE_1
505 // 16 | 8 | BYTE_2
506 // 16 | 16 | WORD_1
507 // 24 | 8 | BYTE_3
508
509 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000510 auto Offset = foldToImm(*Src1);
511 if (!Offset)
Sam Koltonf60ad582017-03-21 12:51:34 +0000512 break;
Sam Koltonf60ad582017-03-21 12:51:34 +0000513
514 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000515 auto Width = foldToImm(*Src2);
516 if (!Width)
Sam Koltonf60ad582017-03-21 12:51:34 +0000517 break;
Sam Koltonf60ad582017-03-21 12:51:34 +0000518
519 SdwaSel SrcSel = DWORD;
520
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000521 if (*Offset == 0 && *Width == 8)
Sam Koltonf60ad582017-03-21 12:51:34 +0000522 SrcSel = BYTE_0;
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000523 else if (*Offset == 0 && *Width == 16)
Sam Koltonf60ad582017-03-21 12:51:34 +0000524 SrcSel = WORD_0;
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000525 else if (*Offset == 0 && *Width == 32)
Sam Koltonf60ad582017-03-21 12:51:34 +0000526 SrcSel = DWORD;
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000527 else if (*Offset == 8 && *Width == 8)
Sam Koltonf60ad582017-03-21 12:51:34 +0000528 SrcSel = BYTE_1;
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000529 else if (*Offset == 16 && *Width == 8)
Sam Koltonf60ad582017-03-21 12:51:34 +0000530 SrcSel = BYTE_2;
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000531 else if (*Offset == 16 && *Width == 16)
Sam Koltonf60ad582017-03-21 12:51:34 +0000532 SrcSel = WORD_1;
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000533 else if (*Offset == 24 && *Width == 8)
Sam Koltonf60ad582017-03-21 12:51:34 +0000534 SrcSel = BYTE_3;
535 else
536 break;
537
538 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
539 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
540
541 if (TRI->isPhysicalRegister(Src0->getReg()) ||
542 TRI->isPhysicalRegister(Dst->getReg()))
543 break;
544
545 auto SDWASrc = make_unique<SDWASrcOperand>(
546 Src0, Dst, SrcSel, false, false,
547 Opcode == AMDGPU::V_BFE_U32 ? false : true);
548 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
549 SDWAOperands[&MI] = std::move(SDWASrc);
550 ++NumSDWAPatternsFound;
551 break;
552 }
553 case AMDGPU::V_AND_B32_e32: {
554 // e.g.:
555 // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
556 // to SDWA src:v0 src_sel:WORD_0/BYTE_0
557
558 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000559 auto Imm = foldToImm(*Src0);
560 if (!Imm)
Sam Koltonf60ad582017-03-21 12:51:34 +0000561 break;
562
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000563 if (*Imm != 0x0000ffff && *Imm != 0x000000ff)
Sam Koltonf60ad582017-03-21 12:51:34 +0000564 break;
565
566 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
567 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
568
569 if (TRI->isPhysicalRegister(Src1->getReg()) ||
570 TRI->isPhysicalRegister(Dst->getReg()))
571 break;
572
573 auto SDWASrc = make_unique<SDWASrcOperand>(
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000574 Src1, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
Sam Koltonf60ad582017-03-21 12:51:34 +0000575 DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
576 SDWAOperands[&MI] = std::move(SDWASrc);
577 ++NumSDWAPatternsFound;
578 break;
579 }
580 }
581 }
582}
583
// Build the SDWA form of MI, apply all matched operand patterns to it, and
// erase MI on success. Returns true if MI was converted.
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  // Check if this instruction can be converted to SDWA:
  // 1. Does this opcode support SDWA
  if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
    return false;

  // 2. Are all operands - VGPRs
  for (const MachineOperand &Operand : MI.explicit_operands()) {
    if (!Operand.isReg() || !TRI->isVGPR(*MRI, Operand.getReg()))
      return false;
  }

  // Convert to sdwa
  int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);

  // Copy dst, if it is present in original then should also be present in SDWA
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    // Only VOPC (compare) instructions are expected to lack vdst here.
    assert(TII->isVOPC(MI));
  }

  // Copy src0, initialize src0_modifiers. All sdwa instructions have src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(
    Src0 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  } else {
    // Only VOP1 instructions are expected to lack src1 here.
    assert(TII->isVOP1(MI));
  }

  if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Initialize clamp.
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  SDWAInst.addImm(0);

  // Initialize dst_sel and dst_unused if present
  if (Dst) {
    assert(
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1);
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
  }

  // Initialize src0_sel
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);


  // Initialize src1_sel if present
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Apply all matched SDWA operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (!Converted) {
    // No pattern applied; drop the half-built SDWA instruction.
    SDWAInst->eraseFromParent();
    return false;
  }

  DEBUG(dbgs() << "Convert instruction:" << MI
               << "Into:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}
687
688bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
689 const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
690
691 if (!ST.hasSDWA() ||
692 !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9
693 return false;
694 }
695
696 MRI = &MF.getRegInfo();
697 TRI = ST.getRegisterInfo();
698 TII = ST.getInstrInfo();
699
700 std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
701
702 // FIXME: For now we only combine instructions in one basic block
703 for (MachineBasicBlock &MBB : MF) {
704 SDWAOperands.clear();
705 matchSDWAOperands(MBB);
706
707 PotentialMatches.clear();
708 for (auto &OperandPair : SDWAOperands) {
709 auto &Operand = OperandPair.second;
710 MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
711 if (PotentialMI) {
712 PotentialMatches[PotentialMI].push_back(std::move(Operand));
713 }
714 }
715
716 for (auto &PotentialPair : PotentialMatches) {
717 MachineInstr &PotentialMI = *PotentialPair.first;
718 convertToSDWA(PotentialMI, PotentialPair.second);
719 }
720 }
721 return false;
722}