//===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %vreg0, 16, %vreg1
///   V_ADD_I32_e32 %vreg2, %vreg0, %vreg3
///   V_LSHLREV_B32_e32 %vreg4, 16, %vreg2
///
/// Replace:
///   V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include <unordered_map>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

class SDWAOperand;

class SIPeepholeSDWA : public MachineFunctionPass {
private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;

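  // If \p Op is an immediate, or a register that is defined by a foldable
  // copy of an immediate in the same basic block, return that immediate
  // value.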
  Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineBasicBlock &MBB);
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

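// Base class for a matched SDWA pattern. Target is the register operand that
// will appear in the converted SDWA instruction; Replaced is the register
// operand of the original code that Target stands in for.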
class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() {}

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }
  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }
};

using namespace AMDGPU::SDWA;

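// Matched pattern that supplies a source operand of the SDWA instruction,
// together with the sub-dword selection and the abs/neg/sext modifiers to
// apply to it.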
class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
        Neg(Neg_), Sext(Sext_) {}

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods() const;
};

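// Matched pattern that replaces the destination operand of the SDWA
// instruction, together with the dst_sel selection and the dst_unused policy
// to use for it.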
class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}

#ifndef NDEBUG

static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD: OS << "DWORD"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) {
  OS << "SDWA src: " << *Src.getTargetOperand()
     << " src_sel:" << Src.getSrcSel()
     << " abs:" << Src.getAbs() << " neg:" << Src.getNeg()
     << " sext:" << Src.getSext() << '\n';
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) {
  OS << "SDWA dst: " << *Dst.getTargetOperand()
     << " dst_sel:" << Dst.getDstSel()
     << " dst_unused:" << Dst.getDstUnused() << '\n';
  return OS;
}

#endif

static bool isSameBB(const MachineInstr *FirstMI, const MachineInstr *SecondMI) {
  assert(FirstMI && SecondMI);
  return FirstMI->getParent() == SecondMI->getParent();
}

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

static bool isSubregOf(const MachineOperand &SubReg,
                       const MachineOperand &SuperReg,
                       const TargetRegisterInfo *TRI) {

  if (!SuperReg.isReg() || !SubReg.isReg())
    return false;

  if (isSameReg(SuperReg, SubReg))
    return true;

  if (SuperReg.getReg() != SubReg.getReg())
    return false;

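  // Same register, different subregisters: SubReg is contained in SuperReg
  // iff every lane of SubReg's lane mask is also covered by SuperReg's lane
  // mask.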
  LaneBitmask SuperMask = TRI->getSubRegIndexLaneMask(SuperReg.getSubReg());
  LaneBitmask SubMask = TRI->getSubRegIndexLaneMask(SubReg.getSubReg());
  SuperMask |= ~SubMask;
  return SuperMask.all();
}

uint64_t SDWASrcOperand::getSrcMods() const {
  uint64_t Mods = 0;
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0;
    Mods |= Neg ? SISrcMods::NEG : 0;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA src operand, the potential instruction is one that uses the
  // register defined by the parent instruction.
  MachineRegisterInfo *MRI = getMRI();
  MachineOperand *Replaced = getReplacedOperand();
  assert(Replaced->isReg());

  MachineInstr *PotentialMI = nullptr;
  for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) {
    // If this is a use of another subreg of the dst reg then do nothing.
    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
      continue;

    // If there exists a use of dst in another basic block, or a use of a
    // superreg of dst, then we should not combine this operand.
    if (!isSameBB(PotentialMO.getParent(), getParentInst()) ||
        !isSameReg(PotentialMO, *Replaced))
      return nullptr;

    // Check that PotentialMI is the only instruction that uses the dst reg.
    if (PotentialMI == nullptr) {
      PotentialMI = PotentialMO.getParent();
    } else if (PotentialMI != PotentialMO.getParent()) {
      return nullptr;
    }
  }

  return PotentialMI;
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Find the operand in the instruction that matches the replaced operand and
  // replace it with the target operand. Set the corresponding src_sel.

  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && Src->isReg());
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it should be src1.
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
      // operand to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
  }
  copyRegOperand(*Src, *getTargetOperand());
  SrcSel->setImm(getSrcSel());
  SrcMods->setImm(getSrcMods());
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA dst operand, the potential instruction is the one that
  // defines the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();
  MachineOperand *Replaced = getReplacedOperand();
  assert(Replaced->isReg());

  for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) {
    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
      continue;

    if (!isSameBB(getParentInst(), PotentialMO.getParent()) ||
        !isSameReg(*Replaced, PotentialMO))
      return nullptr;

    // Check that ParentMI is the only instruction that uses the replaced
    // register.
    for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) {
      if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) &&
          UseMO.getParent() != ParentMI) {
        return nullptr;
      }
    }

    // Due to SSA this should be the only def of the replaced register, so
    // return it.
    return PotentialMO.getParent();
  }

  return nullptr;
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD.
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction by register definition.
  getParentInst()->eraseFromParent();
  return true;
}

Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %vreg1<def> = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst) || !isSameBB(Op.getParent(), DefInst))
        return None;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return None;

      return Copied.getImm();
    }
  }

  return None;
}

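// Scan one basic block and record every shift/BFE/AND instruction that can be
// folded into an SDWA source or destination selection. Matches are keyed by
// the matched instruction in the SDWAOperands map.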
void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    unsigned Opcode = MI.getOpcode();
    switch (Opcode) {
    case AMDGPU::V_LSHRREV_B32_e32:
    case AMDGPU::V_ASHRREV_I32_e32:
    case AMDGPU::V_LSHLREV_B32_e32: {
      // from: v_lshrrev_b32_e32 v1, 16/24, v0
      // to SDWA src:v0 src_sel:WORD_1/BYTE_3

      // from: v_ashrrev_i32_e32 v1, 16/24, v0
      // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

      // from: v_lshlrev_b32_e32 v1, 16/24, v0
      // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      auto Imm = foldToImm(*Src0);
      if (!Imm)
        break;

      if (*Imm != 16 && *Imm != 24)
        break;

      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      if (TRI->isPhysicalRegister(Src1->getReg()) ||
          TRI->isPhysicalRegister(Dst->getReg()))
        break;

      if (Opcode == AMDGPU::V_LSHLREV_B32_e32) {
        auto SDWADst = make_unique<SDWADstOperand>(
            Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
        SDWAOperands[&MI] = std::move(SDWADst);
        ++NumSDWAPatternsFound;
      } else {
        auto SDWASrc = make_unique<SDWASrcOperand>(
            Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
            Opcode == AMDGPU::V_LSHRREV_B32_e32 ? false : true);
        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
        SDWAOperands[&MI] = std::move(SDWASrc);
        ++NumSDWAPatternsFound;
      }
      break;
    }

    case AMDGPU::V_LSHRREV_B16_e32:
    case AMDGPU::V_ASHRREV_I16_e32:
    case AMDGPU::V_LSHLREV_B16_e32: {
      // from: v_lshrrev_b16_e32 v1, 8, v0
      // to SDWA src:v0 src_sel:BYTE_1

      // from: v_ashrrev_i16_e32 v1, 8, v0
      // to SDWA src:v0 src_sel:BYTE_1 sext:1

      // from: v_lshlrev_b16_e32 v1, 8, v0
      // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      auto Imm = foldToImm(*Src0);
      if (!Imm || *Imm != 8)
        break;

      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

      if (TRI->isPhysicalRegister(Src1->getReg()) ||
          TRI->isPhysicalRegister(Dst->getReg()))
        break;

      if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
        auto SDWADst =
            make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
        SDWAOperands[&MI] = std::move(SDWADst);
        ++NumSDWAPatternsFound;
      } else {
        auto SDWASrc = make_unique<SDWASrcOperand>(
            Src1, Dst, BYTE_1, false, false,
            Opcode == AMDGPU::V_LSHRREV_B16_e32 ? false : true);
        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
        SDWAOperands[&MI] = std::move(SDWASrc);
        ++NumSDWAPatternsFound;
      }
      break;
    }

    case AMDGPU::V_BFE_I32:
    case AMDGPU::V_BFE_U32: {
      // e.g.:
      // from: v_bfe_u32 v1, v0, 8, 8
      // to SDWA src:v0 src_sel:BYTE_1

      // offset | width | src_sel
      // ------------------------
      // 0      | 8     | BYTE_0
      // 0      | 16    | WORD_0
      // 0      | 32    | DWORD ?
      // 8      | 8     | BYTE_1
      // 16     | 8     | BYTE_2
      // 16     | 16    | WORD_1
      // 24     | 8     | BYTE_3

      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      auto Offset = foldToImm(*Src1);
      if (!Offset)
        break;

      MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
      auto Width = foldToImm(*Src2);
      if (!Width)
        break;

      SdwaSel SrcSel = DWORD;

      if (*Offset == 0 && *Width == 8)
        SrcSel = BYTE_0;
      else if (*Offset == 0 && *Width == 16)
        SrcSel = WORD_0;
      else if (*Offset == 0 && *Width == 32)
        SrcSel = DWORD;
      else if (*Offset == 8 && *Width == 8)
        SrcSel = BYTE_1;
      else if (*Offset == 16 && *Width == 8)
        SrcSel = BYTE_2;
      else if (*Offset == 16 && *Width == 16)
        SrcSel = WORD_1;
      else if (*Offset == 24 && *Width == 8)
        SrcSel = BYTE_3;
      else
        break;

      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

      if (TRI->isPhysicalRegister(Src0->getReg()) ||
          TRI->isPhysicalRegister(Dst->getReg()))
        break;

      auto SDWASrc = make_unique<SDWASrcOperand>(
          Src0, Dst, SrcSel, false, false,
          Opcode == AMDGPU::V_BFE_U32 ? false : true);
      DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
      SDWAOperands[&MI] = std::move(SDWASrc);
      ++NumSDWAPatternsFound;
      break;
    }
    case AMDGPU::V_AND_B32_e32: {
      // e.g.:
      // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
      // to SDWA src:v0 src_sel:WORD_0/BYTE_0

      MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      auto Imm = foldToImm(*Src0);
      if (!Imm)
        break;

      if (*Imm != 0x0000ffff && *Imm != 0x000000ff)
        break;

      MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

      if (TRI->isPhysicalRegister(Src1->getReg()) ||
          TRI->isPhysicalRegister(Dst->getReg()))
        break;

      auto SDWASrc = make_unique<SDWASrcOperand>(
          Src1, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
      DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
      SDWAOperands[&MI] = std::move(SDWASrc);
      ++NumSDWAPatternsFound;
      break;
    }
    }
  }
}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  // Check if this instruction can be converted to SDWA:
  // 1. Does this opcode support SDWA?
  if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
    return false;

  // 2. Are all operands VGPRs?
  for (const MachineOperand &Operand : MI.explicit_operands()) {
    if (!Operand.isReg() || !TRI->isVGPR(*MRI, Operand.getReg()))
      return false;
  }

  // Convert to SDWA.
  int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

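  // The SDWA pseudo expects its operands in a fixed order: vdst (if present),
  // src0_modifiers, src0, src1_modifiers and src1 (if present), src2 (for
  // v_mac only), clamp, dst_sel, dst_unused, src0_sel and src1_sel. Append
  // them in that order below.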
  // Create SDWA version of instruction MI and initialize its operands.
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);

  // Copy dst; if it is present in the original then it should also be present
  // in the SDWA instruction.
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    assert(TII->isVOPC(MI));
  }

  // Copy src0, initialize src0_modifiers. All sdwa instructions have src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here).
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(
      Src0 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  } else {
    assert(TII->isVOP1(MI));
  }

  if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst.
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Initialize clamp.
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  SDWAInst.addImm(0);

  // Initialize dst_sel and dst_unused if present.
  if (Dst) {
    assert(
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 &&
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1);
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
  }

  // Initialize src0_sel.
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  // Initialize src1_sel if present.
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Apply all SDWA operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (!Converted) {
    SDWAInst->eraseFromParent();
    return false;
  }

  DEBUG(dbgs() << "Convert instruction:" << MI
               << "Into:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

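// Driver: for each basic block, first collect candidate SDWA patterns
// (matchSDWAOperands), then group the matched operands by the instruction
// they would apply to and try to rewrite that instruction into its SDWA form.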
bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

  if (!ST.hasSDWA() ||
      !AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9
    return false;
  }

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;

  // FIXME: For now we only combine instructions in one basic block.
  for (MachineBasicBlock &MBB : MF) {
    SDWAOperands.clear();
    matchSDWAOperands(MBB);

    PotentialMatches.clear();
    for (auto &OperandPair : SDWAOperands) {
      auto &Operand = OperandPair.second;
      MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
      if (PotentialMI) {
        PotentialMatches[PotentialMI].push_back(std::move(Operand));
      }
    }

    for (auto &PotentialPair : PotentialMatches) {
      MachineInstr &PotentialMI = *PotentialPair.first;
      convertToSDWA(PotentialMI, PotentialPair.second);
    }
  }
  return false;
}