blob: 6f9d7522872888cec8131e76ed294bec206129fd [file] [log] [blame]
Eugene Zelenko59e12822017-08-08 00:47:13 +00001//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
Sam Koltonf60ad582017-03-21 12:51:34 +00002//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file This pass tries to apply several peephole SDWA patterns.
11///
12/// E.g. original:
Francis Visoiu Mistrih93ef1452017-11-30 12:12:19 +000013/// V_LSHRREV_B32_e32 %0, 16, %1
14/// V_ADD_I32_e32 %2, %0, %3
15/// V_LSHLREV_B32_e32 %4, 16, %2
Sam Koltonf60ad582017-03-21 12:51:34 +000016///
17/// Replace:
Francis Visoiu Mistrih93ef1452017-11-30 12:12:19 +000018/// V_ADD_I32_sdwa %4, %1, %3
Sam Koltonf60ad582017-03-21 12:51:34 +000019/// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
20///
21//===----------------------------------------------------------------------===//
22
Sam Koltonf60ad582017-03-21 12:51:34 +000023#include "AMDGPU.h"
24#include "AMDGPUSubtarget.h"
25#include "SIDefines.h"
26#include "SIInstrInfo.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000027#include "SIRegisterInfo.h"
28#include "Utils/AMDGPUBaseInfo.h"
29#include "llvm/ADT/None.h"
30#include "llvm/ADT/Optional.h"
Sam Koltonf60ad582017-03-21 12:51:34 +000031#include "llvm/ADT/STLExtras.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000032#include "llvm/ADT/SmallVector.h"
Chandler Carruth6bda14b2017-06-06 11:49:48 +000033#include "llvm/ADT/Statistic.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000034#include "llvm/CodeGen/MachineBasicBlock.h"
35#include "llvm/CodeGen/MachineFunction.h"
Sam Koltonf60ad582017-03-21 12:51:34 +000036#include "llvm/CodeGen/MachineFunctionPass.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000037#include "llvm/CodeGen/MachineInstr.h"
Sam Koltonf60ad582017-03-21 12:51:34 +000038#include "llvm/CodeGen/MachineInstrBuilder.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000039#include "llvm/CodeGen/MachineOperand.h"
40#include "llvm/CodeGen/MachineRegisterInfo.h"
David Blaikieb3bde2e2017-11-17 01:07:10 +000041#include "llvm/CodeGen/TargetRegisterInfo.h"
Nico Weber432a3882018-04-30 14:59:11 +000042#include "llvm/Config/llvm-config.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000043#include "llvm/MC/LaneBitmask.h"
44#include "llvm/MC/MCInstrDesc.h"
45#include "llvm/Pass.h"
46#include "llvm/Support/Debug.h"
47#include "llvm/Support/raw_ostream.h"
Eugene Zelenko59e12822017-08-08 00:47:13 +000048#include <algorithm>
49#include <cassert>
50#include <cstdint>
51#include <memory>
Sam Koltonf60ad582017-03-21 12:51:34 +000052#include <unordered_map>
53
using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

// Pass statistics, reported by -stats.
STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instruction converted to SDWA.");
61
62namespace {
63
64class SDWAOperand;
Sam Kolton5f7f32c2017-12-04 16:22:32 +000065class SDWADstOperand;
Sam Koltonf60ad582017-03-21 12:51:34 +000066
67class SIPeepholeSDWA : public MachineFunctionPass {
Sam Koltonebfdaf72017-05-18 12:12:03 +000068public:
Eugene Zelenko59e12822017-08-08 00:47:13 +000069 using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
Sam Koltonebfdaf72017-05-18 12:12:03 +000070
Sam Koltonf60ad582017-03-21 12:51:34 +000071private:
72 MachineRegisterInfo *MRI;
73 const SIRegisterInfo *TRI;
74 const SIInstrInfo *TII;
75
76 std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
Sam Koltonebfdaf72017-05-18 12:12:03 +000077 std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
Stanislav Mekhanoshin56ea4882017-05-30 16:49:24 +000078 SmallVector<MachineInstr *, 8> ConvertedInstructions;
Sam Koltonf60ad582017-03-21 12:51:34 +000079
Sam Kolton27e0f8b2017-03-31 11:42:43 +000080 Optional<int64_t> foldToImm(const MachineOperand &Op) const;
81
Sam Koltonf60ad582017-03-21 12:51:34 +000082public:
83 static char ID;
84
Sam Koltonf60ad582017-03-21 12:51:34 +000085 SIPeepholeSDWA() : MachineFunctionPass(ID) {
86 initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
87 }
88
89 bool runOnMachineFunction(MachineFunction &MF) override;
Matt Arsenault9c2f3c42018-02-08 22:46:41 +000090 void matchSDWAOperands(MachineBasicBlock &MBB);
Sam Kolton5f7f32c2017-12-04 16:22:32 +000091 std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
Sam Kolton3c4933f2017-06-22 06:26:41 +000092 bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const;
Sam Koltonf60ad582017-03-21 12:51:34 +000093 bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
Sam Kolton3c4933f2017-06-22 06:26:41 +000094 void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const;
Sam Koltonf60ad582017-03-21 12:51:34 +000095
96 StringRef getPassName() const override { return "SI Peephole SDWA"; }
97
98 void getAnalysisUsage(AnalysisUsage &AU) const override {
99 AU.setPreservesCFG();
100 MachineFunctionPass::getAnalysisUsage(AU);
101 }
102};
103
/// Abstract base for a matched SDWA pattern. Links the operand that the
/// converted SDWA instruction will use (Target) with the operand it replaces
/// (Replaced). Both must be register operands.
class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  // Returns the instruction that could be converted to SDWA form using this
  // operand, or nullptr if there is none.
  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
  // Applies this operand to the SDWA instruction MI; returns false when the
  // conversion turns out to be illegal.
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  // MI -> MBB -> MF -> MachineRegisterInfo.
  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};
134
135using namespace AMDGPU::SDWA;
136
/// Matched pattern that becomes a source operand of an SDWA instruction:
/// a source register together with its byte/word selector and modifiers.
class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel; // Portion of the source register that is read.
  bool Abs;       // Float abs modifier.
  bool Neg;       // Float neg modifier.
  bool Sext;      // Integer sign-extend modifier.

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  // Combines this operand's modifiers with the src modifiers already present
  // on SrcOp's instruction.
  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
166
/// Matched pattern that becomes the destination of an SDWA instruction:
/// the dst register plus the dst_sel and dst_unused controls.
class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;  // Portion of the destination register that is written.
  DstUnused DstUn; // Treatment of the bits outside DstSel.

public:

  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
    : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
188
/// SDWA destination that keeps the unselected bits of the dst register from
/// another value (dst_unused:UNUSED_PRESERVE), matched from v_or_b32 patterns.
class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve; // Register whose bits fill the unused dst lanes.

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
207
Eugene Zelenko59e12822017-08-08 00:47:13 +0000208} // end anonymous namespace
Sam Koltonf60ad582017-03-21 12:51:34 +0000209
// Register the pass in LLVM's pass registry under DEBUG_TYPE.
INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

// Pass identity exposed to the AMDGPU target pass configuration.
char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
215
// Factory used by the AMDGPU target machine to instantiate this pass.
FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}
219
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000220
221#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
Matt Arsenaultc24d5e22018-02-08 22:46:38 +0000222static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
Sam Koltonf60ad582017-03-21 12:51:34 +0000223 switch(Sel) {
224 case BYTE_0: OS << "BYTE_0"; break;
225 case BYTE_1: OS << "BYTE_1"; break;
226 case BYTE_2: OS << "BYTE_2"; break;
227 case BYTE_3: OS << "BYTE_3"; break;
228 case WORD_0: OS << "WORD_0"; break;
229 case WORD_1: OS << "WORD_1"; break;
230 case DWORD: OS << "DWORD"; break;
231 }
232 return OS;
233}
234
235static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
236 switch(Un) {
237 case UNUSED_PAD: OS << "UNUSED_PAD"; break;
238 case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
239 case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
240 }
241 return OS;
242}
243
// Dispatches streaming to the concrete operand's virtual print().
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
248
// Debug dump of a matched source operand: register, selector, modifiers.
LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000256
// Debug dump of a matched destination operand: register, dst_sel, dst_unused.
LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}
263
// Debug dump of a preserve-destination operand, including the preserved reg.
LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}
270
Sam Koltonf60ad582017-03-21 12:51:34 +0000271#endif
272
Sam Koltonf60ad582017-03-21 12:51:34 +0000273static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
274 assert(To.isReg() && From.isReg());
275 To.setReg(From.getReg());
276 To.setSubReg(From.getSubReg());
277 To.setIsUndef(From.isUndef());
278 if (To.isUse()) {
279 To.setIsKill(From.isKill());
280 } else {
281 To.setIsDead(From.isDead());
282 }
283}
284
285static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
286 return LHS.isReg() &&
287 RHS.isReg() &&
288 LHS.getReg() == RHS.getReg() &&
289 LHS.getSubReg() == RHS.getSubReg();
290}
291
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000292static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
293 const MachineRegisterInfo *MRI) {
294 if (!Reg->isReg() || !Reg->isDef())
295 return nullptr;
Sam Kolton549c89d2017-06-21 08:53:38 +0000296
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000297 MachineOperand *ResMO = nullptr;
298 for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
299 // If there exist use of subreg of Reg then return nullptr
300 if (!isSameReg(UseMO, *Reg))
301 return nullptr;
Sam Koltonf60ad582017-03-21 12:51:34 +0000302
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000303 // Check that there is only one instruction that uses Reg
304 if (!ResMO) {
305 ResMO = &UseMO;
306 } else if (ResMO->getParent() != UseMO.getParent()) {
307 return nullptr;
308 }
309 }
Sam Koltonf60ad582017-03-21 12:51:34 +0000310
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000311 return ResMO;
312}
Sam Koltonf60ad582017-03-21 12:51:34 +0000313
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000314static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
315 const MachineRegisterInfo *MRI) {
316 if (!Reg->isReg())
317 return nullptr;
318
319 MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
320 if (!DefInstr)
321 return nullptr;
322
323 for (auto &DefMO : DefInstr->defs()) {
324 if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
325 return &DefMO;
326 }
327
Matt Arsenault8ae38bc2017-12-05 20:32:01 +0000328 // Ignore implicit defs.
329 return nullptr;
Sam Koltonf60ad582017-03-21 12:51:34 +0000330}
331
Stanislav Mekhanoshin03306602017-06-03 17:39:47 +0000332uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
333 const MachineOperand *SrcOp) const {
Sam Koltonf60ad582017-03-21 12:51:34 +0000334 uint64_t Mods = 0;
Stanislav Mekhanoshin03306602017-06-03 17:39:47 +0000335 const auto *MI = SrcOp->getParent();
336 if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
337 if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
338 Mods = Mod->getImm();
339 }
340 } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
341 if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
342 Mods = Mod->getImm();
343 }
344 }
Sam Koltonf60ad582017-03-21 12:51:34 +0000345 if (Abs || Neg) {
346 assert(!Sext &&
347 "Float and integer src modifiers can't be set simulteniously");
348 Mods |= Abs ? SISrcMods::ABS : 0;
Stanislav Mekhanoshin03306602017-06-03 17:39:47 +0000349 Mods ^= Neg ? SISrcMods::NEG : 0;
Sam Koltonf60ad582017-03-21 12:51:34 +0000350 } else if (Sext) {
351 Mods |= SISrcMods::SEXT;
352 }
353
354 return Mods;
355}
356
357MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
358 // For SDWA src operand potential instruction is one that use register
359 // defined by parent instruction
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000360 MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
361 if (!PotentialMO)
362 return nullptr;
Sam Koltonf60ad582017-03-21 12:51:34 +0000363
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000364 return PotentialMO->getParent();
Sam Koltonf60ad582017-03-21 12:51:34 +0000365}
366
// Rewrites the src slot of the already-SDWA instruction MI that currently
// reads the replaced register so it reads the target register instead, and
// installs this operand's src_sel / src_modifiers. Returns false when the
// match turns out to hit an illegal slot (e.g. src2 of v_mac, or a tied
// preserve operand with incompatible selectors).
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Find operand in instruction that matches source operand and replace it
  // with target operand. Set corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1.
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for
      // UNUSED_PRESERVE, in which case we can either
      // abandon the peephole attempt, or if legal we can
      // copy the target operand into the tied slot
      // if the preserve operation will effectively cause the same
      // result by overwriting the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
        TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          // Redirect Src to the operand tied to vdst; sel/mods do not apply
          // to a preserve source, so drop them.
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src.
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
      // src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  // The target register now has an extra user; its old kill flag is stale.
  getTargetOperand()->setIsKill(false);
  return true;
}
438
439MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
440 // For SDWA dst operand potential instruction is one that defines register
441 // that this operand uses
442 MachineRegisterInfo *MRI = getMRI();
443 MachineInstr *ParentMI = getParentInst();
Sam Koltonf60ad582017-03-21 12:51:34 +0000444
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000445 MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
446 if (!PotentialMO)
447 return nullptr;
Sam Koltonf60ad582017-03-21 12:51:34 +0000448
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000449 // Check that ParentMI is the only instruction that uses replaced register
450 for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
451 if (&UseInst != ParentMI)
Sam Koltonf60ad582017-03-21 12:51:34 +0000452 return nullptr;
Sam Koltonf60ad582017-03-21 12:51:34 +0000453 }
454
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000455 return PotentialMO->getParent();
Sam Koltonf60ad582017-03-21 12:51:34 +0000456}
457
// Rewrites MI's vdst to the target register and installs this operand's
// dst_sel / dst_unused, then erases the pattern (parent) instruction whose
// register definition MI now supplies. Returns false for v_mac with a
// non-DWORD dst_sel, which is not allowed.
bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD.
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove original instruction because it would conflict with our new
  // instruction by register definition.
  getParentInst()->eraseFromParent();
  return true;
}
485
// Converts MI into a preserve-dst SDWA: moves MI up to the position of the
// matched v_or_b32, ties the preserved register to vdst as an implicit use,
// then finishes via the base-class conversion (which also erases v_or_b32).
bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter problem with use of killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32.
  auto MBB = MI.getParent();
  MBB->remove(&MI);
  MBB->insert(getParentInst(), &MI);

  // Add implicit use of preserved register.
  MachineInstrBuilder MIB(*MBB->getParent(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to implicit use (the operand just appended above).
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32.
  return SDWADstOperand::convertToSDWA(MI, TII);
}
515
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000516Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
517 if (Op.isImm()) {
518 return Op.getImm();
519 }
520
521 // If this is not immediate then it can be copy of immediate value, e.g.:
Francis Visoiu Mistriha8a83d12017-12-07 10:40:31 +0000522 // %1 = S_MOV_B32 255;
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000523 if (Op.isReg()) {
524 for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
525 if (!isSameReg(Op, Def))
526 continue;
527
528 const MachineInstr *DefInst = Def.getParent();
Sam Koltonaff83412017-04-12 09:36:05 +0000529 if (!TII->isFoldableCopy(*DefInst))
Sam Kolton27e0f8b2017-03-31 11:42:43 +0000530 return None;
531
532 const MachineOperand &Copied = DefInst->getOperand(1);
533 if (!Copied.isImm())
534 return None;
535
536 return Copied.getImm();
537 }
538 }
539
540 return None;
541}
542
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000543std::unique_ptr<SDWAOperand>
544SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
545 unsigned Opcode = MI.getOpcode();
546 switch (Opcode) {
547 case AMDGPU::V_LSHRREV_B32_e32:
548 case AMDGPU::V_ASHRREV_I32_e32:
549 case AMDGPU::V_LSHLREV_B32_e32:
550 case AMDGPU::V_LSHRREV_B32_e64:
551 case AMDGPU::V_ASHRREV_I32_e64:
552 case AMDGPU::V_LSHLREV_B32_e64: {
553 // from: v_lshrrev_b32_e32 v1, 16/24, v0
554 // to SDWA src:v0 src_sel:WORD_1/BYTE_3
555
556 // from: v_ashrrev_i32_e32 v1, 16/24, v0
557 // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
558
559 // from: v_lshlrev_b32_e32 v1, 16/24, v0
560 // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
561 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
562 auto Imm = foldToImm(*Src0);
563 if (!Imm)
564 break;
565
566 if (*Imm != 16 && *Imm != 24)
567 break;
568
569 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
570 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
571 if (TRI->isPhysicalRegister(Src1->getReg()) ||
572 TRI->isPhysicalRegister(Dst->getReg()))
573 break;
574
575 if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
576 Opcode == AMDGPU::V_LSHLREV_B32_e64) {
577 return make_unique<SDWADstOperand>(
578 Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
579 } else {
580 return make_unique<SDWASrcOperand>(
581 Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
582 Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
583 Opcode != AMDGPU::V_LSHRREV_B32_e64);
584 }
585 break;
586 }
587
588 case AMDGPU::V_LSHRREV_B16_e32:
589 case AMDGPU::V_ASHRREV_I16_e32:
590 case AMDGPU::V_LSHLREV_B16_e32:
591 case AMDGPU::V_LSHRREV_B16_e64:
592 case AMDGPU::V_ASHRREV_I16_e64:
593 case AMDGPU::V_LSHLREV_B16_e64: {
594 // from: v_lshrrev_b16_e32 v1, 8, v0
595 // to SDWA src:v0 src_sel:BYTE_1
596
597 // from: v_ashrrev_i16_e32 v1, 8, v0
598 // to SDWA src:v0 src_sel:BYTE_1 sext:1
599
600 // from: v_lshlrev_b16_e32 v1, 8, v0
601 // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
602 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
603 auto Imm = foldToImm(*Src0);
604 if (!Imm || *Imm != 8)
605 break;
606
607 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
608 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
609
610 if (TRI->isPhysicalRegister(Src1->getReg()) ||
611 TRI->isPhysicalRegister(Dst->getReg()))
612 break;
613
614 if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
615 Opcode == AMDGPU::V_LSHLREV_B16_e64) {
616 return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
617 } else {
618 return make_unique<SDWASrcOperand>(
619 Src1, Dst, BYTE_1, false, false,
620 Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
621 Opcode != AMDGPU::V_LSHRREV_B16_e64);
622 }
623 break;
624 }
625
626 case AMDGPU::V_BFE_I32:
627 case AMDGPU::V_BFE_U32: {
628 // e.g.:
629 // from: v_bfe_u32 v1, v0, 8, 8
630 // to SDWA src:v0 src_sel:BYTE_1
631
632 // offset | width | src_sel
633 // ------------------------
634 // 0 | 8 | BYTE_0
635 // 0 | 16 | WORD_0
636 // 0 | 32 | DWORD ?
637 // 8 | 8 | BYTE_1
638 // 16 | 8 | BYTE_2
639 // 16 | 16 | WORD_1
640 // 24 | 8 | BYTE_3
641
642 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
643 auto Offset = foldToImm(*Src1);
644 if (!Offset)
645 break;
646
647 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
648 auto Width = foldToImm(*Src2);
649 if (!Width)
650 break;
651
652 SdwaSel SrcSel = DWORD;
653
654 if (*Offset == 0 && *Width == 8)
655 SrcSel = BYTE_0;
656 else if (*Offset == 0 && *Width == 16)
657 SrcSel = WORD_0;
658 else if (*Offset == 0 && *Width == 32)
659 SrcSel = DWORD;
660 else if (*Offset == 8 && *Width == 8)
661 SrcSel = BYTE_1;
662 else if (*Offset == 16 && *Width == 8)
663 SrcSel = BYTE_2;
664 else if (*Offset == 16 && *Width == 16)
665 SrcSel = WORD_1;
666 else if (*Offset == 24 && *Width == 8)
667 SrcSel = BYTE_3;
668 else
669 break;
670
671 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
672 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
673
674 if (TRI->isPhysicalRegister(Src0->getReg()) ||
675 TRI->isPhysicalRegister(Dst->getReg()))
676 break;
677
678 return make_unique<SDWASrcOperand>(
679 Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
680 }
681
682 case AMDGPU::V_AND_B32_e32:
683 case AMDGPU::V_AND_B32_e64: {
684 // e.g.:
685 // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
686 // to SDWA src:v0 src_sel:WORD_0/BYTE_0
687
688 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
689 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
690 auto ValSrc = Src1;
691 auto Imm = foldToImm(*Src0);
692
693 if (!Imm) {
694 Imm = foldToImm(*Src1);
695 ValSrc = Src0;
696 }
697
698 if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
699 break;
700
701 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
702
Nicolai Haehnlecbebba42018-04-23 13:06:03 +0000703 if (TRI->isPhysicalRegister(ValSrc->getReg()) ||
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000704 TRI->isPhysicalRegister(Dst->getReg()))
705 break;
706
707 return make_unique<SDWASrcOperand>(
708 ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
709 }
710
711 case AMDGPU::V_OR_B32_e32:
712 case AMDGPU::V_OR_B32_e64: {
713 // Patterns for dst_unused:UNUSED_PRESERVE.
714 // e.g., from:
715 // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
716 // src1_sel:WORD_1 src2_sel:WORD1
717 // v_add_f16_e32 v3, v1, v2
718 // v_or_b32_e32 v4, v0, v3
719 // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3
720
721 // Check if one of operands of v_or_b32 is SDWA instruction
722 using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
723 auto CheckOROperandsForSDWA =
724 [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
725 if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
726 return CheckRetType(None);
727
728 MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
729 if (!Op1Def)
730 return CheckRetType(None);
731
732 MachineInstr *Op1Inst = Op1Def->getParent();
733 if (!TII->isSDWA(*Op1Inst))
734 return CheckRetType(None);
735
736 MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
737 if (!Op2Def)
738 return CheckRetType(None);
739
740 return CheckRetType(std::make_pair(Op1Def, Op2Def));
741 };
742
743 MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
744 MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
745 assert(OrSDWA && OrOther);
746 auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
747 if (!Res) {
748 OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
749 OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
750 assert(OrSDWA && OrOther);
751 Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
752 if (!Res)
753 break;
754 }
755
756 MachineOperand *OrSDWADef = Res->first;
757 MachineOperand *OrOtherDef = Res->second;
758 assert(OrSDWADef && OrOtherDef);
759
760 MachineInstr *SDWAInst = OrSDWADef->getParent();
761 MachineInstr *OtherInst = OrOtherDef->getParent();
762
763 // Check that OtherInstr is actually bitwise compatible with SDWAInst = their
764 // destination patterns don't overlap. Compatible instruction can be either
765 // regular instruction with compatible bitness or SDWA instruction with
766 // correct dst_sel
767 // SDWAInst | OtherInst bitness / OtherInst dst_sel
768 // -----------------------------------------------------
769 // DWORD | no / no
770 // WORD_0 | no / BYTE_2/3, WORD_1
771 // WORD_1 | 8/16-bit instructions / BYTE_0/1, WORD_0
772 // BYTE_0 | no / BYTE_1/2/3, WORD_1
773 // BYTE_1 | 8-bit / BYTE_0/2/3, WORD_1
774 // BYTE_2 | 8/16-bit / BYTE_0/1/3. WORD_0
775 // BYTE_3 | 8/16/24-bit / BYTE_0/1/2, WORD_0
776 // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
777 // but v_add_f32 is not.
778
779 // TODO: add support for non-SDWA instructions as OtherInst.
780 // For now this only works with SDWA instructions. For regular instructions
Michael Bedy80cf9ff2018-03-11 03:27:50 +0000781 // there is no way to determine if the instruction writes only 8/16/24-bit
782 // out of full register size and all registers are at min 32-bit wide.
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000783 if (!TII->isSDWA(*OtherInst))
784 break;
785
786 SdwaSel DstSel = static_cast<SdwaSel>(
787 TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));;
788 SdwaSel OtherDstSel = static_cast<SdwaSel>(
789 TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));
790
791 bool DstSelAgree = false;
792 switch (DstSel) {
793 case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
794 (OtherDstSel == BYTE_3) ||
795 (OtherDstSel == WORD_1));
796 break;
797 case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
798 (OtherDstSel == BYTE_1) ||
799 (OtherDstSel == WORD_0));
800 break;
801 case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
802 (OtherDstSel == BYTE_2) ||
803 (OtherDstSel == BYTE_3) ||
804 (OtherDstSel == WORD_1));
805 break;
806 case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
807 (OtherDstSel == BYTE_2) ||
808 (OtherDstSel == BYTE_3) ||
809 (OtherDstSel == WORD_1));
810 break;
811 case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
812 (OtherDstSel == BYTE_1) ||
813 (OtherDstSel == BYTE_3) ||
814 (OtherDstSel == WORD_0));
815 break;
816 case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
817 (OtherDstSel == BYTE_1) ||
818 (OtherDstSel == BYTE_2) ||
819 (OtherDstSel == WORD_0));
820 break;
821 default: DstSelAgree = false;
822 }
823
824 if (!DstSelAgree)
825 break;
826
827 // Also OtherInst dst_unused should be UNUSED_PAD
828 DstUnused OtherDstUnused = static_cast<DstUnused>(
829 TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
830 if (OtherDstUnused != DstUnused::UNUSED_PAD)
831 break;
832
833 // Create DstPreserveOperand
834 MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
835 assert(OrDst && OrDst->isReg());
836
837 return make_unique<SDWADstPreserveOperand>(
838 OrDst, OrSDWADef, OrOtherDef, DstSel);
839
840 }
841 }
842
843 return std::unique_ptr<SDWAOperand>(nullptr);
844}
845
Matt Arsenault9c2f3c42018-02-08 22:46:41 +0000846void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
847 for (MachineInstr &MI : MBB) {
848 if (auto Operand = matchSDWAOperand(MI)) {
Nicola Zaghend34e60c2018-05-14 12:53:11 +0000849 LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
Matt Arsenault9c2f3c42018-02-08 22:46:41 +0000850 SDWAOperands[&MI] = std::move(Operand);
851 ++NumSDWAPatternsFound;
Sam Koltonf60ad582017-03-21 12:51:34 +0000852 }
853 }
854}
855
Sam Kolton3c4933f2017-06-22 06:26:41 +0000856bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
857 const SISubtarget &ST) const {
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000858 // Check if this is already an SDWA instruction
859 unsigned Opc = MI.getOpcode();
860 if (TII->isSDWA(Opc))
861 return true;
862
Stanislav Mekhanoshin56ea4882017-05-30 16:49:24 +0000863 // Check if this instruction has opcode that supports SDWA
Sam Kolton3c4933f2017-06-22 06:26:41 +0000864 if (AMDGPU::getSDWAOp(Opc) == -1)
865 Opc = AMDGPU::getVOPe32(Opc);
866
Sam Kolton5f7f32c2017-12-04 16:22:32 +0000867 if (AMDGPU::getSDWAOp(Opc) == -1)
Sam Kolton3c4933f2017-06-22 06:26:41 +0000868 return false;
869
870 if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
871 return false;
872
873 if (TII->isVOPC(Opc)) {
874 if (!ST.hasSDWASdst()) {
875 const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
876 if (SDst && SDst->getReg() != AMDGPU::VCC)
877 return false;
878 }
879
Sam Koltona179d252017-06-27 15:02:23 +0000880 if (!ST.hasSDWAOutModsVOPC() &&
881 (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
882 TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
Sam Kolton549c89d2017-06-21 08:53:38 +0000883 return false;
884
Sam Koltona179d252017-06-27 15:02:23 +0000885 } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
886 !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
Sam Kolton3c4933f2017-06-22 06:26:41 +0000887 return false;
Sam Kolton549c89d2017-06-21 08:53:38 +0000888 }
Sam Kolton3c4933f2017-06-22 06:26:41 +0000889
890 if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
891 Opc == AMDGPU::V_MAC_F32_e32))
892 return false;
893
Dmitry Preobrazhensky4c45e6f2018-04-16 12:41:38 +0000894 // FIXME: has SDWA but require handling of implicit VCC use
895 if (Opc == AMDGPU::V_CNDMASK_B32_e32)
896 return false;
897
Sam Kolton3c4933f2017-06-22 06:26:41 +0000898 return true;
Sam Koltonebfdaf72017-05-18 12:12:03 +0000899}
900
// Rewrite \p MI into its SDWA form and fold every operand pattern in
// \p SDWAOperands into the new instruction. Returns true (and erases MI) on
// success; returns false and deletes the half-built SDWA instruction if no
// operand pattern could actually be applied.
//
// NOTE: operands are appended in a fixed sequence (dst, src0_modifiers, src0,
// src1_modifiers, src1, src2, clamp, omod, dst_sel, dst_unused, src0_sel,
// src1_sel) — this order must match the SDWA opcode's MCInstrDesc layout, so
// do not reorder the SDWAInst.add* calls below.
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {

  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Pick the SDWA opcode: keep it if MI is already SDWA, otherwise map the
  // opcode (or, failing that, its VOP32 form) to the SDWA variant.
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  // Guaranteed by isConvertibleToSDWA(), which gates every caller.
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);

  // Copy dst; if it is present in the original it should also be present in
  // SDWA. Prefer vdst, fall back to sdst, and otherwise define VCC (the
  // implicit carry/compare result).
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst &&
           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.addReg(AMDGPU::VCC, RegState::Define);
  }

  // Copy src0, initialize src0_modifiers. All sdwa instructions have src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here).
  // Modifiers are appended BEFORE the source to match the operand layout.
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(
    Src0 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise (all SDWA opcodes have it).
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed (only some SDWA
  // opcodes carry an omod operand).
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed (the source MI
  // only has one if it was already SDWA).
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in its SDWA
    // form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    // Re-tie the preserved source to vdst on the new instruction, appending
    // the tied operand last so its index is getNumOperands() - 1.
    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold 2nd instruction into 3rd
    // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
    // already destroyed). So if SDWAOperand is also a potential MI then do not
    // apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
  } else {
    // No operand folded in: discard the speculatively built instruction.
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}
1080
Stanislav Mekhanoshin56ea4882017-05-30 16:49:24 +00001081// If an instruction was converted to SDWA it should not have immediates or SGPR
Sam Kolton3c4933f2017-06-22 06:26:41 +00001082// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
Matt Arsenaultc24d5e22018-02-08 22:46:38 +00001083void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
1084 const SISubtarget &ST) const {
Stanislav Mekhanoshin56ea4882017-05-30 16:49:24 +00001085 const MCInstrDesc &Desc = TII->get(MI.getOpcode());
Sam Kolton3c4933f2017-06-22 06:26:41 +00001086 unsigned ConstantBusCount = 0;
Sam Kolton5f7f32c2017-12-04 16:22:32 +00001087 for (MachineOperand &Op : MI.explicit_uses()) {
Stanislav Mekhanoshin56ea4882017-05-30 16:49:24 +00001088 if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
1089 continue;
Sam Kolton3c4933f2017-06-22 06:26:41 +00001090
1091 unsigned I = MI.getOperandNo(&Op);
Stanislav Mekhanoshin56ea4882017-05-30 16:49:24 +00001092 if (Desc.OpInfo[I].RegClass == -1 ||
1093 !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
1094 continue;
Sam Kolton3c4933f2017-06-22 06:26:41 +00001095
1096 if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
1097 TRI->isSGPRReg(*MRI, Op.getReg())) {
1098 ++ConstantBusCount;
1099 continue;
1100 }
1101
Stanislav Mekhanoshin56ea4882017-05-30 16:49:24 +00001102 unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1103 auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1104 TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
1105 if (Op.isImm())
1106 Copy.addImm(Op.getImm());
1107 else if (Op.isReg())
1108 Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
1109 Op.getSubReg());
1110 Op.ChangeToRegister(VGPR, false);
1111 }
1112}
1113
Sam Koltonf60ad582017-03-21 12:51:34 +00001114bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
1115 const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
1116
Matthias Braunf1caa282017-12-15 22:22:58 +00001117 if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
Sam Koltonf60ad582017-03-21 12:51:34 +00001118 return false;
Sam Koltonf60ad582017-03-21 12:51:34 +00001119
1120 MRI = &MF.getRegInfo();
1121 TRI = ST.getRegisterInfo();
1122 TII = ST.getInstrInfo();
Sam Kolton549c89d2017-06-21 08:53:38 +00001123
Sam Koltonebfdaf72017-05-18 12:12:03 +00001124 // Find all SDWA operands in MF.
Sam Kolton5f7f32c2017-12-04 16:22:32 +00001125 bool Ret = false;
Matt Arsenault9c2f3c42018-02-08 22:46:41 +00001126 for (MachineBasicBlock &MBB : MF) {
1127 bool Changed = false;
1128 do {
1129 matchSDWAOperands(MBB);
Sam Koltonf60ad582017-03-21 12:51:34 +00001130
Matt Arsenault9c2f3c42018-02-08 22:46:41 +00001131 for (const auto &OperandPair : SDWAOperands) {
1132 const auto &Operand = OperandPair.second;
1133 MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
1134 if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
1135 PotentialMatches[PotentialMI].push_back(Operand.get());
1136 }
Sam Kolton5f7f32c2017-12-04 16:22:32 +00001137 }
Sam Koltonaff83412017-04-12 09:36:05 +00001138
Matt Arsenault9c2f3c42018-02-08 22:46:41 +00001139 for (auto &PotentialPair : PotentialMatches) {
1140 MachineInstr &PotentialMI = *PotentialPair.first;
1141 convertToSDWA(PotentialMI, PotentialPair.second);
1142 }
Sam Koltonaff83412017-04-12 09:36:05 +00001143
Matt Arsenault9c2f3c42018-02-08 22:46:41 +00001144 PotentialMatches.clear();
1145 SDWAOperands.clear();
Stanislav Mekhanoshin56ea4882017-05-30 16:49:24 +00001146
Matt Arsenault9c2f3c42018-02-08 22:46:41 +00001147 Changed = !ConvertedInstructions.empty();
Sam Kolton5f7f32c2017-12-04 16:22:32 +00001148
Matt Arsenault9c2f3c42018-02-08 22:46:41 +00001149 if (Changed)
1150 Ret = true;
1151 while (!ConvertedInstructions.empty())
1152 legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
1153 } while (Changed);
1154 }
Stanislav Mekhanoshin56ea4882017-05-30 16:49:24 +00001155
Stanislav Mekhanoshine4cda742017-06-06 16:42:30 +00001156 return Ret;
Sam Koltonf60ad582017-03-21 12:51:34 +00001157}