[AMDGPU] Untangle SDWA pass from SIShrinkInstructions
Remove dependency of SDWA pass on SIShrinkInstructions.
The goal is to move SDWA even higher in the stack to avoid second run
of MachineLICM, MachineCSE and SIFoldOperands.
Also added handling to preserve original src modifiers.
Differential Revision: https://reviews.llvm.org/D33860
llvm-svn: 304665
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index fae249b..522cc0a 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -129,7 +129,8 @@
bool getNeg() const { return Neg; }
bool getSext() const { return Sext; }
- uint64_t getSrcMods() const;
+ uint64_t getSrcMods(const SIInstrInfo *TII,
+ const MachineOperand *SrcOp) const;
};
class SDWADstOperand : public SDWAOperand {
@@ -240,13 +241,24 @@
return SuperMask.all();
}
-uint64_t SDWASrcOperand::getSrcMods() const {
+uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
+ const MachineOperand *SrcOp) const {
uint64_t Mods = 0;
+ const auto *MI = SrcOp->getParent();
+ if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
+ if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
+ Mods = Mod->getImm();
+ }
+ } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
+ if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
+ Mods = Mod->getImm();
+ }
+ }
if (Abs || Neg) {
assert(!Sext &&
"Float and integer src modifiers can't be set simulteniously");
Mods |= Abs ? SISrcMods::ABS : 0;
- Mods |= Neg ? SISrcMods::NEG : 0;
+ Mods ^= Neg ? SISrcMods::NEG : 0;
} else if (Sext) {
Mods |= SISrcMods::SEXT;
}
@@ -312,7 +324,7 @@
}
copyRegOperand(*Src, *getTargetOperand());
SrcSel->setImm(getSrcSel());
- SrcMods->setImm(getSrcMods());
+ SrcMods->setImm(getSrcMods(TII, Src));
getTargetOperand()->setIsKill(false);
return true;
}
@@ -409,7 +421,10 @@
switch (Opcode) {
case AMDGPU::V_LSHRREV_B32_e32:
case AMDGPU::V_ASHRREV_I32_e32:
- case AMDGPU::V_LSHLREV_B32_e32: {
+ case AMDGPU::V_LSHLREV_B32_e32:
+ case AMDGPU::V_LSHRREV_B32_e64:
+ case AMDGPU::V_ASHRREV_I32_e64:
+ case AMDGPU::V_LSHLREV_B32_e64: {
// from: v_lshrrev_b32_e32 v1, 16/24, v0
// to SDWA src:v0 src_sel:WORD_1/BYTE_3
@@ -432,7 +447,8 @@
TRI->isPhysicalRegister(Dst->getReg()))
break;
- if (Opcode == AMDGPU::V_LSHLREV_B32_e32) {
+ if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
+ Opcode == AMDGPU::V_LSHLREV_B32_e64) {
auto SDWADst = make_unique<SDWADstOperand>(
Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
@@ -441,7 +457,8 @@
} else {
auto SDWASrc = make_unique<SDWASrcOperand>(
Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
- Opcode == AMDGPU::V_LSHRREV_B32_e32 ? false : true);
+ Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
+ Opcode != AMDGPU::V_LSHRREV_B32_e64);
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
SDWAOperands[&MI] = std::move(SDWASrc);
++NumSDWAPatternsFound;
@@ -451,7 +468,10 @@
case AMDGPU::V_LSHRREV_B16_e32:
case AMDGPU::V_ASHRREV_I16_e32:
- case AMDGPU::V_LSHLREV_B16_e32: {
+ case AMDGPU::V_LSHLREV_B16_e32:
+ case AMDGPU::V_LSHRREV_B16_e64:
+ case AMDGPU::V_ASHRREV_I16_e64:
+ case AMDGPU::V_LSHLREV_B16_e64: {
// from: v_lshrrev_b16_e32 v1, 8, v0
// to SDWA src:v0 src_sel:BYTE_1
@@ -472,7 +492,8 @@
TRI->isPhysicalRegister(Dst->getReg()))
break;
- if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
+ if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
+ Opcode == AMDGPU::V_LSHLREV_B16_e64) {
auto SDWADst =
make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
@@ -481,7 +502,8 @@
} else {
auto SDWASrc = make_unique<SDWASrcOperand>(
Src1, Dst, BYTE_1, false, false,
- Opcode == AMDGPU::V_LSHRREV_B16_e32 ? false : true);
+ Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
+ Opcode != AMDGPU::V_LSHRREV_B16_e64);
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
SDWAOperands[&MI] = std::move(SDWASrc);
++NumSDWAPatternsFound;
@@ -549,20 +571,25 @@
++NumSDWAPatternsFound;
break;
}
- case AMDGPU::V_AND_B32_e32: {
+ case AMDGPU::V_AND_B32_e32:
+ case AMDGPU::V_AND_B32_e64: {
// e.g.:
// from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
// to SDWA src:v0 src_sel:WORD_0/BYTE_0
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
- auto Imm = foldToImm(*Src0);
- if (!Imm)
- break;
-
- if (*Imm != 0x0000ffff && *Imm != 0x000000ff)
- break;
-
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ auto ValSrc = Src1;
+ auto Imm = foldToImm(*Src0);
+
+ if (!Imm) {
+ Imm = foldToImm(*Src1);
+ ValSrc = Src0;
+ }
+
+ if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
+ break;
+
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
if (TRI->isPhysicalRegister(Src1->getReg()) ||
@@ -570,7 +597,7 @@
break;
auto SDWASrc = make_unique<SDWASrcOperand>(
- Src1, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
+ ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
SDWAOperands[&MI] = std::move(SDWASrc);
++NumSDWAPatternsFound;
@@ -583,28 +610,38 @@
bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
// Check if this instruction has opcode that supports SDWA
- return AMDGPU::getSDWAOp(MI.getOpcode()) != -1;
+ unsigned Opc = MI.getOpcode();
+ if (AMDGPU::getSDWAOp(Opc) != -1)
+ return true;
+ int Opc32 = AMDGPU::getVOPe32(Opc);
+ if (Opc32 != -1 && AMDGPU::getSDWAOp(Opc32) != -1)
+ return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) &&
+ !TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+ return false;
}
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
const SDWAOperandsVector &SDWAOperands) {
// Convert to sdwa
int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
+ if (SDWAOpcode == -1)
+ SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(MI.getOpcode()));
assert(SDWAOpcode != -1);
+ // Copy dst, if it is present in original then should also be present in SDWA
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ if (!Dst && !TII->isVOPC(MI))
+ return false;
+
const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
// Create SDWA version of instruction MI and initialize its operands
MachineInstrBuilder SDWAInst =
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
- // Copy dst, if it is present in original then should also be present in SDWA
- MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
if (Dst) {
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
SDWAInst.add(*Dst);
- } else {
- assert(TII->isVOPC(MI));
}
// Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
@@ -614,7 +651,10 @@
Src0 &&
AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
- SDWAInst.addImm(0);
+ if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
+ SDWAInst.addImm(Mod->getImm());
+ else
+ SDWAInst.addImm(0);
SDWAInst.add(*Src0);
// Copy src1 if present, initialize src1_modifiers.
@@ -623,10 +663,11 @@
assert(
AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
- SDWAInst.addImm(0);
+ if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
+ SDWAInst.addImm(Mod->getImm());
+ else
+ SDWAInst.addImm(0);
SDWAInst.add(*Src1);
- } else {
- assert(TII->isVOP1(MI));
}
if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||