[AMDGPU] SDWA Peephole: improve search for immediates in SDWA patterns
Previously the compiler often extracted common immediates into a separate register, e.g.:
```
%vreg0 = S_MOV_B32 0xff;
%vreg2 = V_AND_B32_e32 %vreg0, %vreg1
%vreg4 = V_AND_B32_e32 %vreg0, %vreg3
```
Because of this, the SDWA peephole failed to find SDWA-convertible patterns. E.g., the previous example could be converted into 2 SDWA src operands:
```
SDWA src: %vreg2 src_sel:BYTE_0
SDWA src: %vreg4 src_sel:BYTE_0
```
With this change the peephole checks whether an operand is either an immediate or a register that is a copy of an immediate.
llvm-svn: 299202
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index d2844c3..67c86c3 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -51,6 +51,8 @@
std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
+ Optional<int64_t> foldToImm(const MachineOperand &Op) const;
+
public:
static char ID;
@@ -375,6 +377,33 @@
return true;
}
+Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
+ if (Op.isImm()) {
+ return Op.getImm();
+ }
+
+ // Otherwise Op may be a register defined by a copy of an immediate, e.g.:
+ // %vreg1<def> = S_MOV_B32 255;
+ if (Op.isReg()) {
+ for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
+ if (!isSameReg(Op, Def))
+ continue; // presumably a def of a different (sub)register; skip it
+
+ const MachineInstr *DefInst = Def.getParent();
+ if (!TII->isFoldableCopy(*DefInst) || !isSameBB(Op.getParent(), DefInst))
+ return None; // def is not a foldable copy, or isSameBB rejected the pair
+
+ const MachineOperand &Copied = DefInst->getOperand(1);
+ if (!Copied.isImm())
+ return None; // the copied source is not an immediate
+
+ return Copied.getImm(); // first matching def decides; later defs are not visited
+ }
+ }
+
+ return None; // Op is neither an immediate nor a register with a matching def
+}
+
void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
for (MachineInstr &MI : MBB) {
unsigned Opcode = MI.getOpcode();
@@ -391,11 +420,11 @@
// from: v_lshlrev_b32_e32 v1, 16/24, v0
// to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
- if (!Src0->isImm())
+ auto Imm = foldToImm(*Src0);
+ if (!Imm)
break;
- int64_t Imm = Src0->getImm();
- if (Imm != 16 && Imm != 24)
+ if (*Imm != 16 && *Imm != 24)
break;
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
@@ -406,13 +435,13 @@
if (Opcode == AMDGPU::V_LSHLREV_B32_e32) {
auto SDWADst = make_unique<SDWADstOperand>(
- Dst, Src1, Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
+ Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
SDWAOperands[&MI] = std::move(SDWADst);
++NumSDWAPatternsFound;
} else {
auto SDWASrc = make_unique<SDWASrcOperand>(
- Src1, Dst, Imm == 16 ? WORD_1 : BYTE_3, false, false,
+ Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
Opcode == AMDGPU::V_LSHRREV_B32_e32 ? false : true);
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
SDWAOperands[&MI] = std::move(SDWASrc);
@@ -433,7 +462,8 @@
// from: v_lshlrev_b16_e32 v1, 8, v0
// to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
- if (!Src0->isImm() || Src0->getImm() != 8)
+ auto Imm = foldToImm(*Src0);
+ if (!Imm || *Imm != 8)
break;
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
@@ -477,30 +507,30 @@
// 24 | 8 | BYTE_3
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- if (!Src1->isImm())
+ auto Offset = foldToImm(*Src1);
+ if (!Offset)
break;
- int64_t Offset = Src1->getImm();
MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
- if (!Src2->isImm())
+ auto Width = foldToImm(*Src2);
+ if (!Width)
break;
- int64_t Width = Src2->getImm();
SdwaSel SrcSel = DWORD;
- if (Offset == 0 && Width == 8)
+ if (*Offset == 0 && *Width == 8)
SrcSel = BYTE_0;
- else if (Offset == 0 && Width == 16)
+ else if (*Offset == 0 && *Width == 16)
SrcSel = WORD_0;
- else if (Offset == 0 && Width == 32)
+ else if (*Offset == 0 && *Width == 32)
SrcSel = DWORD;
- else if (Offset == 8 && Width == 8)
+ else if (*Offset == 8 && *Width == 8)
SrcSel = BYTE_1;
- else if (Offset == 16 && Width == 8)
+ else if (*Offset == 16 && *Width == 8)
SrcSel = BYTE_2;
- else if (Offset == 16 && Width == 16)
+ else if (*Offset == 16 && *Width == 16)
SrcSel = WORD_1;
- else if (Offset == 24 && Width == 8)
+ else if (*Offset == 24 && *Width == 8)
SrcSel = BYTE_3;
else
break;
@@ -526,11 +556,11 @@
// to SDWA src:v0 src_sel:WORD_0/BYTE_0
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
- if (!Src0->isImm())
+ auto Imm = foldToImm(*Src0);
+ if (!Imm)
break;
- int64_t Imm = Src0->getImm();
- if (Imm != 0x0000ffff && Imm != 0x000000ff)
+ if (*Imm != 0x0000ffff && *Imm != 0x000000ff)
break;
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
@@ -541,7 +571,7 @@
break;
auto SDWASrc = make_unique<SDWASrcOperand>(
- Src1, Dst, Imm == 0x0000ffff ? WORD_0 : BYTE_0);
+ Src1, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
SDWAOperands[&MI] = std::move(SDWASrc);
++NumSDWAPatternsFound;