[AMDGPU] Asm/disasm clamp modifier on vop3 int arithmetic
Allow the clamp modifier on vop3 int arithmetic instructions in assembly
and disassembly.
This involved adding a clamp operand to the affected instructions in MIR
and MC, and thus having to fix up several places in codegen and MIR
tests.
Differential Revision: https://reviews.llvm.org/D59267
Change-Id: Ic7775105f02a985b668fa658a0cd7837846a534e
llvm-svn: 356399
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index bbe642e..73c824a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -932,7 +932,8 @@
AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
- { N->getOperand(0), N->getOperand(1) });
+ {N->getOperand(0), N->getOperand(1),
+ CurDAG->getConstant(0, {}, MVT::i1)/*clamp bit*/});
}
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
@@ -1032,13 +1033,19 @@
Zero, Addr.getOperand(1));
if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
- // FIXME: Select to VOP3 version for with-carry.
- unsigned SubOp = Subtarget->hasAddNoCarry() ?
- AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
+ SmallVector<SDValue, 3> Opnds;
+ Opnds.push_back(Zero);
+ Opnds.push_back(Addr.getOperand(1));
- MachineSDNode *MachineSub
- = CurDAG->getMachineNode(SubOp, DL, MVT::i32,
- Zero, Addr.getOperand(1));
+ // FIXME: Select to VOP3 version for with-carry.
+ unsigned SubOp = AMDGPU::V_SUB_I32_e32;
+ if (Subtarget->hasAddNoCarry()) {
+ SubOp = AMDGPU::V_SUB_U32_e64;
+ Opnds.push_back(Zero); // clamp bit
+ }
+
+ MachineSDNode *MachineSub =
+ CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
Base = SDValue(MachineSub, 0);
Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
@@ -1106,12 +1113,17 @@
Zero, Addr.getOperand(1));
if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
- unsigned SubOp = Subtarget->hasAddNoCarry() ?
- AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
+ SmallVector<SDValue, 3> Opnds;
+ Opnds.push_back(Zero);
+ Opnds.push_back(Addr.getOperand(1));
+ unsigned SubOp = AMDGPU::V_SUB_I32_e32;
+ if (Subtarget->hasAddNoCarry()) {
+ SubOp = AMDGPU::V_SUB_U32_e64;
+ Opnds.push_back(Zero); // clamp bit
+ }
MachineSDNode *MachineSub
- = CurDAG->getMachineNode(SubOp, DL, MVT::i32,
- Zero, Addr.getOperand(1));
+ = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
Base = SDValue(MachineSub, 0);
Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 196ecd7..abd324e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1092,7 +1092,8 @@
// (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z
getAddNoCarry(Entry, Insert, DL, TIDReg)
.addReg(TIDReg)
- .addReg(TIDIGZReg);
+ .addReg(TIDIGZReg)
+ .addImm(0); // clamp bit
} else {
// Get the wave id
BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
@@ -1117,7 +1118,8 @@
unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
getAddNoCarry(MBB, MI, DL, TmpReg)
.addImm(LDSOffset)
- .addReg(TIDReg);
+ .addReg(TIDReg)
+ .addImm(0); // clamp bit
return TmpReg;
}
@@ -4443,6 +4445,7 @@
Inst.RemoveOperand(3);
Inst.setDesc(get(NewOpc));
+ Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
Inst.addImplicitDefUseOperands(*MBB.getParent());
MRI.replaceRegWith(OldDstReg, ResultReg);
legalizeOperands(Inst, MDT);
@@ -4703,7 +4706,8 @@
BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
.addReg(CarryReg, RegState::Define)
.add(SrcReg0Sub0)
- .add(SrcReg1Sub0);
+ .add(SrcReg1Sub0)
+ .addImm(0); // clamp bit
unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
MachineInstr *HiHalf =
@@ -4711,7 +4715,8 @@
.addReg(DeadCarryReg, RegState::Define | RegState::Dead)
.add(SrcReg0Sub1)
.add(SrcReg1Sub1)
- .addReg(CarryReg, RegState::Kill);
+ .addReg(CarryReg, RegState::Kill)
+ .addImm(0); // clamp bit
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 680c287..336404f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1760,10 +1760,12 @@
int Pattern = 1;
}
-class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0> {
+class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
+ bit _EnableClamp = 0> {
field list<ValueType> ArgVT = _ArgVT;
field bit EnableF32SrcMods = _EnableF32SrcMods;
+ field bit EnableClamp = _EnableClamp;
field ValueType DstVT = ArgVT[0];
field ValueType Src0VT = ArgVT[1];
@@ -1817,7 +1819,7 @@
field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0);
field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0);
- field bit HasClamp = isModifierType<Src0VT>.ret;
+ field bit HasClamp = BitOr<isModifierType<Src0VT>.ret, EnableClamp>.ret;
field bit HasSDWAClamp = EmitDst;
field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret;
field bit HasIntClamp = !if(isFloatType<DstVT>.ret, 0, HasClamp);
@@ -1943,6 +1945,7 @@
def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], 0, /*EnableClamp=*/1>;
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index f6bb7b3..d663616 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -723,7 +723,8 @@
TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
.addReg(ImmReg)
- .addReg(AddrReg->getReg(), 0, BaseSubReg);
+ .addReg(AddrReg->getReg(), 0, BaseSubReg)
+ .addImm(0); // clamp bit
BaseSubReg = 0;
}
@@ -816,7 +817,8 @@
TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
.addReg(ImmReg)
- .addReg(AddrReg->getReg(), 0, BaseSubReg);
+ .addReg(AddrReg->getReg(), 0, BaseSubReg)
+ .addImm(0); // clamp bit
BaseSubReg = 0;
}
@@ -1144,7 +1146,8 @@
BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
.addReg(CarryReg, RegState::Define)
.addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
- .add(OffsetLo);
+ .add(OffsetLo)
+ .addImm(0); // clamp bit
(void)LoHalf;
LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
@@ -1153,7 +1156,8 @@
.addReg(DeadCarryReg, RegState::Define | RegState::Dead)
.addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
.add(OffsetHi)
- .addReg(CarryReg, RegState::Kill);
+ .addReg(CarryReg, RegState::Kill)
+ .addImm(0); // clamp bit
(void)HiHalf;
LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 726e73c..c3e13a6 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -334,7 +334,8 @@
TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
.addReg(OffsetReg, RegState::Kill)
- .addReg(FIReg);
+ .addReg(FIReg)
+ .addImm(0); // clamp bit
}
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
@@ -1108,7 +1109,8 @@
if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
.addImm(Offset)
- .addReg(ScaledReg, RegState::Kill);
+ .addReg(ScaledReg, RegState::Kill)
+ .addImm(0); // clamp bit
} else {
unsigned ConstOffsetReg
= MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -1117,7 +1119,8 @@
.addImm(Offset);
TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
.addReg(ConstOffsetReg, RegState::Kill)
- .addReg(ScaledReg, RegState::Kill);
+ .addReg(ScaledReg, RegState::Kill)
+ .addImm(0); // clamp bit
}
}
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 1cb9bdb..27db3f3 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -277,9 +277,9 @@
def VOP_MAC_F32 : VOP_MAC <f32>;
// Write out to vcc or arbitrary SGPR.
-def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
+def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], 0, /*EnableClamp=*/1> {
let Asm32 = "$vdst, vcc, $src0, $src1";
- let Asm64 = "$vdst, $sdst, $src0, $src1";
+ let Asm64 = "$vdst, $sdst, $src0, $src1$clamp";
let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
@@ -289,7 +289,7 @@
// Write out to vcc or arbitrary SGPR and read in from vcc or
// arbitrary SGPR.
-def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
+def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], 0, /*EnableClamp=*/1> {
// We use VCSrc_b32 to exclude literal constants, even though the
// encoding normally allows them since the implicit VCC use means
// using one would always violate the constant bus
@@ -297,7 +297,7 @@
// technically be possible to use VCC again as src0.
let Src0RC32 = VCSrc_b32;
let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
- let Asm64 = "$vdst, $sdst, $src0, $src1, $src2";
+ let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp";
let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
@@ -440,9 +440,9 @@
let SubtargetPredicate = HasAddNoCarryInsts in {
-defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32, null_frag, "v_add_u32", 1>;
-defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>;
-defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>;
+defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_add_u32", 1>;
+defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
+defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
}
} // End isCommutable = 1
@@ -473,12 +473,12 @@
def : GCNPat<
(AMDGPUadde i32:$src0, i32:$src1, i1:$src2),
- (V_ADDC_U32_e64 $src0, $src1, $src2)
+ (V_ADDC_U32_e64 $src0, $src1, $src2, 0)
>;
def : GCNPat<
(AMDGPUsube i32:$src0, i32:$src1, i1:$src2),
- (V_SUBB_U32_e64 $src0, $src1, $src2)
+ (V_SUBB_U32_e64 $src0, $src1, $src2, 0)
>;
// These instructions only exist on SI and CI
@@ -505,6 +505,15 @@
)
>;
+class DivergentClampingBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
+ GCNPat<
+ (getDivergentFrag<Op>.ret Inst.Pfl.Src0VT:$src0, Inst.Pfl.Src1VT:$src1),
+ !if(!cast<Commutable_REV>(Inst).IsOrig,
+ (Inst $src0, $src1, 0),
+ (Inst $src1, $src0, 0)
+ )
+ >;
+
let AddedComplexity = 1 in {
def : DivergentBinOp<srl, V_LSHRREV_B32_e64>;
def : DivergentBinOp<sra, V_ASHRREV_I32_e64>;
@@ -520,7 +529,7 @@
def : DivergentBinOp<add, V_ADD_I32_e32>;
-def : DivergentBinOp<add, V_ADD_I32_e64>;
+def : DivergentClampingBinOp<add, V_ADD_I32_e64>;
def : DivergentBinOp<sub, V_SUB_I32_e32>;
def : DivergentBinOp<sub, V_SUBREV_I32_e32>;