[AMDGPU] gfx1010 permlane instructions
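
This adds v_permlane16_b32 and v_permlanex16_b32: assembler and printer
support (the fi/bc bits travel in op_sel), selection patterns for the
llvm.amdgcn.permlane16 / llvm.amdgcn.permlanex16 intrinsics, SGPR
legalization of src1/src2 via v_readfirstlane, and a workaround for the
VOPC-followed-by-permlane hazard.

Rough sketch of the expected GFX10 assembler syntax, based on the operand
classes and the printOpSel change below (register choices and modifier
values are illustrative only, not copied from a test):

  v_permlane16_b32  v0, v1, s2, s3               // fi=0, bc=0, no op_sel printed
  v_permlane16_b32  v0, v1, s2, s3 op_sel:[1,0]  // fi bit set
  v_permlanex16_b32 v0, v1, s2, s3 op_sel:[0,1]  // bc bit set
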
Differential Revision: https://reviews.llvm.org/D63202
llvm-svn: 363185
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 3787952..bc4fe3d 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1148,6 +1148,7 @@
bool validateMIMGD16(const MCInst &Inst);
bool validateMIMGDim(const MCInst &Inst);
bool validateLdsDirect(const MCInst &Inst);
+ bool validateOpSel(const MCInst &Inst);
bool validateVOP3Literal(const MCInst &Inst) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
@@ -3003,6 +3004,19 @@
return NumLiterals <= 1;
}
+bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
+ const unsigned Opc = Inst.getOpcode();
+ if (Opc == AMDGPU::V_PERMLANE16_B32_gfx10 ||
+ Opc == AMDGPU::V_PERMLANEX16_B32_gfx10) {
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+
+ if (OpSel & ~3)
+ return false;
+ }
+ return true;
+}
+
// VOP3 literal is only allowed in GFX10+ and only one can be used
bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const {
unsigned Opcode = Inst.getOpcode();
@@ -3071,6 +3085,11 @@
"integer clamping is not supported on this GPU");
return false;
}
+ if (!validateOpSel(Inst)) {
+ Error(IDLoc,
+ "invalid op_sel operand");
+ return false;
+ }
// For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate.
if (!validateMIMGD16(Inst)) {
Error(IDLoc,
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 87a8c06..f19fa68 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -115,6 +115,12 @@
}
}
+static bool isPermlane(const MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
+ return Opcode == AMDGPU::V_PERMLANE16_B32 ||
+ Opcode == AMDGPU::V_PERMLANEX16_B32;
+}
+
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
AMDGPU::OpName::simm16);
@@ -835,11 +841,49 @@
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVMEMtoScalarWriteHazards(MI);
+ fixVcmpxPermlaneHazards(MI);
fixSMEMtoVectorWriteHazards(MI);
fixVcmpxExecWARHazard(MI);
fixLdsBranchVmemWARHazard(MI);
}
+bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
+ if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ auto IsHazardFn = [TII] (MachineInstr *MI) {
+ return TII->isVOPC(*MI);
+ };
+
+ auto IsExpiredFn = [] (MachineInstr *MI, int) {
+ if (!MI)
+ return false;
+ unsigned Opc = MI->getOpcode();
+ return SIInstrInfo::isVALU(*MI) &&
+ Opc != AMDGPU::V_NOP_e32 &&
+ Opc != AMDGPU::V_NOP_e64 &&
+ Opc != AMDGPU::V_NOP_sdwa;
+ };
+
+ if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+ std::numeric_limits<int>::max())
+ return false;
+
+ // V_NOP will be discarded by SQ.
+ // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
+ // which is always a VGPR and available.
+ auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
+ unsigned Reg = Src0->getReg();
+ bool IsUndef = Src0->isUndef();
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_MOV_B32_e32))
+ .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
+ .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
+
+ return true;
+}
+
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
if (!ST.hasVMEMtoScalarWriteHazard())
return false;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index bf55976..0c4c9d9 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -84,8 +84,8 @@
int checkAnyInstHazards(MachineInstr *MI);
int checkReadM0Hazards(MachineInstr *SMovRel);
int checkNSAtoVMEMHazard(MachineInstr *MI);
-
void fixHazards(MachineInstr *MI);
+ bool fixVcmpxPermlaneHazards(MachineInstr *MI);
bool fixVMEMtoScalarWriteHazards(MachineInstr *MI);
bool fixSMEMtoVectorWriteHazards(MachineInstr *MI);
bool fixVcmpxExecWARHazard(MachineInstr *MI);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 0e27063..114f98c 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1005,6 +1005,18 @@
void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ unsigned Opc = MI->getOpcode();
+ if (Opc == AMDGPU::V_PERMLANE16_B32_gfx10 ||
+ Opc == AMDGPU::V_PERMLANEX16_B32_gfx10) {
+ auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
+ unsigned FI = !!(MI->getOperand(FIN).getImm() & SISrcMods::OP_SEL_0);
+ unsigned BC = !!(MI->getOperand(BCN).getImm() & SISrcMods::OP_SEL_0);
+ if (FI || BC)
+ O << " op_sel:[" << FI << ',' << BC << ']';
+ return;
+ }
+
printPackedModifier(MI, " op_sel:[", SISrcMods::OP_SEL_0, O);
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c444c08..3bb1ddc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9721,6 +9721,24 @@
Ops.push_back(ImpDef.getValue(1));
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
}
+ case AMDGPU::V_PERMLANE16_B32:
+ case AMDGPU::V_PERMLANEX16_B32: {
+ ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0));
+ ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2));
+ if (!FI->getZExtValue() && !BC->getZExtValue())
+ break;
+ SDValue VDstIn = Node->getOperand(6);
+ if (VDstIn.isMachineOpcode()
+ && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF)
+ break;
+ MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ SDLoc(Node), MVT::i32);
+ SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1),
+ SDValue(BC, 0), Node->getOperand(3),
+ Node->getOperand(4), Node->getOperand(5),
+ SDValue(ImpDef, 0), Node->getOperand(7) };
+ return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+ }
default:
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 775b35d..dc35bf4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3790,6 +3790,26 @@
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
};
+ if (Opc == AMDGPU::V_PERMLANE16_B32 ||
+ Opc == AMDGPU::V_PERMLANEX16_B32) {
+ // src1 and src2 must be scalar
+ MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
+ MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
+ const DebugLoc &DL = MI.getDebugLoc();
+ if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src1);
+ Src1.ChangeToRegister(Reg, false);
+ }
+ if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src2);
+ Src2.ChangeToRegister(Reg, false);
+ }
+ }
+
// Find the one SGPR operand we are allowed to use.
int ConstantBusLimit = ST.getConstantBusLimit(Opc);
int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 4129aac..a15c057 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -625,9 +625,35 @@
} // End SubtargetPredicate = isGFX9Plus
+def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
+ let Src0RC64 = VRegSrc_32;
+ let Src1RC64 = SCSrc_b32;
+ let Src2RC64 = SCSrc_b32;
+ let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0,
+ IntOpSelMods:$src1_modifiers, SCSrc_b32:$src1,
+ IntOpSelMods:$src2_modifiers, SCSrc_b32:$src2,
+ VGPR_32:$vdst_in, op_sel:$op_sel);
+ let HasClamp = 0;
+ let HasOMod = 0;
+}
+
let SubtargetPredicate = isGFX10Plus in {
def V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32>;
+
+ let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+ def V_PERMLANE16_B32 : VOP3Inst <"v_permlane16_b32", VOP3_PERMLANE_Profile>;
+ def V_PERMLANEX16_B32 : VOP3Inst <"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
+ } // End $vdst = $vdst_in, DisableEncoding $vdst_in
+
+ def : GCNPat<
+ (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc),
+ (V_PERMLANE16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
+ >;
+ def : GCNPat<
+ (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc),
+ (V_PERMLANEX16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
+ >;
} // End SubtargetPredicate = isGFX10Plus
//===----------------------------------------------------------------------===//
@@ -790,6 +816,8 @@
defm V_MIN_U16 : VOP3_Real_gfx10_with_name<0x30b, "V_MIN_U16_e64", "v_min_u16">;
defm V_MIN_I16 : VOP3_Real_gfx10_with_name<0x30c, "V_MIN_I16_e64", "v_min_i16">;
defm V_LSHLREV_B16 : VOP3_Real_gfx10_with_name<0x314, "V_LSHLREV_B16_e64", "v_lshlrev_b16">;
+defm V_PERMLANE16_B32 : VOP3OpSel_Real_gfx10<0x377>;
+defm V_PERMLANEX16_B32 : VOP3OpSel_Real_gfx10<0x378>;
//===----------------------------------------------------------------------===//
// GFX7, GFX10.