[AMDGPU] Add support for multi-dword s.buffer.load intrinsic
Summary:
Patch by Marek Olsak and David Stuttard, both of AMD.
This adds a new amdgcn intrinsic supporting s.buffer.load, in particular
the multi-dword variants. These are convenient to use from some front-end
implementations.
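For reference, a minimal IR sketch of the intrinsic (the result type is
overloaded; the function name and arguments here are illustrative, not
taken from the added tests):

  ; cachepolicy is an i32 immediate; bit 0 = glc
  declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
  declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)

  ; A uniform (SGPR) offset is expected to select the s_buffer_load_*
  ; SMEM forms added below.
  define amdgpu_ps float @sbuffer_load_uniform(<4 x i32> inreg %desc, i32 inreg %offset) {
    %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %offset, i32 0)
    %f = bitcast i32 %val to float
    ret float %f
  }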
The existing llvm.SI.load.const intrinsic is also modified to share the
same underlying implementation.
This modification also requires that non-uniform loads are lowered
correctly, by splitting the larger dword variants into sizes supported by
the non-uniform versions of the load.
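A hedged sketch of the non-uniform case (function name illustrative): in
an amdgpu_ps function a non-inreg argument is passed in a VGPR, so the
offset below is divergent and the eight-dword load cannot stay on the
scalar path; it is expected to be split into a pair of four-dword buffer
loads instead.

  declare <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32>, i32, i32)

  ; %offset is divergent, so this should lower to two buffer_load_dwordx4
  ; instructions rather than s_buffer_load_dwordx8.
  define amdgpu_ps float @sbuffer_load_divergent(<4 x i32> inreg %desc, i32 %offset) {
    %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %desc, i32 %offset, i32 0)
    %elt = extractelement <8 x i32> %val, i32 7
    %f = bitcast i32 %elt to float
    ret float %f
  }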
V2: Addressed minor review comments.
V3: i1 glc is now i32 cachepolicy for consistency with the buffer and
tbuffer intrinsics, plus fixed a formatting issue.
V4: Added glc test.
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D51098
Change-Id: I83a6e00681158bb243591a94a51c7baa445f169b
llvm-svn: 340684
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 8c2f828..093a6f0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4170,6 +4170,7 @@
NODE_NAME_CASE(BUFFER_LOAD)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
+ NODE_NAME_CASE(SBUFFER_LOAD)
NODE_NAME_CASE(BUFFER_STORE)
NODE_NAME_CASE(BUFFER_STORE_FORMAT)
NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 52c3838..ae029be 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -486,6 +486,7 @@
BUFFER_LOAD,
BUFFER_LOAD_FORMAT,
BUFFER_LOAD_FORMAT_D16,
+ SBUFFER_LOAD,
BUFFER_STORE,
BUFFER_STORE_FORMAT,
BUFFER_STORE_FORMAT_D16,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0cf23c5..4544156 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4921,8 +4921,9 @@
MFI->getArgInfo().WorkItemIDZ);
case AMDGPUIntrinsic::SI_load_const: {
SDValue Ops[] = {
- Op.getOperand(1),
- Op.getOperand(2)
+ Op.getOperand(1), // Ptr
+ Op.getOperand(2), // Offset
+ DAG.getTargetConstant(0, DL, MVT::i1) // glc
};
MachineMemOperand *MMO = MF.getMachineMemOperand(
@@ -4930,7 +4931,26 @@
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
VT.getStoreSize(), 4);
- return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
+ SDVTList VTList = DAG.getVTList(MVT::i32);
+ SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+ VTList, Ops, MVT::i32, MMO);
+
+ return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
+ }
+ case Intrinsic::amdgcn_s_buffer_load: {
+ unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ SDValue Ops[] = {
+ Op.getOperand(1), // Ptr
+ Op.getOperand(2), // Offset
+ DAG.getTargetConstant(Cache & 1, DL, MVT::i1) // glc
+ };
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ VT.getStoreSize(), VT.getStoreSize());
+ return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
Op->getVTList(), Ops, VT, MMO);
}
case Intrinsic::amdgcn_fdiv_fast:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d0548f6..4c30600 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3904,8 +3904,34 @@
Inst.eraseFromParent();
continue;
- case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: {
- unsigned VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
+ unsigned VDst;
+ unsigned NewOpcode;
+
+ switch(Opcode) {
+ case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
+ VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
+ VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+ NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
+ VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
+ splitScalarBuffer(Worklist, Inst);
+ Inst.eraseFromParent();
+ continue;
+ }
+
const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
unsigned Offset = 0;
@@ -3956,7 +3982,7 @@
MachineInstr *NewInstr =
BuildMI(*MBB, Inst, Inst.getDebugLoc(),
- get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst)
+ get(NewOpcode), VDst)
.add(*VAddr) // vaddr
.add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
.addImm(0) // soffset
@@ -4457,6 +4483,73 @@
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
+void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist,
+ MachineInstr &Inst) const {
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ MachineBasicBlock::iterator MII = Inst;
+ auto &DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst);
+ MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase);
+ MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff);
+ MachineOperand &Glc = *getNamedOperand(Inst, AMDGPU::OpName::glc);
+
+ unsigned Opcode = Inst.getOpcode();
+ unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
+ unsigned Count = 0;
+ const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+ const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+
+ switch(Opcode) {
+ default:
+ return;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+ Count = 2;
+ break;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
+ Count = 4;
+ break;
+ }
+
+ // FIXME: Should also attempt to build VAddr and Offset like the non-split
+ // case (see call site for this function)
+
+ // Create a vector of result registers
+ SmallVector<unsigned, 8> ResultRegs;
+ for (unsigned i = 0; i < Count ; ++i) {
+ unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
+ MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg)
+ .addReg(Offset.getReg()) // offset
+ .addReg(Rsrc.getReg()) // rsrc
+ .addImm(0) // soffset
+ .addImm(i << 4) // inst_offset
+ .addImm(Glc.getImm()) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addMemOperand(*Inst.memoperands_begin());
+ // Extract the four 32-bit sub-registers from the result to add into the final REG_SEQUENCE
+ auto &NewDestOp = NewMI.getOperand(0);
+ for (unsigned i = 0 ; i < 4 ; i++)
+ ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass,
+ RI.getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass));
+ }
+ // Create a new combined result to replace original with
+ unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
+ MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL,
+ get(TargetOpcode::REG_SEQUENCE), FullDestReg);
+
+ for (unsigned i = 0 ; i < Count * 4 ; ++i) {
+ CombinedResBuilder
+ .addReg(ResultRegs[i])
+ .addImm(RI.getSubRegFromChannel(i));
+ }
+
+ MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
void SIInstrInfo::addUsersToMoveToVALUWorklist(
unsigned DstReg,
MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 2ba6671..a85d434 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -101,6 +101,8 @@
MachineInstr &Inst) const;
void splitScalar64BitBFE(SetVectorType &Worklist,
MachineInstr &Inst) const;
+ void splitScalarBuffer(SetVectorType &Worklist,
+ MachineInstr &Inst) const;
void movePackToVALU(SetVectorType &Worklist,
MachineRegisterInfo &MRI,
MachineInstr &Inst) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 0248b47..f6ce31f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -40,9 +40,9 @@
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
-def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
- SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
- [SDNPMayLoad, SDNPMemOperand]
+def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD",
+ SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>]>,
+ [SDNPMayLoad, SDNPMemOperand]
>;
def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 7485326..2d68908 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -409,6 +409,22 @@
>;
}
+multiclass SMLoad_Pattern <string Instr, ValueType vt> {
+ // 1. Offset as an immediate
+ // name this pattern to reuse AddedComplexity on CI
+ def _IMM : GCNPat <
+ (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc)))
+ >;
+
+ // 2. Offset loaded in a 32-bit SGPR
+ def : GCNPat <
+ (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc)))
+ >;
+}
+
+
let OtherPredicates = [isSICI] in {
def : GCNPat <
(i64 (readcyclecounter)),
@@ -427,18 +443,12 @@
defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
-// 1. Offset as an immediate
-def SM_LOAD_PATTERN : GCNPat < // name this pattern to reuse AddedComplexity on CI
- (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
- (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0)
->;
-
-// 2. Offset loaded in an 32bit SGPR
-def : GCNPat <
- (SIload_constant v4i32:$sbase, i32:$offset),
- (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0)
->;
-
+// Name the pattern to reuse AddedComplexity on CI
+defm SM_LOAD_PATTERN : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16i32>;
} // End let AddedComplexity = 100
let OtherPredicates = [isVI] in {
@@ -757,7 +767,7 @@
def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>;
-let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in {
+let AddedComplexity = SM_LOAD_PATTERN_IMM.AddedComplexity in {
class SMRD_Pattern_ci <string Instr, ValueType vt> : GCNPat <
(smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
@@ -771,11 +781,17 @@
def : SMRD_Pattern_ci <"S_LOAD_DWORDX8", v8i32>;
def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>;
-def : GCNPat <
- (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
- (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> {
+class SMLoad_Pattern_ci <string Instr, ValueType vt> : GCNPat <
+ (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc)),
+ (!cast<InstSI>(Instr) $sbase, $offset, (as_i1imm $glc))> {
let OtherPredicates = [isCI]; // should this be isCIOnly?
}
+def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORD_IMM_ci", i32>;
+def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORDX2_IMM_ci", v2i32>;
+def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORDX4_IMM_ci", v4i32>;
+def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORDX8_IMM_ci", v8i32>;
+def : SMLoad_Pattern_ci <"S_BUFFER_LOAD_DWORDX16_IMM_ci", v16i32>;
+
} // End let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity