[AMDGPU] New tbuffer intrinsics
Summary:
This commit adds new intrinsics
llvm.amdgcn.raw.tbuffer.load
llvm.amdgcn.struct.tbuffer.load
llvm.amdgcn.raw.tbuffer.store
llvm.amdgcn.struct.tbuffer.store
with the following changes from the llvm.amdgcn.tbuffer.* intrinsics:
* there are separate raw and struct versions: raw does not have an index
arg and sets idxen=0 in the instruction, and struct always sets
idxen=1 in the instruction even if the index is 0, to allow for the
fact that gfx9 does bounds checking differently depending on whether
idxen is set;
* there is a combined format arg (dfmt+nfmt)
* there is a combined cachepolicy arg (glc+slc)
* there are now only two offset args: one for the offset that is
included in bounds checking and swizzling, to be split between the
instruction's voffset and immoffset fields, and one for the offset
that is excluded from bounds checking and swizzling, to go into the
instruction's soffset field.
The AMDISD::TBUFFER_* SD nodes always have an index operand, all three
offset operands, combined format operand, combined cachepolicy operand,
and an extra idxen operand.
The tbuffer pseudo- and real instructions now also have a combined
format operand.
The obsolescent llvm.amdgcn.tbuffer.* and llvm.SI.tbuffer.store
intrinsics continue to work.
V2: Separate raw and struct intrinsics.
V3: Moved extract_glc and extract_slc defs to a more sensible place.
V4: Rebased on D49995.
V5: Only two separate offset args instead of three.
V6: Pseudo- and real instructions have joint format operand.
V7: Restored optionality of dfmt and nfmt in assembler.
V8: Addressed minor review comments.
Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D49026
Change-Id: If22ad77e349fac3a5d2f72dda53c010377d470d4
llvm-svn: 340268
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c73e6b5..0248b58 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5158,6 +5158,13 @@
MemSDNode *M = cast<MemSDNode>(Op);
EVT LoadVT = Op.getValueType();
+ unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
+ unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
+ unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
+ unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
+ unsigned IdxEn = 1;
+ if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
+ IdxEn = Idx->getZExtValue() != 0;
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
@@ -5165,10 +5172,57 @@
Op.getOperand(4), // voffset
Op.getOperand(5), // soffset
Op.getOperand(6), // offset
- Op.getOperand(7), // dfmt
- Op.getOperand(8), // nfmt
- Op.getOperand(9), // glc
- Op.getOperand(10) // slc
+ DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
+ DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ };
+
+ if (LoadVT.getScalarType() == MVT::f16)
+ return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
+ M, DAG, Ops);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+ Op->getVTList(), Ops, LoadVT,
+ M->getMemOperand());
+ }
+ case Intrinsic::amdgcn_raw_tbuffer_load: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ EVT LoadVT = Op.getValueType();
+ auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
+
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(4), // soffset
+ Offsets.second, // offset
+ Op.getOperand(5), // format
+ Op.getOperand(6), // cachepolicy
+ DAG.getConstant(0, DL, MVT::i1), // idxen
+ };
+
+ if (LoadVT.getScalarType() == MVT::f16)
+ return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
+ M, DAG, Ops);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+ Op->getVTList(), Ops, LoadVT,
+ M->getMemOperand());
+ }
+ case Intrinsic::amdgcn_struct_tbuffer_load: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ EVT LoadVT = Op.getValueType();
+ auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // rsrc
+ Op.getOperand(3), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(5), // soffset
+ Offsets.second, // offset
+ Op.getOperand(6), // format
+ Op.getOperand(7), // cachepolicy
+ DAG.getConstant(1, DL, MVT::i1), // idxen
};
if (LoadVT.getScalarType() == MVT::f16)
@@ -5407,6 +5461,10 @@
auto Opcode = NumChannels->getZExtValue() == 3 ?
AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
+ unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
+ unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
+ unsigned Glc = cast<ConstantSDNode>(Op.getOperand(12))->getZExtValue();
+ unsigned Slc = cast<ConstantSDNode>(Op.getOperand(13))->getZExtValue();
SDValue Ops[] = {
Chain,
Op.getOperand(3), // vdata
@@ -5415,10 +5473,9 @@
VOffset,
Op.getOperand(6), // soffset
Op.getOperand(7), // inst_offset
- Op.getOperand(8), // dfmt
- Op.getOperand(9), // nfmt
- Op.getOperand(12), // glc
- Op.getOperand(13), // slc
+ DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
+ DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getConstant(IdxEn->isOne(), DL, MVT::i1), // idxen
};
assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
@@ -5438,6 +5495,13 @@
bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
if (IsD16)
VData = handleD16VData(VData, DAG);
+ unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
+ unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
+ unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
+ unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
+ unsigned IdxEn = 1;
+ if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
+ IdxEn = Idx->getZExtValue() != 0;
SDValue Ops[] = {
Chain,
VData, // vdata
@@ -5446,10 +5510,59 @@
Op.getOperand(5), // voffset
Op.getOperand(6), // soffset
Op.getOperand(7), // offset
- Op.getOperand(8), // dfmt
- Op.getOperand(9), // nfmt
- Op.getOperand(10), // glc
- Op.getOperand(11) // slc
+ DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
+ DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getConstant(IdxEn, DL, MVT::i1), // idexen
+ };
+ unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
+ AMDGPUISD::TBUFFER_STORE_FORMAT;
+ MemSDNode *M = cast<MemSDNode>(Op);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
+ }
+
+ case Intrinsic::amdgcn_struct_tbuffer_store: {
+ SDValue VData = Op.getOperand(2);
+ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ if (IsD16)
+ VData = handleD16VData(VData, DAG);
+ auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ SDValue Ops[] = {
+ Chain,
+ VData, // vdata
+ Op.getOperand(3), // rsrc
+ Op.getOperand(4), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(6), // soffset
+ Offsets.second, // offset
+ Op.getOperand(7), // format
+ Op.getOperand(8), // cachepolicy
+ DAG.getConstant(1, DL, MVT::i1), // idexen
+ };
+ unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
+ AMDGPUISD::TBUFFER_STORE_FORMAT;
+ MemSDNode *M = cast<MemSDNode>(Op);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
+ }
+
+ case Intrinsic::amdgcn_raw_tbuffer_store: {
+ SDValue VData = Op.getOperand(2);
+ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ if (IsD16)
+ VData = handleD16VData(VData, DAG);
+ auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ SDValue Ops[] = {
+ Chain,
+ VData, // vdata
+ Op.getOperand(3), // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(5), // soffset
+ Offsets.second, // offset
+ Op.getOperand(6), // format
+ Op.getOperand(7), // cachepolicy
+ DAG.getConstant(0, DL, MVT::i1), // idexen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -5490,6 +5603,50 @@
}
}
+// The raw.tbuffer and struct.tbuffer intrinsics have two offset args: offset
+// (the offset that is included in bounds checking and swizzling, to be split
+// between the instruction's voffset and immoffset fields) and soffset (the
+// offset that is excluded from bounds checking and swizzling, to go in the
+// instruction's soffset field). This function takes the first kind of offset
+// and figures out how to split it between voffset and immoffset.
+std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
+ SDValue Offset, SelectionDAG &DAG) const {
+ SDLoc DL(Offset);
+ const unsigned MaxImm = 4095;
+ SDValue N0 = Offset;
+ ConstantSDNode *C1 = nullptr;
+ if (N0.getOpcode() == ISD::ADD) {
+ if ((C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))))
+ N0 = N0.getOperand(0);
+ } else if ((C1 = dyn_cast<ConstantSDNode>(N0)))
+ N0 = SDValue();
+
+ if (C1) {
+ unsigned ImmOffset = C1->getZExtValue();
+ // If the immediate value is too big for the immoffset field, put the value
+ // mod 4096 into the immoffset field so that the value that is copied/added
+ // for the voffset field is a multiple of 4096, and it stands more chance
+ // of being CSEd with the copy/add for another similar load/store.
+ unsigned Overflow = ImmOffset & ~MaxImm;
+ ImmOffset -= Overflow;
+ C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
+ if (Overflow) {
+ auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
+ if (!N0)
+ N0 = OverflowVal;
+ else {
+ SDValue Ops[] = { N0, OverflowVal };
+ N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
+ }
+ }
+ }
+ if (!N0)
+ N0 = DAG.getConstant(0, DL, MVT::i32);
+ if (!C1)
+ C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
+ return {N0, SDValue(C1, 0)};
+}
+
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
ISD::LoadExtType ExtType, SDValue Op,
const SDLoc &SL, EVT VT) {