AMDGPU: Dimension-aware image intrinsics
Summary:
These new image intrinsics contain the texture type as part of
their name and take each component of the address/coordinate as an
individual parameter.
This is a preparatory step for implementing the A16 feature, where
coordinates are passed as half-floats or half-ints while the Z compare
value and texel offsets remain full dwords, which makes it difficult
or impossible to distinguish between A16 on and off with the old-style
intrinsics.
Additionally, these intrinsics pass 'texfailctrl' and 'cachepolicy'
as i32 bit fields to reduce operand clutter and allow for future
extensibility.
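
For illustration, a dimension-aware sample intrinsic call might look
roughly like this (a sketch only: the '2d' dimension and the type
mangling in the name are examples, and the exact signatures are given
by the intrinsic definitions rather than by this excerpt):

  %v = call <4 x float>
      @llvm.amdgcn.image.sample.2d.v4f32.f32(
          i32 15,                      ; dmask
          float %s, float %t,          ; individual coordinate components
          <8 x i32> %rsrc, <4 x i32> %sampler,
          i1 false,                    ; unorm
          i32 0, i32 0)                ; texfailctrl, cachepolicy bit fields
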
v2:
- gather4 supports 2darray images
- fix a bug with 1D images on SI
Change-Id: I099f309e0a394082a5901ea196c3967afb867f04
Reviewers: arsenm, rampitec, b-sumner
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D44939
llvm-svn: 329166
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 7a7b54e..2d2aaf7 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -464,6 +464,201 @@
//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
}
+/********** ============================== **********/
+/********** Dimension-aware image patterns **********/
+/********** ============================== **********/
+
+class getDwordsType<int dwords> {
+ int NumDwords = dwords;
+ string suffix = !if(!lt(dwords, 1), ?,
+ !if(!eq(dwords, 1), "_V1",
+ !if(!eq(dwords, 2), "_V2",
+ !if(!le(dwords, 4), "_V4",
+ !if(!le(dwords, 8), "_V8",
+ !if(!le(dwords, 16), "_V16", ?))))));
+ ValueType VT = !if(!lt(dwords, 1), ?,
+ !if(!eq(dwords, 1), f32,
+ !if(!eq(dwords, 2), v2f32,
+ !if(!le(dwords, 4), v4f32,
+ !if(!le(dwords, 8), v8f32,
+ !if(!le(dwords, 16), v16f32, ?))))));
+ RegisterClass VReg = !if(!lt(dwords, 1), ?,
+ !if(!eq(dwords, 1), VGPR_32,
+ !if(!eq(dwords, 2), VReg_64,
+ !if(!le(dwords, 4), VReg_128,
+ !if(!le(dwords, 8), VReg_256,
+ !if(!le(dwords, 16), VReg_512, ?))))));
+}
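+// For illustration (derived from the definitions above): getDwordsType<3>
+// rounds three dwords up to the next supported register width, yielding
+// suffix = "_V4", VT = v4f32 and VReg = VReg_128.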
+
+class makeRegSequence_Fold<int i, dag d> {
+ int idx = i;
+ dag lhs = d;
+}
+
+// Generate a dag node which returns a vector register of class RC into which
+// the source operands given by names have been inserted (assuming that each
+// name corresponds to an operand whose size is equal to a subregister).
+class makeRegSequence<ValueType vt, RegisterClass RC, list<string> names> {
+ dag ret =
+ !if(!eq(!size(names), 1),
+ !dag(COPY_TO_REGCLASS, [?, RC], [names[0], ?]),
+ !foldl(makeRegSequence_Fold<0, (vt (IMPLICIT_DEF))>, names, f, name,
+ makeRegSequence_Fold<
+ !add(f.idx, 1),
+ !con((INSERT_SUBREG f.lhs),
+ !dag(INSERT_SUBREG, [?, !cast<SubRegIndex>("sub"#f.idx)],
+ [name, ?]))>).lhs);
+}
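+// For illustration (a sketch using hypothetical operand names $s and $t):
+//   makeRegSequence<v2f32, VReg_64, ["s", "t"]>.ret
+// expands to
+//   (INSERT_SUBREG (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), $s, sub0), $t, sub1)
+// whereas a single-element list is emitted as a plain COPY_TO_REGCLASS.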
+
+class ImageDimPattern<AMDGPUImageDimIntrinsic I,
+ string dop, ValueType dty,
+ string suffix = ""> : GCNPat<(undef), (undef)> {
+ list<AMDGPUArg> AddrArgs = I.P.AddrDefaultArgs;
+ getDwordsType AddrDwords = getDwordsType<!size(AddrArgs)>;
+
+ Instruction MI =
+ !cast<Instruction>(!strconcat("IMAGE_", I.P.OpMod, dop, AddrDwords.suffix, suffix));
+
+ // DAG fragment to match data arguments (vdata for store/atomic, dmask
+ // for non-atomic).
+ dag MatchDataDag =
+ !con(!dag(I, !foreach(arg, I.P.DataArgs, dty),
+ !foreach(arg, I.P.DataArgs, arg.Name)),
+ !if(I.P.IsAtomic, (I), (I i32:$dmask)));
+
+ // DAG fragment to match vaddr arguments.
+ dag MatchAddrDag = !dag(I, !foreach(arg, AddrArgs, arg.Type.VT),
+ !foreach(arg, AddrArgs, arg.Name));
+
+ // DAG fragment to match sampler resource and unorm arguments.
+ dag MatchSamplerDag = !if(I.P.IsSample, (I v4i32:$sampler, i1:$unorm), (I));
+
+ // DAG node that generates the MI vdata for store/atomic
+ getDwordsType DataDwords = getDwordsType<!size(I.P.DataArgs)>;
+ dag GenDataDag =
+ !if(I.P.IsAtomic, (MI makeRegSequence<DataDwords.VT, DataDwords.VReg,
+ !foreach(arg, I.P.DataArgs, arg.Name)>.ret),
+ !if(!size(I.P.DataArgs), (MI $vdata), (MI)));
+
+ // DAG node that generates the MI vaddr
+ dag GenAddrDag = makeRegSequence<AddrDwords.VT, AddrDwords.VReg,
+ !foreach(arg, AddrArgs, arg.Name)>.ret;
+ // DAG fragments that generate various inline flags
+ dag GenDmask =
+ !if(I.P.IsAtomic, (MI !add(!shl(1, DataDwords.NumDwords), -1)),
+ (MI (as_i32imm $dmask)));
+ dag GenGLC =
+ !if(I.P.IsAtomic, (MI 1),
+ (MI (bitextract_imm<0> $cachepolicy)));
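+ // 'cachepolicy' is an i32 bit field: bit 0 supplies glc here, and bit 1 is
+ // extracted as slc when the instruction operands are assembled below.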
+
+ dag MatchIntrinsic = !con(MatchDataDag,
+ MatchAddrDag,
+ (I v8i32:$rsrc),
+ MatchSamplerDag,
+ (I 0/*texfailctrl*/,
+ i32:$cachepolicy));
+ let PatternToMatch =
+ !if(!size(I.RetTypes), (dty MatchIntrinsic), MatchIntrinsic);
+
+ bit IsCmpSwap = !and(I.P.IsAtomic, !eq(!size(I.P.DataArgs), 2));
+ dag ImageInstruction =
+ !con(GenDataDag,
+ (MI GenAddrDag),
+ (MI $rsrc),
+ !if(I.P.IsSample, (MI $sampler), (MI)),
+ GenDmask,
+ !if(I.P.IsSample, (MI (as_i1imm $unorm)), (MI 1)),
+ GenGLC,
+ (MI (bitextract_imm<1> $cachepolicy),
+ 0, /* r128 */
+ 0, /* tfe */
+ 0 /*(as_i1imm $lwe)*/,
+ { I.P.Dim.DA }));
+ let ResultInstrs = [
+ !if(IsCmpSwap, (EXTRACT_SUBREG ImageInstruction, sub0), ImageInstruction)
+ ];
+}
+
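+// Each dimension-aware intrinsic gets one pattern per supported result width
+// (one, two or four dwords); the pattern that fires is selected by the return
+// type of the intrinsic call.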
+foreach intr = !listconcat(AMDGPUImageDimIntrinsics,
+ AMDGPUImageDimGetResInfoIntrinsics) in {
+ def intr#_pat_v1 : ImageDimPattern<intr, "_V1", f32>;
+ def intr#_pat_v2 : ImageDimPattern<intr, "_V2", v2f32>;
+ def intr#_pat_v4 : ImageDimPattern<intr, "_V4", v4f32>;
+}
+
+// v2f16 and v4f16 are used as data types to signal that D16 should be used.
+// However, they are not (always) legal types, and the SelectionDAG requires us
+// to legalize them before running any patterns. So we legalize them by
+// converting to an int type of equal size and using an internal 'd16helper'
+// intrinsic instead which signifies both the use of D16 and actually allows
+// this integer-based return type.
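+// For example, a v4f16 result is matched as v4i32 (one half per dword) by the
+// unpacked-D16 patterns below, but as v2i32 (two halves per dword) by the
+// packed-D16 patterns.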
+multiclass ImageDimD16Helper<AMDGPUImageDimIntrinsic I,
+ AMDGPUImageDimIntrinsic d16helper> {
+ let SubtargetPredicate = HasUnpackedD16VMem in {
+ def _unpacked_v1 : ImageDimPattern<I, "_V1", f16, "_D16_gfx80">;
+ def _unpacked_v2 : ImageDimPattern<d16helper, "_V2", v2i32, "_D16_gfx80">;
+ def _unpacked_v4 : ImageDimPattern<d16helper, "_V4", v4i32, "_D16_gfx80">;
+ } // End HasUnpackedD16VMem.
+
+ let SubtargetPredicate = HasPackedD16VMem in {
+ def _packed_v1 : ImageDimPattern<I, "_V1", f16, "_D16">;
+ // used on gfx810
+ def _packed_v2 : ImageDimPattern<d16helper, "_V1", i32, "_D16">;
+ // used on gfx900
+ def _packed_v2_gfx9 : ImageDimPattern<I, "_V1", v2f16, "_D16">;
+ def _packed_v4 : ImageDimPattern<d16helper, "_V2", v2i32, "_D16">;
+ } // End HasPackedD16VMem.
+}
+
+foreach intr = AMDGPUImageDimIntrinsics in {
+ def intr#_d16helper_profile : AMDGPUDimProfileCopy<intr.P> {
+ let RetTypes = !foreach(ty, intr.P.RetTypes, llvm_any_ty);
+ let DataArgs = !foreach(arg, intr.P.DataArgs, AMDGPUArg<llvm_any_ty, arg.Name>);
+ }
+
+ let TargetPrefix = "SI", isTarget = 1 in
+ def int_SI_image_d16helper_ # intr.P.OpMod # intr.P.Dim.Name :
+ AMDGPUImageDimIntrinsic<!cast<AMDGPUDimProfile>(intr#"_d16helper_profile"),
+ intr.IntrProperties, intr.Properties>;
+
+ defm intr#_d16 :
+ ImageDimD16Helper<
+ intr, !cast<AMDGPUImageDimIntrinsic>(
+ "int_SI_image_d16helper_" # intr.P.OpMod # intr.P.Dim.Name)>;
+}
+
+foreach intr = AMDGPUImageDimGatherIntrinsics in {
+ def intr#_pat3 : ImageDimPattern<intr, "_V4", v4f32>;
+
+ def intr#_d16helper_profile : AMDGPUDimProfileCopy<intr.P> {
+ let RetTypes = !foreach(ty, intr.P.RetTypes, llvm_any_ty);
+ let DataArgs = !foreach(arg, intr.P.DataArgs, AMDGPUArg<llvm_any_ty, arg.Name>);
+ }
+
+ let TargetPrefix = "SI", isTarget = 1 in
+ def int_SI_image_d16helper_ # intr.P.OpMod # intr.P.Dim.Name :
+ AMDGPUImageDimIntrinsic<!cast<AMDGPUDimProfile>(intr#"_d16helper_profile"),
+ intr.IntrProperties, intr.Properties>;
+
+ let SubtargetPredicate = HasUnpackedD16VMem in {
+ def intr#_unpacked_v4 :
+ ImageDimPattern<!cast<AMDGPUImageDimIntrinsic>(
+ "int_SI_image_d16helper_" # intr.P.OpMod # intr.P.Dim.Name),
+ "_V4", v4i32, "_D16_gfx80">;
+ } // End HasUnpackedD16VMem.
+
+ let SubtargetPredicate = HasPackedD16VMem in {
+ def intr#_packed_v4 :
+ ImageDimPattern<!cast<AMDGPUImageDimIntrinsic>(
+ "int_SI_image_d16helper_" # intr.P.OpMod # intr.P.Dim.Name),
+ "_V2", v2i32, "_D16">;
+ } // End HasPackedD16VMem.
+}
+
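+// Image atomics return a single dword, so only a _V1/i32 pattern is needed.
+// For cmpswap, which takes two data dwords, only sub0 of the result register
+// is returned (see IsCmpSwap above).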
+foreach intr = AMDGPUImageDimAtomicIntrinsics in {
+ def intr#_pat1 : ImageDimPattern<intr, "_V1", i32>;
+}
+
/********** ======================= **********/
/********** Image sampling patterns **********/
/********** ======================= **********/