AMDGPU: Dimension-aware image intrinsics

Summary:
These new image intrinsics encode the texture type as part of
their name and take each component of the address/coordinates as
an individual parameter.
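
A rough sketch of what a call to one of the new intrinsics might
look like (exact names, type mangling and operand order should be
checked against the intrinsic definitions added in this change):

  %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(
           i32 15,                           ; dmask
           float %s, float %t,               ; per-component coordinates
           <8 x i32> %rsrc, <4 x i32> %samp,
           i1 0,                             ; unorm
           i32 0,                            ; texfailctrl
           i32 0)                            ; cachepolicy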

This is a preparatory step for implementing the A16 feature, where
coordinates are passed as 16-bit floats or integers while the Z
compare value and texel offsets remain full dwords, making it
difficult or impossible for the old-style intrinsics to distinguish
between A16 on and off.
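
The old-style intrinsics pack the whole address into a single
vector; with per-component parameters, a hypothetical future A16
variant (not part of this change; the sketch below is purely
illustrative) can mix operand sizes:

  %v = call <4 x float> @llvm.amdgcn.image.sample.o.2d.v4f32.f16(
           i32 15, i32 %offset,              ; dmask, texel offset (full dword)
           half %s, half %t,                 ; 16-bit coordinates
           <8 x i32> %rsrc, <4 x i32> %samp,
           i1 0, i32 0, i32 0)               ; unorm, texfailctrl, cachepolicy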

Additionally, these intrinsics pass the texture-fail control
('texfailctrl') and cache policy ('cachepolicy') as i32 bit fields
to reduce operand clutter and allow for future extensibility.
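
Based on the patterns added here, bit 0 of 'cachepolicy' selects
GLC and bit 1 selects SLC, while 'texfailctrl' is currently only
matched as 0. A minimal sketch of an image load using the bit
fields (type mangling illustrative):

  %v = call float @llvm.amdgcn.image.load.1d.f32.i32(
           i32 1, i32 %x, <8 x i32> %rsrc,
           i32 0,  ; texfailctrl
           i32 1)  ; cachepolicy: bit 0 = GLC, bit 1 = SLC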

v2:
- gather4 supports 2darray images
- fix a bug with 1D images on SI

Change-Id: I099f309e0a394082a5901ea196c3967afb867f04

Reviewers: arsenm, rampitec, b-sumner

Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye

Differential Revision: https://reviews.llvm.org/D44939

llvm-svn: 329166
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index cb2064c..32118df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -29,6 +29,9 @@
 namespace AMDGPU {
 #define GET_RSRCINTRINSIC_IMPL
 #include "AMDGPUGenSearchableTables.inc"
+
+#define GET_D16IMAGEDIMINTRINSIC_IMPL
+#include "AMDGPUGenSearchableTables.inc"
 }
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 8f75b42..766ee3d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -63,6 +63,12 @@
 };
 const RsrcIntrinsic *lookupRsrcIntrinsicByIntr(unsigned Intr);
 
+struct D16ImageDimIntrinsic {
+  unsigned Intr;
+  unsigned D16HelperIntr;
+};
+const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsicByIntr(unsigned Intr);
+
 } // end AMDGPU namespace
 } // End llvm namespace
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 0b9fe07..fce7499 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -23,7 +23,11 @@
 }
 
 foreach intr = !listconcat(AMDGPUBufferIntrinsics,
-                           AMDGPUImageIntrinsics) in {
+                           AMDGPUImageIntrinsics,
+                           AMDGPUImageDimIntrinsics,
+                           AMDGPUImageDimGatherIntrinsics,
+                           AMDGPUImageDimGetResInfoIntrinsics,
+                           AMDGPUImageDimAtomicIntrinsics) in {
   def : RsrcIntrinsic<!cast<AMDGPURsrcIntrinsic>(intr)>;
 }
 
@@ -76,3 +80,20 @@
 def : SourceOfDivergence<int_amdgcn_buffer_atomic_cmpswap>;
 def : SourceOfDivergence<int_amdgcn_ps_live>;
 def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
+
+foreach intr = AMDGPUImageDimAtomicIntrinsics in
+def : SourceOfDivergence<intr>;
+
+class D16ImageDimIntrinsic<AMDGPUImageDimIntrinsic intr> : SearchableTable {
+  let SearchableFields = ["Intr"];
+  let EnumNameField = ?;
+
+  Intrinsic Intr = intr;
+  code D16HelperIntr =
+      !cast<code>("AMDGPUIntrinsic::SI_image_d16helper_" # intr.P.OpMod # intr.P.Dim.Name);
+}
+
+foreach intr = !listconcat(AMDGPUImageDimIntrinsics,
+                           AMDGPUImageDimGatherIntrinsics) in {
+  def : D16ImageDimIntrinsic<intr>;
+}
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 7a7b54e..2d2aaf7 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -464,6 +464,201 @@
 //def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
 }
 
+/********** ============================== **********/
+/********** Dimension-aware image patterns **********/
+/********** ============================== **********/
+
+class getDwordsType<int dwords> {
+  int NumDwords = dwords;
+  string suffix = !if(!lt(dwords, 1), ?,
+                  !if(!eq(dwords, 1), "_V1",
+                  !if(!eq(dwords, 2), "_V2",
+                  !if(!le(dwords, 4), "_V4",
+                  !if(!le(dwords, 8), "_V8",
+                  !if(!le(dwords, 16), "_V16", ?))))));
+  ValueType VT = !if(!lt(dwords, 1), ?,
+                 !if(!eq(dwords, 1), f32,
+                 !if(!eq(dwords, 2), v2f32,
+                 !if(!le(dwords, 4), v4f32,
+                 !if(!le(dwords, 8), v8f32,
+                 !if(!le(dwords, 16), v16f32, ?))))));
+  RegisterClass VReg = !if(!lt(dwords, 1), ?,
+                       !if(!eq(dwords, 1), VGPR_32,
+                       !if(!eq(dwords, 2), VReg_64,
+                       !if(!le(dwords, 4), VReg_128,
+                       !if(!le(dwords, 8), VReg_256,
+                       !if(!le(dwords, 16), VReg_512, ?))))));
+}
+
+class makeRegSequence_Fold<int i, dag d> {
+  int idx = i;
+  dag lhs = d;
+}
+
+// Generate a dag node which returns a vector register of class RC into which
+// the source operands given by names have been inserted (assuming that each
+// name corresponds to an operand whose size is equal to a subregister).
+class makeRegSequence<ValueType vt, RegisterClass RC, list<string> names> {
+  dag ret =
+    !if(!eq(!size(names), 1),
+        !dag(COPY_TO_REGCLASS, [?, RC], [names[0], ?]),
+        !foldl(makeRegSequence_Fold<0, (vt (IMPLICIT_DEF))>, names, f, name,
+               makeRegSequence_Fold<
+                 !add(f.idx, 1),
+                 !con((INSERT_SUBREG f.lhs),
+                      !dag(INSERT_SUBREG, [?,    !cast<SubRegIndex>("sub"#f.idx)],
+                                          [name, ?]))>).lhs);
+}
+
+class ImageDimPattern<AMDGPUImageDimIntrinsic I,
+                      string dop, ValueType dty,
+                      string suffix = ""> : GCNPat<(undef), (undef)> {
+  list<AMDGPUArg> AddrArgs = I.P.AddrDefaultArgs;
+  getDwordsType AddrDwords = getDwordsType<!size(AddrArgs)>;
+
+  Instruction MI =
+    !cast<Instruction>(!strconcat("IMAGE_", I.P.OpMod, dop, AddrDwords.suffix, suffix));
+
+  // DAG fragment to match data arguments (vdata for store/atomic, dmask
+  // for non-atomic).
+  dag MatchDataDag =
+    !con(!dag(I, !foreach(arg, I.P.DataArgs, dty),
+                 !foreach(arg, I.P.DataArgs, arg.Name)),
+         !if(I.P.IsAtomic, (I), (I i32:$dmask)));
+
+  // DAG fragment to match vaddr arguments.
+  dag MatchAddrDag = !dag(I, !foreach(arg, AddrArgs, arg.Type.VT),
+                             !foreach(arg, AddrArgs, arg.Name));
+
+  // DAG fragment to match sampler resource and unorm arguments.
+  dag MatchSamplerDag = !if(I.P.IsSample, (I v4i32:$sampler, i1:$unorm), (I));
+
+  // DAG node that generates the MI vdata for store/atomic
+  getDwordsType DataDwords = getDwordsType<!size(I.P.DataArgs)>;
+  dag GenDataDag =
+    !if(I.P.IsAtomic, (MI makeRegSequence<DataDwords.VT, DataDwords.VReg,
+                                          !foreach(arg, I.P.DataArgs, arg.Name)>.ret),
+    !if(!size(I.P.DataArgs), (MI $vdata), (MI)));
+
+  // DAG node that generates the MI vaddr
+  dag GenAddrDag = makeRegSequence<AddrDwords.VT, AddrDwords.VReg,
+                                   !foreach(arg, AddrArgs, arg.Name)>.ret;
+  // DAG fragments that generate various inline flags
+  dag GenDmask =
+    !if(I.P.IsAtomic, (MI !add(!shl(1, DataDwords.NumDwords), -1)),
+                      (MI (as_i32imm $dmask)));
+  dag GenGLC =
+    !if(I.P.IsAtomic, (MI 1),
+                      (MI (bitextract_imm<0> $cachepolicy)));
+
+  dag MatchIntrinsic = !con(MatchDataDag,
+                            MatchAddrDag,
+                            (I v8i32:$rsrc),
+                            MatchSamplerDag,
+                            (I 0/*texfailctrl*/,
+                               i32:$cachepolicy));
+  let PatternToMatch =
+    !if(!size(I.RetTypes), (dty MatchIntrinsic), MatchIntrinsic);
+
+  bit IsCmpSwap = !and(I.P.IsAtomic, !eq(!size(I.P.DataArgs), 2));
+  dag ImageInstruction =
+    !con(GenDataDag,
+         (MI GenAddrDag),
+         (MI $rsrc),
+         !if(I.P.IsSample, (MI $sampler), (MI)),
+         GenDmask,
+         !if(I.P.IsSample, (MI (as_i1imm $unorm)), (MI 1)),
+         GenGLC,
+         (MI (bitextract_imm<1> $cachepolicy),
+             0, /* r128 */
+             0, /* tfe */
+             0 /*(as_i1imm $lwe)*/,
+             { I.P.Dim.DA }));
+  let ResultInstrs = [
+    !if(IsCmpSwap, (EXTRACT_SUBREG ImageInstruction, sub0), ImageInstruction)
+  ];
+}
+
+foreach intr = !listconcat(AMDGPUImageDimIntrinsics,
+                           AMDGPUImageDimGetResInfoIntrinsics) in {
+  def intr#_pat_v1 : ImageDimPattern<intr, "_V1", f32>;
+  def intr#_pat_v2 : ImageDimPattern<intr, "_V2", v2f32>;
+  def intr#_pat_v4 : ImageDimPattern<intr, "_V4", v4f32>;
+}
+
+// v2f16 and v4f16 are used as data types to signal that D16 should be used.
+// However, they are not (always) legal types, and the SelectionDAG requires us
+// to legalize them before running any patterns. So we legalize them by
+// converting to an int type of equal size and using an internal 'd16helper'
+// intrinsic instead which signifies both the use of D16 and actually allows
+// this integer-based return type.
+multiclass ImageDimD16Helper<AMDGPUImageDimIntrinsic I,
+                             AMDGPUImageDimIntrinsic d16helper> {
+  let SubtargetPredicate = HasUnpackedD16VMem in {
+    def _unpacked_v1 : ImageDimPattern<I, "_V1", f16, "_D16_gfx80">;
+    def _unpacked_v2 : ImageDimPattern<d16helper, "_V2", v2i32, "_D16_gfx80">;
+    def _unpacked_v4 : ImageDimPattern<d16helper, "_V4", v4i32, "_D16_gfx80">;
+  } // End HasUnpackedD16VMem.
+
+  let SubtargetPredicate = HasPackedD16VMem in {
+    def _packed_v1 : ImageDimPattern<I, "_V1", f16, "_D16">;
+    // used on gfx810
+    def _packed_v2 : ImageDimPattern<d16helper, "_V1", i32, "_D16">;
+    // used on gfx900
+    def _packed_v2_gfx9 : ImageDimPattern<I, "_V1", v2f16, "_D16">;
+    def _packed_v4 : ImageDimPattern<d16helper, "_V2", v2i32, "_D16">;
+  } // End HasPackedD16VMem.
+}
+
+foreach intr = AMDGPUImageDimIntrinsics in {
+  def intr#_d16helper_profile : AMDGPUDimProfileCopy<intr.P> {
+    let RetTypes = !foreach(ty, intr.P.RetTypes, llvm_any_ty);
+    let DataArgs = !foreach(arg, intr.P.DataArgs, AMDGPUArg<llvm_any_ty, arg.Name>);
+  }
+
+  let TargetPrefix = "SI", isTarget = 1 in
+  def int_SI_image_d16helper_ # intr.P.OpMod # intr.P.Dim.Name :
+      AMDGPUImageDimIntrinsic<!cast<AMDGPUDimProfile>(intr#"_d16helper_profile"),
+                              intr.IntrProperties, intr.Properties>;
+
+  defm intr#_d16 :
+      ImageDimD16Helper<
+          intr, !cast<AMDGPUImageDimIntrinsic>(
+                    "int_SI_image_d16helper_" # intr.P.OpMod # intr.P.Dim.Name)>;
+}
+
+foreach intr = AMDGPUImageDimGatherIntrinsics in {
+  def intr#_pat3 : ImageDimPattern<intr, "_V4", v4f32>;
+
+  def intr#_d16helper_profile : AMDGPUDimProfileCopy<intr.P> {
+    let RetTypes = !foreach(ty, intr.P.RetTypes, llvm_any_ty);
+    let DataArgs = !foreach(arg, intr.P.DataArgs, AMDGPUArg<llvm_any_ty, arg.Name>);
+  }
+
+  let TargetPrefix = "SI", isTarget = 1 in
+  def int_SI_image_d16helper_ # intr.P.OpMod # intr.P.Dim.Name :
+      AMDGPUImageDimIntrinsic<!cast<AMDGPUDimProfile>(intr#"_d16helper_profile"),
+                              intr.IntrProperties, intr.Properties>;
+
+  let SubtargetPredicate = HasUnpackedD16VMem in {
+    def intr#_unpacked_v4 :
+        ImageDimPattern<!cast<AMDGPUImageDimIntrinsic>(
+                            "int_SI_image_d16helper_" # intr.P.OpMod # intr.P.Dim.Name),
+                        "_V4", v4i32, "_D16_gfx80">;
+  } // End HasUnpackedD16VMem.
+
+  let SubtargetPredicate = HasPackedD16VMem in {
+    def intr#_packed_v4 :
+        ImageDimPattern<!cast<AMDGPUImageDimIntrinsic>(
+                            "int_SI_image_d16helper_" # intr.P.OpMod # intr.P.Dim.Name),
+                        "_V2", v2i32, "_D16">;
+  } // End HasPackedD16VMem.
+}
+
+foreach intr = AMDGPUImageDimAtomicIntrinsics in {
+  def intr#_pat1 : ImageDimPattern<intr, "_V1", i32>;
+}
+
 /********** ======================= **********/
 /********** Image sampling patterns **********/
 /********** ======================= **********/
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2a7549e..6f68f63 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3677,9 +3677,23 @@
     Chain = Res.getValue(1);
     return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
   }
-  default:
+  default: {
+    const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
+        AMDGPU::lookupD16ImageDimIntrinsicByIntr(IID);
+    if (D16ImageDimIntr) {
+      SmallVector<SDValue, 20> Ops;
+      for (auto Value : Op.getNode()->op_values())
+        Ops.push_back(Value);
+      Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32);
+      Res = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTList, Ops,
+                                    M->getMemoryVT(), M->getMemOperand());
+      Chain = Res.getValue(1);
+      return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
+    }
+
     return SDValue();
   }
+  }
 }
 
 void SITargetLowering::ReplaceNodeResults(SDNode *N,
@@ -5151,9 +5165,32 @@
                                    M->getMemoryVT(), M->getMemOperand());
   }
 
-  default:
+  default: {
+    const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
+        AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrinsicID);
+    if (D16ImageDimIntr) {
+      SDValue VData = Op.getOperand(2);
+      EVT StoreVT = VData.getValueType();
+      if ((StoreVT == MVT::v2f16 && !isTypeLegal(StoreVT)) ||
+          StoreVT == MVT::v4f16) {
+        VData = handleD16VData(VData, DAG);
+
+        SmallVector<SDValue, 12> Ops;
+        for (auto Value : Op.getNode()->op_values())
+          Ops.push_back(Value);
+        Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32);
+        Ops[2] = VData;
+
+        MemSDNode *M = cast<MemSDNode>(Op);
+        return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Op->getVTList(),
+                                       Ops, M->getMemoryVT(),
+                                       M->getMemOperand());
+      }
+    }
+
     return Op;
   }
+  }
 }
 
 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 777bb0a..8797253 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -572,6 +572,12 @@
   N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);
 }]>;
 
+class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{
+  uint64_t Imm = N->getZExtValue();
+  unsigned Bit = (Imm >> }] # bitnum # [{ ) & 1;
+  return CurDAG->getTargetConstant(Bit, SDLoc(N), MVT::i1);
+}]>;
+
 def SIMM16bit : PatLeaf <(imm),
   [{return isInt<16>(N->getSExtValue());}]
 >;