AMDGPU: Stay in WQM for non-intrinsic stores

Summary:
Two types of stores are possible in pixel shaders: stores to memory that are explicitly requested at the API level, and stores that are an implementation detail of register spilling or lowering of arrays.

For the first kind of store, we must ensure that helper pixels have no effect and hence WQM must be disabled. The second kind of store must always be executed, because the written value may be loaded again in a way that is relevant for helper pixels as well -- and there are no externally visible effects anyway.

This is a candidate for the 3.9 release branch.

Reviewers: arsenm, tstellarAMD, mareko

Subscribers: arsenm, kzhuravl, llvm-commits

Differential Revision: https://reviews.llvm.org/D22675

llvm-svn: 277504

commit: 8a482b33fed526b17a63e4539ca3036a89aea579 [log] [tgz]
author: Nicolai Haehnle <nhaehnle@gmail.com> Tue Aug 02 19:31:14 2016 +0000
committer: Nicolai Haehnle <nhaehnle@gmail.com> Tue Aug 02 19:31:14 2016 +0000
tree: 246d55a61c69f1a3d98cb04230b0505da4058b39
parent: 3a9f2a5a8d1f3a01086b2816359f76479d1ec58e [diff]
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 54efdc0..f4b04e3 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h

@@ -41,7 +41,8 @@
   WQM = 1 << 22,
   VGPRSpill = 1 << 23,
   VOPAsmPrefer32Bit = 1 << 24,
-  Gather4 = 1 << 25
+  Gather4 = 1 << 25,
+  DisableWQM = 1 << 26
 };
 }
 

diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 4a9d8db..7641205 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td

@@ -41,6 +41,8 @@
   field bits<1> DS = 0;
   field bits<1> MIMG = 0;
   field bits<1> FLAT = 0;
+
+  // Whether WQM _must_ be enabled for this instruction.
   field bits<1> WQM = 0;
   field bits<1> VGPRSpill = 0;
 
@@ -50,6 +52,9 @@
 
   field bits<1> Gather4 = 0;
 
+  // Whether WQM _must_ be disabled for this instruction.
+  field bits<1> DisableWQM = 0;
+
   // These need to be kept in sync with the enum in SIInstrFlags.
   let TSFlags{0} = VM_CNT;
   let TSFlags{1} = EXP_CNT;
@@ -81,6 +86,7 @@
   let TSFlags{23} = VGPRSpill;
   let TSFlags{24} = VOPAsmPrefer32Bit;
   let TSFlags{25} = Gather4;
+  let TSFlags{26} = DisableWQM;
 
   let SchedRW = [Write32Bit];
 

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 7ed7c83..4503466 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h

@@ -340,6 +340,14 @@
     return get(Opcode).TSFlags & SIInstrFlags::WQM;
   }
 
+  static bool isDisableWQM(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM;
+  }
+
+  bool isDisableWQM(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::DisableWQM;
+  }
+
   static bool isVGPRSpill(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill;
   }

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 63de741..bbe1b5a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td

@@ -2723,6 +2723,10 @@
   def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
            MUBUFAddr64Table <0>;
 
+  let DisableWQM = 1 in {
+    def "_exact" : MUBUF_Pseudo <opName, outs, ins, []>;
+  }
+
   let addr64 = 0, isCodeGenOnly = 0 in {
     def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
   }
@@ -2793,7 +2797,8 @@
 multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
                          ValueType vt, SDPatternOperator atomic> {
 
-  let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1 in {
+  let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1,
+      DisableWQM = 1 in {
 
     // No return variants
     let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in {
@@ -3197,6 +3202,7 @@
   let mayStore = 1;
   let hasSideEffects = 1;
   let hasPostISelHook = 0;
+  let DisableWQM = 1;
 }
 
 multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
@@ -3228,6 +3234,7 @@
   let mayStore = 1;
   let hasSideEffects = 1;
   let hasPostISelHook = 0;
+  let DisableWQM = 1;
   let Constraints = "$vdst = $vdata";
   let AsmMatchConverter = "cvtMIMGAtomic";
 }

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f8db0b7..f6c2719 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td

@@ -2050,7 +2050,7 @@
     (name vt:$vdata, v4i32:$rsrc, 0,
           (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
           imm:$glc, imm:$slc),
-    (!cast<MUBUF>(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset),
+    (!cast<MUBUF>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
                                     (as_i1imm $glc), (as_i1imm $slc), 0)
   >;
 
@@ -2058,7 +2058,7 @@
     (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
           (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
           imm:$glc, imm:$slc),
-    (!cast<MUBUF>(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset,
+    (!cast<MUBUF>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
                                    (as_i16imm $offset), (as_i1imm $glc),
                                    (as_i1imm $slc), 0)
   >;
@@ -2067,7 +2067,7 @@
     (name vt:$vdata, v4i32:$rsrc, 0,
           (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
           imm:$glc, imm:$slc),
-    (!cast<MUBUF>(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset,
+    (!cast<MUBUF>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
                                    (as_i16imm $offset), (as_i1imm $glc),
                                    (as_i1imm $slc), 0)
   >;
@@ -2076,7 +2076,7 @@
     (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
           (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
           imm:$glc, imm:$slc),
-    (!cast<MUBUF>(opcode # _BOTHEN)
+    (!cast<MUBUF>(opcode # _BOTHEN_exact)
       $vdata,
       (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
       $rsrc, $soffset, (as_i16imm $offset),

diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index cb35a05..c8bfc5a 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

@@ -185,7 +185,7 @@
 
       if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
         Flags = StateWQM;
-      } else if (MI.mayStore() && TII->usesVM_CNT(MI)) {
+      } else if (TII->isDisableWQM(MI)) {
         Flags = StateExact;
       } else {
         // Handle export instructions with the exec mask valid flag set
@@ -237,9 +237,10 @@
   InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
   BlockInfo &BI = Blocks[MBB];
 
-  // Control flow-type instructions that are followed by WQM computations
-  // must themselves be in WQM.
-  if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) {
+  // Control flow-type instructions and stores to temporary memory that are
+  // followed by WQM computations must themselves be in WQM.
+  if ((II.OutNeeds & StateWQM) && !II.Needs &&
+      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
     Instructions[&MI].Needs = StateWQM;
     II.Needs = StateWQM;
   }
commit	8a482b33fed526b17a63e4539ca3036a89aea579	[log] [tgz]
author	Nicolai Haehnle <nhaehnle@gmail.com>	Tue Aug 02 19:31:14 2016 +0000
committer	Nicolai Haehnle <nhaehnle@gmail.com>	Tue Aug 02 19:31:14 2016 +0000
tree	246d55a61c69f1a3d98cb04230b0505da4058b39
parent	3a9f2a5a8d1f3a01086b2816359f76479d1ec58e [diff]