AMDGPU: Stay in WQM for non-intrinsic stores
Summary:
Two types of stores are possible in pixel shaders: stores to memory that are
explicitly requested at the API level, and stores that are an implementation
detail of register spilling or lowering of arrays.
For the first kind of store, we must ensure that helper pixels have no effect
and hence WQM must be disabled. The second kind of store must always be
executed, because the written value may be loaded again in a way that is
relevant for helper pixels as well -- and there are no externally visible
effects anyway.
This is a candidate for the 3.9 release branch.
Reviewers: arsenm, tstellarAMD, mareko
Subscribers: arsenm, kzhuravl, llvm-commits
Differential Revision: https://reviews.llvm.org/D22675
llvm-svn: 277504
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 54efdc0..f4b04e3 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -41,7 +41,8 @@
WQM = 1 << 22,
VGPRSpill = 1 << 23,
VOPAsmPrefer32Bit = 1 << 24,
- Gather4 = 1 << 25
+ Gather4 = 1 << 25,
+ DisableWQM = 1 << 26
};
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 4a9d8db..7641205 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -41,6 +41,8 @@
field bits<1> DS = 0;
field bits<1> MIMG = 0;
field bits<1> FLAT = 0;
+
+ // Whether WQM _must_ be enabled for this instruction.
field bits<1> WQM = 0;
field bits<1> VGPRSpill = 0;
@@ -50,6 +52,9 @@
field bits<1> Gather4 = 0;
+ // Whether WQM _must_ be disabled for this instruction.
+ field bits<1> DisableWQM = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = VM_CNT;
let TSFlags{1} = EXP_CNT;
@@ -81,6 +86,7 @@
let TSFlags{23} = VGPRSpill;
let TSFlags{24} = VOPAsmPrefer32Bit;
let TSFlags{25} = Gather4;
+ let TSFlags{26} = DisableWQM;
let SchedRW = [Write32Bit];
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 7ed7c83..4503466 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -340,6 +340,14 @@
return get(Opcode).TSFlags & SIInstrFlags::WQM;
}
+ static bool isDisableWQM(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM;
+ }
+
+ bool isDisableWQM(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::DisableWQM;
+ }
+
static bool isVGPRSpill(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 63de741..bbe1b5a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2723,6 +2723,10 @@
def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
MUBUFAddr64Table <0>;
+ let DisableWQM = 1 in {
+ def "_exact" : MUBUF_Pseudo <opName, outs, ins, []>;
+ }
+
let addr64 = 0, isCodeGenOnly = 0 in {
def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
}
@@ -2793,7 +2797,8 @@
multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
ValueType vt, SDPatternOperator atomic> {
- let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1 in {
+ let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1,
+ DisableWQM = 1 in {
// No return variants
let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in {
@@ -3197,6 +3202,7 @@
let mayStore = 1;
let hasSideEffects = 1;
let hasPostISelHook = 0;
+ let DisableWQM = 1;
}
multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
@@ -3228,6 +3234,7 @@
let mayStore = 1;
let hasSideEffects = 1;
let hasPostISelHook = 0;
+ let DisableWQM = 1;
let Constraints = "$vdst = $vdata";
let AsmMatchConverter = "cvtMIMGAtomic";
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f8db0b7..f6c2719 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2050,7 +2050,7 @@
(name vt:$vdata, v4i32:$rsrc, 0,
(MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
imm:$glc, imm:$slc),
- (!cast<MUBUF>(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset),
+ (!cast<MUBUF>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
(as_i1imm $glc), (as_i1imm $slc), 0)
>;
@@ -2058,7 +2058,7 @@
(name vt:$vdata, v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
imm:$glc, imm:$slc),
- (!cast<MUBUF>(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset,
+ (!cast<MUBUF>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
(as_i16imm $offset), (as_i1imm $glc),
(as_i1imm $slc), 0)
>;
@@ -2067,7 +2067,7 @@
(name vt:$vdata, v4i32:$rsrc, 0,
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
imm:$glc, imm:$slc),
- (!cast<MUBUF>(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset,
+ (!cast<MUBUF>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
(as_i16imm $offset), (as_i1imm $glc),
(as_i1imm $slc), 0)
>;
@@ -2076,7 +2076,7 @@
(name vt:$vdata, v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
imm:$glc, imm:$slc),
- (!cast<MUBUF>(opcode # _BOTHEN)
+ (!cast<MUBUF>(opcode # _BOTHEN_exact)
$vdata,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index cb35a05..c8bfc5a 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -185,7 +185,7 @@
if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
Flags = StateWQM;
- } else if (MI.mayStore() && TII->usesVM_CNT(MI)) {
+ } else if (TII->isDisableWQM(MI)) {
Flags = StateExact;
} else {
// Handle export instructions with the exec mask valid flag set
@@ -237,9 +237,10 @@
InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
BlockInfo &BI = Blocks[MBB];
- // Control flow-type instructions that are followed by WQM computations
- // must themselves be in WQM.
- if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) {
+ // Control flow-type instructions and stores to temporary memory that are
+ // followed by WQM computations must themselves be in WQM.
+ if ((II.OutNeeds & StateWQM) && !II.Needs &&
+ (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
Instructions[&MI].Needs = StateWQM;
II.Needs = StateWQM;
}