AMDGPU: Correct maximum possible private allocation size
We were assuming a much larger per-wave visible stack allocation
than the hardware and runtime actually allow:
https://github.com/RadeonOpenCompute/ROCR-Runtime/blob/faa3ae51388517353afcdaf9c16621f879ef0a59/src/core/runtime/amd_gpu_agent.cpp#L70
Based on this, we can assume the high 15 bits of a frame index or sret
are 0. The frame index value is the per-lane offset, so the maximum
frame index value is MAX_WAVE_SCRATCH / wavesize.
Remove the corresponding subtarget feature and option that made
this configurable.
llvm-svn: 361541
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 341ef73..9938eea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -458,13 +458,6 @@
def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
-def FeatureEnableHugePrivateBuffer : SubtargetFeature<
- "huge-private-buffer",
- "EnableHugePrivateBuffer",
- "true",
- "Enable private/scratch buffer sizes greater than 128 GB"
->;
-
def FeatureDumpCode : SubtargetFeature <"DumpCode",
"DumpCode",
"true",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index a88218f..09b806b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -190,7 +190,6 @@
EnableCuMode(false),
TrapHandler(false),
- EnableHugePrivateBuffer(false),
EnableLoadStoreOpt(false),
EnableUnsafeDSOffsetFolding(false),
EnableSIScheduler(false),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 1ef7262..34166aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -299,7 +299,6 @@
bool TrapHandler;
// Used as options.
- bool EnableHugePrivateBuffer;
bool EnableLoadStoreOpt;
bool EnableUnsafeDSOffsetFolding;
bool EnableSIScheduler;
@@ -377,6 +376,9 @@
SITargetLowering TLInfo;
SIFrameLowering FrameLowering;
+ // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword.
+ static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
+
public:
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
const GCNTargetMachine &TM);
@@ -436,6 +438,11 @@
return Log2_32(WavefrontSize);
}
+ /// Return the number of high bits known to be zero for a frame index.
+ unsigned getKnownHighZeroBitsForFrameIndex() const {
+ return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
+ }
+
int getLDSBankCount() const {
return LDSBankCount;
}
@@ -526,10 +533,6 @@
return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
}
- bool enableHugePrivateBuffer() const {
- return EnableHugePrivateBuffer;
- }
-
bool unsafeDSOffsetFoldingEnabled() const {
return EnableUnsafeDSOffsetFolding;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c4c0e40..c2cda5e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -93,12 +93,6 @@
cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
cl::init(false));
-static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
- "amdgpu-frame-index-zero-bits",
- cl::desc("High bits of frame index assumed to be zero"),
- cl::init(5),
- cl::ReallyHidden);
-
static cl::opt<bool> DisableLoopAlignment(
"amdgpu-disable-loop-alignment",
cl::desc("Do not align and prefetch loops"),
@@ -2059,13 +2053,14 @@
Reg = MF.addLiveIn(Reg, RC);
SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
- if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
+ if (Arg.Flags.isSRet()) {
// The return object should be reasonably addressable.
// FIXME: This helps when the return is a real sret. If it is a
// automatically inserted sret (i.e. CanLowerReturn returns false), an
// extra copy is inserted in SelectionDAGBuilder which obscures this.
- unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
+ unsigned NumBits
+ = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
}
@@ -9970,14 +9965,10 @@
TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
DAG, Depth);
- if (getSubtarget()->enableHugePrivateBuffer())
- return;
-
- // Technically it may be possible to have a dispatch with a single workitem
- // that uses the full private memory size, but that's not really useful. We
- // can't use vaddr in MUBUF instructions if we don't know the address
+ // Set the high bits to zero based on the maximum allowed scratch size per
+ // wave. We can't use vaddr in MUBUF instructions if we don't know the address
// calculation won't overflow, so assume the sign bit is never set.
- Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
+ Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}
unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {