[AMDGPU] Packed thread ids in function call ABI
Differential Revision: https://reviews.llvm.org/D63851
llvm-svn: 364619
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index 81c3356..99a01ca3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -9,6 +9,7 @@
#include "AMDGPU.h"
#include "AMDGPUArgumentUsageInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -26,9 +27,16 @@
}
if (isRegister())
- OS << "Reg " << printReg(getRegister(), TRI) << '\n';
+ OS << "Reg " << printReg(getRegister(), TRI);
else
- OS << "Stack offset " << getStackOffset() << '\n';
+ OS << "Stack offset " << getStackOffset();
+
+ if (isMasked()) {
+ OS << " & ";
+ llvm::write_hex(OS, Mask, llvm::HexPrintStyle::PrefixLower);
+ }
+
+ OS << '\n';
}
char AMDGPUArgumentUsageInfo::ID = 0;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 277f361..ab0024b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -32,18 +32,27 @@
unsigned StackOffset;
};
+ // Bitmask to locate argument within the register.
+ unsigned Mask;
+
bool IsStack : 1;
bool IsSet : 1;
- ArgDescriptor(unsigned Val = 0, bool IsStack = false, bool IsSet = false)
- : Register(Val), IsStack(IsStack), IsSet(IsSet) {}
public:
- static ArgDescriptor createRegister(unsigned Reg) {
- return ArgDescriptor(Reg, false, true);
+ ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
+ bool IsStack = false, bool IsSet = false)
+ : Register(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
+
+ static ArgDescriptor createRegister(unsigned Reg, unsigned Mask = ~0u) {
+ return ArgDescriptor(Reg, Mask, false, true);
}
- static ArgDescriptor createStack(unsigned Reg) {
- return ArgDescriptor(Reg, true, true);
+ static ArgDescriptor createStack(unsigned Reg, unsigned Mask = ~0u) {
+ return ArgDescriptor(Reg, Mask, true, true);
+ }
+
+ static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
+ return ArgDescriptor(Arg.Register, Mask, Arg.IsStack, Arg.IsSet);
}
bool isSet() const {
@@ -68,6 +77,14 @@
return StackOffset;
}
+ unsigned getMask() const {
+ return Mask;
+ }
+
+ bool isMasked() const {
+ return Mask != ~0u;
+ }
+
void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d0af336..766294d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4233,9 +4233,19 @@
const ArgDescriptor &Arg) const {
assert(Arg && "Attempting to load missing argument");
- if (Arg.isRegister())
- return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
- return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
+ SDValue V = Arg.isRegister() ?
+ CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
+ loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
+
+ if (!Arg.isMasked())
+ return V;
+
+ unsigned Mask = Arg.getMask();
+ unsigned Shift = countTrailingZeros<unsigned>(Mask);
+ V = DAG.getNode(ISD::SRL, SL, VT, V,
+ DAG.getShiftAmountConstant(Shift, VT, SL));
+ return DAG.getNode(ISD::AND, SL, VT, V,
+ DAG.getConstant(Mask >> Shift, SL, VT));
}
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2500050..398f688 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1585,7 +1585,13 @@
// Try to allocate a VGPR at the end of the argument list, or if no argument
// VGPRs are left allocating a stack slot.
-static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
+// If \p Mask is given, it indicates the bitfield position in the register.
+// If \p Arg is given, use it with the new \p Mask instead of allocating anew.
+static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
+ ArgDescriptor Arg = ArgDescriptor()) {
+ if (Arg.isSet())
+ return ArgDescriptor::createArg(Arg, Mask);
+
ArrayRef<MCPhysReg> ArgVGPRs
= makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
@@ -1593,7 +1599,7 @@
// Spill to stack required.
int64_t Offset = CCInfo.AllocateStack(4, 4);
- return ArgDescriptor::createStack(Offset);
+ return ArgDescriptor::createStack(Offset, Mask);
}
unsigned Reg = ArgVGPRs[RegIdx];
@@ -1602,7 +1608,7 @@
MachineFunction &MF = CCInfo.getMachineFunction();
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
- return ArgDescriptor::createRegister(Reg);
+ return ArgDescriptor::createRegister(Reg, Mask);
}
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
@@ -1634,14 +1640,21 @@
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) {
- if (Info.hasWorkItemIDX())
- Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
+ const unsigned Mask = 0x3ff;
+ ArgDescriptor Arg;
- if (Info.hasWorkItemIDY())
- Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
+ if (Info.hasWorkItemIDX()) {
+ Arg = allocateVGPR32Input(CCInfo, Mask);
+ Info.setWorkItemIDX(Arg);
+ }
+
+ if (Info.hasWorkItemIDY()) {
+ Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
+ Info.setWorkItemIDY(Arg);
+ }
if (Info.hasWorkItemIDZ())
- Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
+ Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
}
static void allocateSpecialInputSGPRs(CCState &CCInfo,
@@ -2387,9 +2400,6 @@
AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
- AMDGPUFunctionArgInfo::WORKITEM_ID_X,
- AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
- AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
};
@@ -2429,6 +2439,71 @@
MemOpChains.push_back(ArgStore);
}
}
+
+ // Pack workitem IDs into a single register or pass it as is if already
+ // packed.
+ const ArgDescriptor *OutgoingArg;
+ const TargetRegisterClass *ArgRC;
+
+ std::tie(OutgoingArg, ArgRC) =
+ CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+ if (!OutgoingArg)
+ std::tie(OutgoingArg, ArgRC) =
+ CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+ if (!OutgoingArg)
+ std::tie(OutgoingArg, ArgRC) =
+ CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+ if (!OutgoingArg)
+ return;
+
+ const ArgDescriptor *IncomingArgX
+ = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first;
+ const ArgDescriptor *IncomingArgY
+ = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first;
+ const ArgDescriptor *IncomingArgZ
+ = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first;
+
+ SDValue InputReg;
+ SDLoc SL;
+
+ // If incoming ids are not packed we need to pack them.
+ if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX)
+ InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
+
+ if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) {
+ SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
+ Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
+ DAG.getShiftAmountConstant(10, MVT::i32, SL));
+ InputReg = InputReg.getNode() ?
+ DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
+ }
+
+ if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) {
+ SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
+ Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
+ DAG.getShiftAmountConstant(20, MVT::i32, SL));
+ InputReg = InputReg.getNode() ?
+ DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
+ }
+
+ if (!InputReg.getNode()) {
+ // Workitem ids are already packed; any of the present incoming arguments
+ // will carry all required fields.
+ ArgDescriptor IncomingArg = ArgDescriptor::createArg(
+ IncomingArgX ? *IncomingArgX :
+ IncomingArgY ? *IncomingArgY :
+ *IncomingArgZ, ~0u);
+ InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
+ }
+
+ if (OutgoingArg->isRegister()) {
+ RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+ } else {
+ unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4);
+ SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
+ SpecialArgOffset);
+ MemOpChains.push_back(ArgStore);
+ }
}
static bool canGuaranteeTCO(CallingConv::ID CC) {