AMDGPU: Initial implementation of calls
Calls are disabled by default and gated by the new, hidden
-amdgpu-function-calls flag (amdgcn only). Includes a hack to work
around the type selected for the GlobalAddress of the callee: address
space 0 pointers are still 32-bit, so the callee address is re-created
as an i64 global address during call lowering. This will be fixed
properly by changing the default datalayout to use 64-bit generic
pointers for address space 0.
llvm-svn: 309732
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 4bef7a8..c1c066f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -163,6 +163,10 @@
"AMDGPUSubtarget::SOUTHERN_ISLANDS",
CCDelegateTo<CC_SI>>,
CCIf<"static_cast<const AMDGPUSubtarget&>"
+ "(State.getMachineFunction().getSubtarget()).getGeneration() >= "
+ "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C",
+ CCDelegateTo<CC_AMDGPU_Func>>,
+ CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() < "
"AMDGPUSubtarget::SOUTHERN_ISLANDS",
CCDelegateTo<CC_R600>>
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 8e187c7..2329fff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -33,10 +33,6 @@
/// \returns The number of 32-bit sub-registers that are used when storing
/// values to the stack.
unsigned getStackWidth(const MachineFunction &MF) const;
-
- bool hasFP(const MachineFunction &MF) const override {
- return false;
- }
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 258b173..9aa0234 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -20,6 +20,7 @@
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
#include "R600MachineFunctionInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 69dc5298..36a60b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -30,7 +30,9 @@
void AMDGPUInstrInfo::anchor() {}
AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
- : AMDGPUGenInstrInfo(-1, -1), ST(ST), AMDGPUASI(ST.getAMDGPUAS()) {}
+ : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+ ST(ST),
+ AMDGPUASI(ST.getAMDGPUAS()) {}
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index bcf89bb..196e699 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -82,6 +82,22 @@
def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;
+def callseq_start : SDNode<"ISD::CALLSEQ_START",
+ SDCallSeqStart<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,
+ [SDNPHasChain, SDNPOutGlue]
+>;
+
+def callseq_end : SDNode<"ISD::CALLSEQ_END",
+ SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]
+>;
+
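+// Call to a function: produces no results, takes a variadic operand list,
+// and operand 0 is the callee pointer.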
+def AMDGPUcall : SDNode<"AMDGPUISD::CALL",
+ SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]
+>;
+
def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
[SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 63dd0d72..c665bc3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -121,6 +121,9 @@
MCOp = MCOperand::createExpr(Expr);
return true;
}
+ case MachineOperand::MO_RegisterMask:
+ // Regmasks are like implicit defs.
+ return false;
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
index ff58aa5..eafc778 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -56,6 +56,20 @@
}
}
+const MCPhysReg *
+SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
+ // FIXME: Rebuilding a static, null-terminated array on every query is a
+ // hack; only the per-function frame offset register needs to be reported.
+ static MCPhysReg Regs[2];
+
+ const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ assert(!MFI->isEntryFunction());
+
+ Regs[0] = MFI->getFrameOffsetReg();
+ Regs[1] = AMDGPU::NoRegister;
+
+ return Regs;
+}
+
const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
switch (CC) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 343bca5..571ee97 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -123,6 +123,12 @@
cl::init(false),
cl::Hidden);
+static cl::opt<bool> EnableAMDGPUFunctionCalls(
+ "amdgpu-function-calls",
+ cl::Hidden,
+ cl::desc("Enable AMDGPU function call support"),
+ cl::init(false));
+
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -269,6 +275,11 @@
AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
+bool AMDGPUTargetMachine::enableFunctionCalls() const {
+ return EnableAMDGPUFunctionCalls &&
+ getTargetTriple().getArch() == Triple::amdgcn;
+}
+
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
Attribute GPUAttr = F.getFnAttribute("target-cpu");
return GPUAttr.hasAttribute(Attribute::None) ?
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index a3c7c19..f388d8d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -69,6 +69,9 @@
return -1;
return 0;
}
+
+ LLVM_READONLY
+ bool enableFunctionCalls() const;
};
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/R600FrameLowering.h b/llvm/lib/Target/AMDGPU/R600FrameLowering.h
index 142f709..fe367d7 100644
--- a/llvm/lib/Target/AMDGPU/R600FrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/R600FrameLowering.h
@@ -27,6 +27,10 @@
MachineBasicBlock &MBB) const override {}
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
+
+ bool hasFP(const MachineFunction &MF) const override {
+ return false;
+ }
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 7334781..79bae0a 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -575,6 +575,41 @@
}
}
+MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ int64_t Amount = I->getOperand(0).getImm();
+ if (Amount == 0)
+ return MBB.erase(I);
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const DebugLoc &DL = I->getDebugLoc();
+ unsigned Opc = I->getOpcode();
+ bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
+ uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ if (!TFI->hasReservedCallFrame(MF)) {
+ unsigned Align = getStackAlignment();
+
+ Amount = alignTo(Amount, Align);
+ assert(isUInt<32>(Amount) && "exceeded stack address space size");
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned SPReg = MFI->getStackPtrOffsetReg();
+
+ unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
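+ // Scratch memory is swizzled per lane, so a per-thread stack adjustment
+ // of Amount bytes moves the wave-level offset by Amount * wavefront size.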
+ BuildMI(MBB, I, DL, TII->get(Op), SPReg)
+ .addReg(SPReg)
+ .addImm(Amount * ST.getWavefrontSize());
+ } else if (CalleePopAmount != 0) {
+ llvm_unreachable("is this used?");
+ }
+
+ return MBB.erase(I);
+}
+
void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index d4dfa1c..c23969d 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -39,6 +39,11 @@
MachineFunction &MF,
RegScavenger *RS = nullptr) const override;
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
+
private:
void emitFlatScratchInit(const SISubtarget &ST,
MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 247a011..6be94ba 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1201,9 +1201,13 @@
if (TM.getOptLevel() == CodeGenOpt::None)
HasStackObjects = true;
+ // For now assume stack access is needed in any callee functions, so we need
+ // the scratch registers to pass in.
+ bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
+
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
if (ST.isAmdCodeObjectV2(MF)) {
- if (HasStackObjects) {
+ if (RequiresStackAccess) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
@@ -1212,9 +1216,23 @@
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
- unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
- Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ if (MFI.hasCalls()) {
+ // If we have calls, we need to keep the frame register in a register
+ // that won't be clobbered by a call, so ensure it is copied somewhere.
+
+ // This is not a problem for the scratch wave offset, because the same
+ // registers are reserved in all functions.
+
+ // FIXME: Nothing is really ensuring this is a call preserved register,
+ // it's just selected from the end so it happens to be.
+ unsigned ReservedOffsetReg
+ = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+ Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+ } else {
+ unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ }
} else {
unsigned ReservedBufferReg
= TRI.reservedPrivateSegmentBufferReg(MF);
@@ -1237,7 +1255,7 @@
// offset is still in an input SGPR.
Info.setScratchRSrcReg(ReservedBufferReg);
- if (HasStackObjects) {
+ if (HasStackObjects && !MFI.hasCalls()) {
unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
@@ -1249,6 +1267,50 @@
}
}
+bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+ return !Info->isEntryFunction();
+}
+
+void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
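+ // Nothing to initialize here; the copies are created in
+ // insertCopiesSplitCSR.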
+
+}
+
+void SITargetLowering::insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+
+ const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+ if (!IStart)
+ return;
+
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+ MachineBasicBlock::iterator MBBI = Entry->begin();
+ for (const MCPhysReg *I = IStart; *I; ++I) {
+ const TargetRegisterClass *RC = nullptr;
+ if (AMDGPU::SReg_64RegClass.contains(*I))
+ RC = &AMDGPU::SGPR_64RegClass;
+ else if (AMDGPU::SReg_32RegClass.contains(*I))
+ RC = &AMDGPU::SGPR_32RegClass;
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+ unsigned NewVR = MRI->createVirtualRegister(RC);
+ // Create copy from CSR to a virtual register.
+ Entry->addLiveIn(*I);
+ BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+ .addReg(*I);
+
+ // Insert the copy-back instructions right before the terminator.
+ for (auto *Exit : Exits)
+ BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+ TII->get(TargetOpcode::COPY), *I)
+ .addReg(NewVR);
+ }
+}
+
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -1589,6 +1651,22 @@
}
// FIXME: Does sret work properly?
+ if (!Info->isEntryFunction()) {
+ const SIRegisterInfo *TRI
+ = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
+ const MCPhysReg *I =
+ TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+ if (I) {
+ for (; *I; ++I) {
+ if (AMDGPU::SReg_64RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+ else if (AMDGPU::SReg_32RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i32));
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+ }
+ }
+ }
// Update chain and glue.
RetOps[0] = Chain;
@@ -1601,6 +1679,296 @@
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
+SDValue SITargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
+ SDValue ThisVal) const {
+ CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign VA = RVLocs[i];
+ SDValue Val;
+
+ if (VA.isRegLoc()) {
+ Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
+ Chain = Val.getValue(1);
+ InFlag = Val.getValue(2);
+ } else if (VA.isMemLoc()) {
+ report_fatal_error("TODO: return values in memory");
+ } else
+ llvm_unreachable("unknown argument location type");
+
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::ZExt:
+ Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
+ DAG.getValueType(VA.getValVT()));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::SExt:
+ Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
+ DAG.getValueType(VA.getValVT()));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::AExt:
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
+ InVals.push_back(Val);
+ }
+
+ return Chain;
+}
+
+// The wave scratch offset register is used as the global base pointer.
+SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ const AMDGPUTargetMachine &TM =
+ static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
+ if (!TM.enableFunctionCalls())
+ return AMDGPUTargetLowering::LowerCall(CLI, InVals);
+
+ SelectionDAG &DAG = CLI.DAG;
+ const SDLoc &DL = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &IsTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+ bool IsSibCall = false;
+ bool IsThisReturn = false;
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // TODO: Implement tail calls.
+ IsTailCall = false;
+
+ if (IsVarArg || MF.getTarget().Options.GuaranteedTailCallOpt) {
+ report_fatal_error("varargs and tail calls not implemented");
+ }
+
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ // FIXME: Remove this hack for function pointer types.
+ const GlobalValue *GV = GA->getGlobal();
+ assert(Callee.getValueType() == MVT::i32);
+ Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(),
+ false, GA->getTargetFlags());
+ }
+
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
+ CCInfo.AnalyzeCallOperands(Outs, AssignFn);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ if (IsSibCall) {
+ // Since we're not changing the ABI to make this a tail call, the memory
+ // operands are already available in the caller's incoming argument space.
+ NumBytes = 0;
+ }
+
+ // FPDiff is the byte offset of the call's argument area from the callee's.
+ // Stores to callee stack arguments will be placed in FixedStackSlots offset
+ // by this amount for a tail call. In a sibling call it must be 0 because the
+ // caller will deallocate the entire stack and the callee still expects its
+ // arguments to begin at SP+0. Completely unused for non-tail calls.
+ int FPDiff = 0;
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ if (!IsSibCall) {
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
+
+ unsigned OffsetReg = Info->getScratchWaveOffsetReg();
+
+ // In the HSA case, this should be an identity copy.
+ SDValue ScratchRSrcReg
+ = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
+ RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+
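+ // For now the callee-side ABI is fixed: the scratch resource descriptor
+ // is passed in SGPR0-SGPR3 and the scratch wave offset in SGPR4.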
+ // TODO: Don't hardcode these registers; query them from the callee.
+ SDValue ScratchWaveOffsetReg
+ = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
+ RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
+ }
+
+ // Stack pointer relative accesses are done by changing the offset SGPR. This
+ // is just the VGPR offset component.
+ SDValue StackPtr = DAG.getConstant(0, DL, MVT::i32);
+
+ SmallVector<SDValue, 8> MemOpChains;
+ MVT PtrVT = MVT::i32;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
+ ++i, ++realArgIdx) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[realArgIdx];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::FPExt:
+ Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ int32_t Offset = LocMemOffset;
+
+ if (!IsTailCall) {
+ SDValue PtrOff = DAG.getTargetConstant(Offset, DL, MVT::i32);
+
+ DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
+ DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
+ }
+
+ if (Outs[i].Flags.isByVal()) {
+ SDValue SizeNode =
+ DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
+ SDValue Cpy = DAG.getMemcpy(
+ Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+ /*isVol = */ false, /*AlwaysInline = */ true,
+ /*isTailCall = */ false,
+ DstInfo, MachinePointerInfo());
+
+ MemOpChains.push_back(Cpy);
+ } else {
+ SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
+ MemOpChains.push_back(Store);
+ }
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (auto &RegToPass : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
+ RegToPass.second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // We don't usually want to end the call-sequence here because we would tidy
+ // the frame up *after* the call, however in the ABI-changing tail-call case
+ // we've carefully laid out the parameters so that when sp is reset they'll be
+ // in the correct location.
+ if (IsTailCall && !IsSibCall) {
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getTargetConstant(NumBytes, DL, MVT::i32),
+ DAG.getTargetConstant(0, DL, MVT::i32),
+ InFlag, DL);
+ InFlag = Chain.getValue(1);
+ }
+
+ std::vector<SDValue> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ if (IsTailCall) {
+ // Each tail call may have to adjust the stack by a different amount, so
+ // this information must travel along with the operation for eventual
+ // consumption by emitEpilogue.
+ Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
+ }
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (auto &RegToPass : RegsToPass) {
+ Ops.push_back(DAG.getRegister(RegToPass.first,
+ RegToPass.second.getValueType()));
+ }
+
+ // Add a register mask operand representing the call-preserved registers.
+
+ const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ // If we're doing a tail call, use a TC_RETURN here rather than an
+ // actual call instruction.
+ if (IsTailCall) {
+ MF.getFrameInfo().setHasTailCall();
+ llvm_unreachable("not implemented");
+ }
+
+ // Returns a chain and a flag for retval copy to use.
+ SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
+ Chain = Call.getValue(0);
+ InFlag = Call.getValue(1);
+
+ uint64_t CalleePopBytes = 0;
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(NumBytes, DL, MVT::i32),
+ DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
+ InFlag, DL);
+ if (!Ins.empty())
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+ InVals, IsThisReturn,
+ IsThisReturn ? OutVals[0] : SDValue());
+}
+
unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
unsigned Reg = StringSwitch<unsigned>(RegName)
@@ -2266,6 +2634,27 @@
MI.eraseFromParent();
return BB;
}
+ case AMDGPU::ADJCALLSTACKUP:
+ case AMDGPU::ADJCALLSTACKDOWN: {
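+ // The pseudos themselves are expanded in eliminateCallFramePseudoInstr;
+ // here we only attach implicit operands so the SP update is visible.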
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+ MachineInstrBuilder MIB(*MF, &MI);
+ MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
+ .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
+ return BB;
+ }
+ case AMDGPU::SI_CALL: {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
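+ // S_SWAPPC_B64 branches to the callee and writes the return address into
+ // ReturnAddrReg; forward the pseudo's operands to the real call.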
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_SWAPPC_B64), ReturnAddrReg);
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+ MIB.add(MI.getOperand(I));
+ MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+ MI.eraseFromParent();
+ return BB;
+ }
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
@@ -2931,13 +3320,16 @@
SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = GSD->getGlobal();
if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
- GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS)
+ GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
+ // FIXME: It isn't correct to rely on the type of the pointer. This should
+ // be removed when address space 0 is 64-bit.
+ !GV->getType()->getElementType()->isFunctionTy())
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
SDLoc DL(GSD);
- const GlobalValue *GV = GSD->getGlobal();
EVT PtrVT = Op.getValueType();
if (shouldEmitFixup(GV))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 8eec325..dbe7887 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -183,6 +183,12 @@
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+ bool supportSplitCSR(MachineFunction *MF) const override;
+ void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+ void insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -199,6 +205,15 @@
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &DL, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+ SDValue ThisVal) const;
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
unsigned getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f6b723d..c8b208e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -317,6 +317,45 @@
let DisableWQM = 1;
}
+// Return from a callable (non-entry) function.
+def SI_RETURN : SPseudoInstSI <
+ (outs), (ins), [],
+ "; return"> {
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+ let SchedRW = [WriteBranch];
+}
+
+// Pseudo for making a function call.
+def SI_CALL : SPseudoInstSI <
+ (outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)],
+ "; call $src0"> {
+ let Size = 4;
+ let isCall = 1;
+ let SchedRW = [WriteBranch];
+ let usesCustomInserter = 1;
+}
+
+def ADJCALLSTACKUP : SPseudoInstSI<
+ (outs), (ins i32imm:$amt0, i32imm:$amt1),
+ [(callseq_start timm:$amt0, timm:$amt1)],
+ "; adjcallstackup $amt0 $amt1"> {
+ let Size = 8; // Worst case. (s_add_u32 + constant)
+ let FixedSize = 1;
+ let hasSideEffects = 1;
+ let usesCustomInserter = 1;
+}
+
+def ADJCALLSTACKDOWN : SPseudoInstSI<
+ (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(callseq_end timm:$amt1, timm:$amt2)],
+ "; adjcallstackdown $amt1"> {
+ let Size = 8; // Worst case. (s_add_u32 + constant)
+ let hasSideEffects = 1;
+ let usesCustomInserter = 1;
+}
+
let Defs = [M0, EXEC],
UseNamedOperandTable = 1 in {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 8559c27..ea0bfb2 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -236,8 +236,15 @@
return true;
}
-bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
- return MF.getFrameInfo().hasStackObjects();
+bool SIRegisterInfo::requiresFrameIndexScavenging(
+ const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (MFI.hasStackObjects())
+ return true;
+
+ // May need to deal with callee saved registers.
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ return !Info->isEntryFunction();
}
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 600cc88..3a8dea2 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -63,6 +63,7 @@
BitVector getReservedRegs(const MachineFunction &MF) const override;
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 4049ecd..d685326 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -269,6 +269,18 @@
// Register classes used as source and destination
//===----------------------------------------------------------------------===//
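+// Placeholder classes for stack-related pseudo-registers; these are replaced
+// with real registers once the actual frame registers are known.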
+def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+ (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> {
+ let isAllocatable = 0;
+ let CopyCost = -1;
+}
+
+def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32,
+ (add PRIVATE_RSRC_REG)> {
+ let isAllocatable = 0;
+ let CopyCost = -1;
+}
+
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,