AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills
If some lanes weren't active on entry to the function, spilling the
VGPR without first activating all lanes could clobber their VGPR values.
llvm-svn: 361655
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index d2dd349..1eea77b 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -523,22 +523,20 @@
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
-static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
- MachineFunction *MF = MBB.getParent();
-
- const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
+static unsigned findScratchNonCalleeSaveRegister(MachineFunction &MF,
+ LivePhysRegs &LiveRegs,
+ const TargetRegisterClass &RC) {
+ const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
- LivePhysRegs LiveRegs(TRI);
- LiveRegs.addLiveIns(MBB);
// Mark callee saved registers as used so we will not choose them.
- const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
+ const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
for (unsigned i = 0; CSRegs[i]; ++i)
LiveRegs.addReg(CSRegs[i]);
- MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
- for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
+ for (unsigned Reg : RC) {
if (LiveRegs.available(MRI, Reg))
return Reg;
}
@@ -561,6 +559,7 @@
unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
+ LivePhysRegs LiveRegs;
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL;
@@ -578,7 +577,12 @@
RoundedSize += Alignment;
- unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveIns(MBB);
+
+ unsigned ScratchSPReg
+ = findScratchNonCalleeSaveRegister(MF, LiveRegs,
+ AMDGPU::SReg_32_XM0RegClass);
assert(ScratchSPReg != AMDGPU::NoRegister);
// s_add_u32 tmp_reg, s32, NumBytes
@@ -609,13 +613,33 @@
.setMIFlag(MachineInstr::FrameSetup);
}
- for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
- : FuncInfo->getSGPRSpillVGPRs()) {
- if (!Reg.FI.hasValue())
- continue;
- TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
- Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
- &TII->getRegisterInfo());
+ if (!FuncInfo->getSGPRSpillVGPRs().empty()) {
+ if (LiveRegs.empty()) {
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveIns(MBB);
+ }
+
+ // To avoid clobbering VGPRs in lanes that weren't active on function entry,
+ // turn on all lanes before doing the spill to memory.
+ unsigned ScratchExecCopy
+ = findScratchNonCalleeSaveRegister(MF, LiveRegs,
+ AMDGPU::SReg_64_XEXECRegClass);
+
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), ScratchExecCopy)
+ .addImm(-1);
+
+ for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+ : FuncInfo->getSGPRSpillVGPRs()) {
+ if (!Reg.FI.hasValue())
+ continue;
+ TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
+ Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
+ &TII->getRegisterInfo());
+ }
+
+ // FIXME: Split block and make terminator.
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addReg(ScratchExecCopy);
}
}
@@ -628,14 +652,32 @@
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ DebugLoc DL;
- for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
- : FuncInfo->getSGPRSpillVGPRs()) {
- if (!Reg.FI.hasValue())
- continue;
- TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
- Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
- &TII->getRegisterInfo());
+ if (!FuncInfo->getSGPRSpillVGPRs().empty()) {
+ // See emitPrologue
+ LivePhysRegs LiveRegs(*ST.getRegisterInfo());
+ LiveRegs.addLiveIns(MBB);
+
+ unsigned ScratchExecCopy
+ = findScratchNonCalleeSaveRegister(MF, LiveRegs,
+ AMDGPU::SReg_64_XEXECRegClass);
+
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), ScratchExecCopy)
+ .addImm(-1);
+
+ for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+ : FuncInfo->getSGPRSpillVGPRs()) {
+ if (!Reg.FI.hasValue())
+ continue;
+ TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
+ Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
+ &TII->getRegisterInfo());
+ }
+
+ // FIXME: Split block and make terminator.
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addReg(ScratchExecCopy);
}
unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
@@ -645,8 +687,6 @@
const MachineFrameInfo &MFI = MF.getFrameInfo();
uint32_t NumBytes = MFI.getStackSize();
- DebugLoc DL;
-
// FIXME: Clarify distinction between no set SP and SP. For callee functions,
// it's really whether we need SP to be accurate or not.