[AMDGPU] Optimize SI_IF lowering for simple if regions
Currently SI_IF results in an s_and_saveexec_b64 followed by an s_xor_b64.
The xor is used to extract only the changed bits. In the case of a simple
if region, where the only use of that value is in the SI_END_CF that
restores the old exec mask, we can omit the xor and have SI_END_CF
perform an or of the exec mask with the original exec value saved by
the s_and_saveexec_b64.
Differential Revision: https://reviews.llvm.org/D35861
llvm-svn: 309185
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 5f1c7f1..de86c19 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -149,9 +149,19 @@
MachineOperand &ImpDefSCC = MI.getOperand(4);
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
+ // If there is only one use of the save exec register and that use is
+ // SI_END_CF, we can optimize SI_IF by returning the full saved exec mask
+ // instead of just the cleared bits.
+ bool SimpleIf = false;
+ auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
+ SimpleIf = U != MRI->use_instr_nodbg_end() &&
+ std::next(U) == MRI->use_instr_nodbg_end() &&
+ U->getOpcode() == AMDGPU::SI_END_CF;
+
// Add an implicit def of exec to discourage scheduling VALU after this which
// will interfere with trying to form s_and_saveexec_b64 later.
- unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned CopyReg = SimpleIf ? SaveExecReg
+ : MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
MachineInstr *CopyExec =
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
.addReg(AMDGPU::EXEC)
@@ -166,11 +176,14 @@
.addReg(Cond.getReg());
setImpSCCDefDead(*And, true);
- MachineInstr *Xor =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
- .addReg(Tmp)
- .addReg(CopyReg);
- setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+ MachineInstr *Xor = nullptr;
+ if (!SimpleIf) {
+ Xor =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+ .addReg(Tmp)
+ .addReg(CopyReg);
+ setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+ }
// Use a copy that is a terminator to get correct spill code placement it with
// fast regalloc.
@@ -194,7 +207,8 @@
// register.
LIS->ReplaceMachineInstrInMaps(MI, *And);
- LIS->InsertMachineInstrInMaps(*Xor);
+ if (!SimpleIf)
+ LIS->InsertMachineInstrInMaps(*Xor);
LIS->InsertMachineInstrInMaps(*SetExec);
LIS->InsertMachineInstrInMaps(*NewBr);
@@ -207,7 +221,8 @@
LIS->removeInterval(SaveExecReg);
LIS->createAndComputeVirtRegInterval(SaveExecReg);
LIS->createAndComputeVirtRegInterval(Tmp);
- LIS->createAndComputeVirtRegInterval(CopyReg);
+ if (!SimpleIf)
+ LIS->createAndComputeVirtRegInterval(CopyReg);
}
void SILowerControlFlow::emitElse(MachineInstr &MI) {