[AMDGPU] Optimize SI_IF lowering for simple if regions

Currently SI_IF results in an s_and_saveexec_b64 followed by an
s_xor_b64. The xor is used to extract only the changed bits. In the case
of a simple if region, where the only use of that value is in the
SI_END_CF to restore the old exec mask, we can omit the xor and instead
or the exec mask with the original exec value saved by the
s_and_saveexec_b64.

Differential Revision: https://reviews.llvm.org/D35861

llvm-svn: 309185
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 5f1c7f1..de86c19 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -149,9 +149,19 @@
   MachineOperand &ImpDefSCC = MI.getOperand(4);
   assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
 
+  // If the save exec register has exactly one non-debug use and that use is
+  // SI_END_CF, we can optimize SI_IF by returning the full saved exec mask
+  // instead of just the cleared bits.
+  bool SimpleIf = false;
+  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
+  SimpleIf = U != MRI->use_instr_nodbg_end() &&
+             std::next(U) == MRI->use_instr_nodbg_end() &&
+             U->getOpcode() == AMDGPU::SI_END_CF;
+
   // Add an implicit def of exec to discourage scheduling VALU after this which
   // will interfere with trying to form s_and_saveexec_b64 later.
-  unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned CopyReg = SimpleIf ? SaveExecReg
+                       : MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
   MachineInstr *CopyExec =
     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
     .addReg(AMDGPU::EXEC)
@@ -166,11 +176,14 @@
     .addReg(Cond.getReg());
   setImpSCCDefDead(*And, true);
 
-  MachineInstr *Xor =
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
-    .addReg(Tmp)
-    .addReg(CopyReg);
-  setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+  MachineInstr *Xor = nullptr;
+  if (!SimpleIf) {
+    Xor =
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+      .addReg(Tmp)
+      .addReg(CopyReg);
+    setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+  }
 
   // Use a copy that is a terminator to get correct spill code placement it with
   // fast regalloc.
@@ -194,7 +207,8 @@
   // register.
   LIS->ReplaceMachineInstrInMaps(MI, *And);
 
-  LIS->InsertMachineInstrInMaps(*Xor);
+  if (!SimpleIf)
+    LIS->InsertMachineInstrInMaps(*Xor);
   LIS->InsertMachineInstrInMaps(*SetExec);
   LIS->InsertMachineInstrInMaps(*NewBr);
 
@@ -207,7 +221,8 @@
   LIS->removeInterval(SaveExecReg);
   LIS->createAndComputeVirtRegInterval(SaveExecReg);
   LIS->createAndComputeVirtRegInterval(Tmp);
-  LIS->createAndComputeVirtRegInterval(CopyReg);
+  if (!SimpleIf)
+    LIS->createAndComputeVirtRegInterval(CopyReg);
 }
 
 void SILowerControlFlow::emitElse(MachineInstr &MI) {