AMDGPU: Split SILowerControlFlow into two pieces

Do most of the lowering in a pre-RA pass. Keep the skip-jump
insertion late, along with a few other things that require more
work to move out.
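
As a rough sketch (register names are illustrative, not actual
compiler output), the pre-RA lowering now expands

  %save = SI_IF %cond, %endif

into

  %save = S_AND_SAVEEXEC_B64 %cond
  %save = S_XOR_B64 exec, %save
  SI_MASK_BRANCH %endif, %save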

One concern is that there may now be COPY instructions which lack
the necessary implicit exec uses if they are later lowered to
v_mov_b32.
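
For example, a plain

  %vgpr0 = COPY %sgpr0

emitted after this point would be lowered to a v_mov_b32, which
implicitly reads exec, while the COPY itself carries no implicit
exec use.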

This has a positive effect on SGPR usage in shader-db.

llvm-svn: 279464
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 4fad185..c3db9d7 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -58,8 +58,6 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/MC/MCAsmInfo.h"
 
 using namespace llvm;
 
@@ -67,46 +65,41 @@
 
 namespace {
 
-static cl::opt<unsigned> SkipThresholdFlag(
-  "amdgpu-skip-threshold",
-  cl::desc("Number of instructions before jumping over divergent control flow"),
-  cl::init(12), cl::Hidden);
-
 class SILowerControlFlow : public MachineFunctionPass {
 private:
   const SIRegisterInfo *TRI;
   const SIInstrInfo *TII;
-  unsigned SkipThreshold;
+  LiveIntervals *LIS;
 
-  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
+  void emitIf(MachineInstr &MI);
+  void emitElse(MachineInstr &MI);
+  void emitBreak(MachineInstr &MI);
+  void emitIfBreak(MachineInstr &MI);
+  void emitElseBreak(MachineInstr &MI);
+  void emitLoop(MachineInstr &MI);
+  void emitEndCf(MachineInstr &MI);
 
-  MachineInstr *Skip(MachineInstr &From, MachineOperand &To);
-  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
-
-  void If(MachineInstr &MI);
-  void Else(MachineInstr &MI);
-  void Break(MachineInstr &MI);
-  void IfBreak(MachineInstr &MI);
-  void ElseBreak(MachineInstr &MI);
-  void Loop(MachineInstr &MI);
-  void EndCf(MachineInstr &MI);
-
-  void Kill(MachineInstr &MI);
-  void Branch(MachineInstr &MI);
-
-  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;
 public:
   static char ID;
 
   SILowerControlFlow() :
-    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }
+    MachineFunctionPass(ID),
+    TRI(nullptr),
+    TII(nullptr),
+    LIS(nullptr) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   const char *getPassName() const override {
     return "SI Lower control flow pseudo instructions";
   }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<LiveIntervals>();
+    AU.addPreserved<SlotIndexes>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
 };
 
 } // End anonymous namespace
@@ -114,403 +107,236 @@
 char SILowerControlFlow::ID = 0;
 
 INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                 "SI lower control flow", false, false)
 
-char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;
+char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 
-
-FunctionPass *llvm::createSILowerControlFlowPass() {
-  return new SILowerControlFlow();
-}
-
-static bool opcodeEmitsNoInsts(unsigned Opc) {
-  switch (Opc) {
-  case TargetOpcode::IMPLICIT_DEF:
-  case TargetOpcode::KILL:
-  case TargetOpcode::BUNDLE:
-  case TargetOpcode::CFI_INSTRUCTION:
-  case TargetOpcode::EH_LABEL:
-  case TargetOpcode::GC_LABEL:
-  case TargetOpcode::DBG_VALUE:
-    return true;
-  default:
-    return false;
-  }
-}
-
-bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
-                                    MachineBasicBlock *To) {
-  if (From->succ_empty())
-    return false;
-
-  unsigned NumInstr = 0;
-  MachineFunction *MF = From->getParent();
-
-  for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
-       MBBI != End && MBBI != ToI; ++MBBI) {
-    MachineBasicBlock &MBB = *MBBI;
-
-    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
-         NumInstr < SkipThreshold && I != E; ++I) {
-      if (opcodeEmitsNoInsts(I->getOpcode()))
-        continue;
-
-      // When a uniform loop is inside non-uniform control flow, the branch
-      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
-      // when EXEC = 0. We should skip the loop lest it becomes infinite.
-      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
-          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
-        return true;
-
-      if (I->isInlineAsm()) {
-        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
-        const char *AsmStr = I->getOperand(0).getSymbolName();
-
-        // inlineasm length estimate is number of bytes assuming the longest
-        // instruction.
-        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
-        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
-      } else {
-        ++NumInstr;
-      }
-
-      if (NumInstr >= SkipThreshold)
-        return true;
-    }
-  }
-
-  return false;
-}
-
-MachineInstr *SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
-  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
-    return nullptr;
-
-  const DebugLoc &DL = From.getDebugLoc();
-  MachineInstr *Skip =
-    BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
-    .addOperand(To);
-  return Skip;
-}
-
-bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
+void SILowerControlFlow::emitIf(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  MachineFunction *MF = MBB.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator I(&MI);
 
-  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
-      !shouldSkip(&MBB, &MBB.getParent()->back()))
-    return false;
+  MachineOperand &SaveExec = MI.getOperand(0);
+  MachineOperand &Cond = MI.getOperand(1);
+  assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
+         Cond.getSubReg() == AMDGPU::NoSubRegister);
 
-  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
-  MBB.addSuccessor(SkipBB);
+  unsigned SaveExecReg = SaveExec.getReg();
 
+  MachineInstr *AndSaveExec =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExecReg)
+    .addOperand(Cond);
+
+  MachineInstr *Xor =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+    .addReg(AMDGPU::EXEC)
+    .addReg(SaveExecReg);
+
+  // Insert a pseudo terminator to help keep the verifier happy. This will also
+  // be used later when inserting skips.
+  MachineInstr *NewBr =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+    .addOperand(MI.getOperand(2))
+    .addReg(SaveExecReg, getKillRegState(SaveExec.isKill()));
+
+  if (!LIS) {
+    MI.eraseFromParent();
+    return;
+  }
+
+  LIS->ReplaceMachineInstrInMaps(MI, *AndSaveExec);
+  LIS->InsertMachineInstrInMaps(*Xor);
+  LIS->InsertMachineInstrInMaps(*NewBr);
+
+  MI.eraseFromParent();
+
+  // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
+  // hard to add another def here but I'm not sure how to correctly update the
+  // valno.
+  LIS->removeInterval(SaveExecReg);
+  LIS->createAndComputeVirtRegInterval(SaveExecReg);
+}
+
+void SILowerControlFlow::emitElse(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
 
-  // If the exec mask is non-zero, skip the next two instructions
-  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-    .addMBB(&NextBB);
+  unsigned DstReg = MI.getOperand(0).getReg();
+  assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);
 
-  MachineBasicBlock::iterator Insert = SkipBB->begin();
+  bool ExecModified = MI.getOperand(3).getImm() != 0;
+  MachineBasicBlock::iterator Start = MBB.begin();
 
-  // Exec mask is zero: Export to NULL target...
-  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
-    .addImm(0)
-    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
-    .addImm(0)
-    .addImm(1)
-    .addImm(1)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef);
+  // This must be inserted before phis and any spill code inserted before the
+  // else.
+  MachineInstr *OrSaveExec =
+    BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), DstReg)
+    .addOperand(MI.getOperand(1)); // Saved EXEC
+  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
 
-  // ... and terminate wavefront.
-  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+  MachineBasicBlock::iterator ElsePt(MI);
 
-  return true;
-}
+  if (ExecModified) {
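+    // Adjust the saved exec to account for the modifications during the flow
+    // block that contains the ELSE. This can happen when WQM mode is switched
+    // off.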
+    MachineInstr *And =
+      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
+      .addReg(AMDGPU::EXEC)
+      .addReg(DstReg);
 
-void SILowerControlFlow::If(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  unsigned Reg = MI.getOperand(0).getReg();
-  unsigned Vcc = MI.getOperand(1).getReg();
-
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
-          .addReg(Vcc);
-
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
-          .addReg(AMDGPU::EXEC)
-          .addReg(Reg);
-
-  MachineInstr *SkipInst = Skip(MI, MI.getOperand(2));
-
-  // Insert before the new branch instruction.
-  MachineInstr *InsPt = SkipInst ? SkipInst : &MI;
-
-  // Insert a pseudo terminator to help keep the verifier happy.
-  BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
-    .addOperand(MI.getOperand(2))
-    .addReg(Reg);
-
-  MI.eraseFromParent();
-}
-
-void SILowerControlFlow::Else(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Src = MI.getOperand(1).getReg();
-
-  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
-          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
-          .addReg(Src); // Saved EXEC
-
-  if (MI.getOperand(3).getImm() != 0) {
-    // Adjust the saved exec to account for the modifications during the flow
-    // block that contains the ELSE. This can happen when WQM mode is switched
-    // off.
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
-            .addReg(AMDGPU::EXEC)
-            .addReg(Dst);
+    if (LIS)
+      LIS->InsertMachineInstrInMaps(*And);
   }
 
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
-          .addReg(AMDGPU::EXEC)
-          .addReg(Dst);
+  MachineInstr *Xor =
+    BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addReg(DstReg);
 
-  MachineInstr *SkipInst = Skip(MI, MI.getOperand(2));
-
-  // Insert before the new branch instruction.
-  MachineInstr *InsPt = SkipInst ? SkipInst : &MI;
-
+  MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
   // Insert a pseudo terminator to help keep the verifier happy.
-  BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
-    .addOperand(MI.getOperand(2))
-    .addReg(Dst);
+  MachineInstr *Branch =
+    BuildMI(MBB, Term, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+    .addMBB(DestBB)
+    .addReg(DstReg);
 
+  if (!LIS) {
+    MI.eraseFromParent();
+    return;
+  }
+
+  LIS->RemoveMachineInstrFromMaps(MI);
   MI.eraseFromParent();
+
+  LIS->InsertMachineInstrInMaps(*OrSaveExec);
+
+  LIS->InsertMachineInstrInMaps(*Xor);
+  LIS->InsertMachineInstrInMaps(*Branch);
+
+  // src reg is tied to dst reg.
+  LIS->removeInterval(DstReg);
+  LIS->createAndComputeVirtRegInterval(DstReg);
+
+  // Let this be recomputed.
+  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
 }
 
-void SILowerControlFlow::Break(MachineInstr &MI) {
+void SILowerControlFlow::emitBreak(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-
+  const DebugLoc &DL = MI.getDebugLoc();
   unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Src = MI.getOperand(1).getReg();
 
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
-          .addReg(AMDGPU::EXEC)
-          .addReg(Src);
-
-  MI.eraseFromParent();
-}
-
-void SILowerControlFlow::IfBreak(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-
-  unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Vcc = MI.getOperand(1).getReg();
-  unsigned Src = MI.getOperand(2).getReg();
-
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
-          .addReg(Vcc)
-          .addReg(Src);
-
-  MI.eraseFromParent();
-}
-
-void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-
-  unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Saved = MI.getOperand(1).getReg();
-  unsigned Src = MI.getOperand(2).getReg();
-
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
-          .addReg(Saved)
-          .addReg(Src);
-
-  MI.eraseFromParent();
-}
-
-void SILowerControlFlow::Loop(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  unsigned Src = MI.getOperand(0).getReg();
-
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
-          .addReg(AMDGPU::EXEC)
-          .addReg(Src);
-
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+  MachineInstr *Or =
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+    .addReg(AMDGPU::EXEC)
     .addOperand(MI.getOperand(1));
 
+  if (LIS)
+    LIS->ReplaceMachineInstrInMaps(MI, *Or);
   MI.eraseFromParent();
 }
 
-void SILowerControlFlow::EndCf(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  unsigned Reg = MI.getOperand(0).getReg();
-
-  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
-          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
-          .addReg(AMDGPU::EXEC)
-          .addReg(Reg);
-
-  MI.eraseFromParent();
+void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
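+  // SI_IF_BREAK's operands already line up with S_OR_B64's (dst, src0, src1),
+  // so just rewrite the opcode in place.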
+  MI.setDesc(TII->get(AMDGPU::S_OR_B64));
 }
 
-void SILowerControlFlow::Branch(MachineInstr &MI) {
-  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
-  if (MBB == MI.getParent()->getNextNode())
-    MI.eraseFromParent();
-
-  // If these aren't equal, this is probably an infinite loop.
+void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
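+  // As with SI_IF_BREAK, the operand layout already matches S_OR_B64.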
+  MI.setDesc(TII->get(AMDGPU::S_OR_B64));
 }
 
-void SILowerControlFlow::Kill(MachineInstr &MI) {
+void SILowerControlFlow::emitLoop(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  const MachineOperand &Op = MI.getOperand(0);
+  const DebugLoc &DL = MI.getDebugLoc();
 
-#ifndef NDEBUG
-  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
-  // Kill is only allowed in pixel / geometry shaders.
-  assert(CallConv == CallingConv::AMDGPU_PS ||
-         CallConv == CallingConv::AMDGPU_GS);
-#endif
+  MachineInstr *AndN2 =
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addOperand(MI.getOperand(0));
 
-  // Clear this thread from the exec mask if the operand is negative
-  if ((Op.isImm())) {
-    // Constant operand: Set exec mask to 0 or do nothing
-    if (Op.getImm() & 0x80000000) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
-              .addImm(0);
-    }
-  } else {
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
-           .addImm(0)
-           .addOperand(Op);
+  MachineInstr *Branch =
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+    .addOperand(MI.getOperand(1));
+
+  if (LIS) {
+    LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
+    LIS->InsertMachineInstrInMaps(*Branch);
   }
 
   MI.eraseFromParent();
 }
 
-MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
-  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
-  MachineFunction *MF = MBB.getParent();
+void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
 
-  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
-  MachineFunction::iterator MBBI(MBB);
-  ++MBBI;
+  MachineBasicBlock::iterator InsPt = MBB.begin();
+  MachineInstr *NewMI =
+    BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addOperand(MI.getOperand(0));
 
-  MF->insert(MBBI, SkipBB);
+  if (LIS)
+    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
 
-  return SkipBB;
+  MI.eraseFromParent();
+
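+  // The S_OR was mapped to MI's old slot above but was inserted at the start
+  // of the block, so update its index to match its real position.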
+  if (LIS)
+    LIS->handleMove(*NewMI);
 }
 
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
-  SkipThreshold = SkipThresholdFlag;
 
-  bool HaveKill = false;
-  unsigned Depth = 0;
+  // This doesn't actually need LiveIntervals, but we can preserve them.
+  LIS = getAnalysisIfAvailable<LiveIntervals>();
 
   MachineFunction::iterator NextBB;
-
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; BI = NextBB) {
     NextBB = std::next(BI);
     MachineBasicBlock &MBB = *BI;
 
-    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
     MachineBasicBlock::iterator I, Next;
 
     for (I = MBB.begin(); I != MBB.end(); I = Next) {
       Next = std::next(I);
-
       MachineInstr &MI = *I;
 
       switch (MI.getOpcode()) {
-        default: break;
-        case AMDGPU::SI_IF:
-          ++Depth;
-          If(MI);
-          break;
+      case AMDGPU::SI_IF:
+        emitIf(MI);
+        break;
 
-        case AMDGPU::SI_ELSE:
-          Else(MI);
-          break;
+      case AMDGPU::SI_ELSE:
+        emitElse(MI);
+        break;
 
-        case AMDGPU::SI_BREAK:
-          Break(MI);
-          break;
+      case AMDGPU::SI_BREAK:
+        emitBreak(MI);
+        break;
 
-        case AMDGPU::SI_IF_BREAK:
-          IfBreak(MI);
-          break;
+      case AMDGPU::SI_IF_BREAK:
+        emitIfBreak(MI);
+        break;
 
-        case AMDGPU::SI_ELSE_BREAK:
-          ElseBreak(MI);
-          break;
+      case AMDGPU::SI_ELSE_BREAK:
+        emitElseBreak(MI);
+        break;
 
-        case AMDGPU::SI_LOOP:
-          ++Depth;
-          Loop(MI);
-          break;
+      case AMDGPU::SI_LOOP:
+        emitLoop(MI);
+        break;
 
-        case AMDGPU::SI_END_CF:
-          if (--Depth == 0 && HaveKill) {
-            HaveKill = false;
-            // TODO: Insert skip if exec is 0?
-          }
+      case AMDGPU::SI_END_CF:
+        emitEndCf(MI);
+        break;
 
-          EndCf(MI);
-          break;
-
-        case AMDGPU::SI_KILL_TERMINATOR:
-          if (Depth == 0) {
-            if (skipIfDead(MI, *NextBB)) {
-              NextBB = std::next(BI);
-              BE = MF.end();
-            }
-          } else
-            HaveKill = true;
-          Kill(MI);
-          break;
-
-        case AMDGPU::S_BRANCH:
-          Branch(MI);
-          break;
-
-        case AMDGPU::SI_RETURN: {
-          assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
-
-          // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
-          // because external bytecode will be appended at the end.
-          if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
-            // SI_RETURN is not the last instruction. Add an empty block at
-            // the end and jump there.
-            if (!EmptyMBBAtEnd) {
-              EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
-              MF.insert(MF.end(), EmptyMBBAtEnd);
-            }
-
-            MBB.addSuccessor(EmptyMBBAtEnd);
-            BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
-                    .addMBB(EmptyMBBAtEnd);
-            I->eraseFromParent();
-          }
-          break;
-        }
+      default:
+        break;
       }
     }
   }
+
   return true;
 }