Optimize code placement in loop to eliminate unconditional branches or move unconditional branch to the outside of the loop. e.g.

///       A:                                                                                                                                                                 
///       ...                                                                                                                                                                
///       <fallthrough to B>                                                                                                                                                 
///                                                                                                                                                                          
///       B:  --> loop header                                                                                                                                                
///       ...                                                                                                                                                                
///       jcc <cond> C, [exit]                                                                                                                                               
///                                                                                                                                                                          
///       C:                                                                                                                                                                 
///       ...                                                                                                                                                                
///       jmp B                                                                                                                                                              
///                                                                                                                                                                          
/// ==>                                                                                                                                                                      
///                                                                                                                                                                          
///       A:                                                                                                                                                                 
///       ...                                                                                                                                                                
///       jmp B                                                                                                                                                              
///                                                                                                                                                                          
///       C:  --> new loop header                                                                                                                                            
///       ...                                                                                                                                                                
///       <fallthough to B>                                                                                                                                                  
///                                                                                                                                                                          
///       B:                                                                                                                                                                 
///       ...                                                                                                                                                                
///       jcc <cond> C, [exit] 


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@71209 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/CodeGen/CodePlacementOpt.cpp b/lib/CodeGen/CodePlacementOpt.cpp
index 924a969..d297d5a 100644
--- a/lib/CodeGen/CodePlacementOpt.cpp
+++ b/lib/CodeGen/CodePlacementOpt.cpp
@@ -16,15 +16,40 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
+static cl::opt<bool>
+OptLoopBBPlacement("opt-loop-bb-placement",
+                   cl::init(false), cl::Hidden,
+                   cl::desc("Optimize block placements in loops"));
+
+STATISTIC(NumHeaderAligned, "Number of loop header aligned");
+STATISTIC(NumIntraElim,     "Number of intra loop branches eliminated");
+STATISTIC(NumIntraMoved,    "Number of intra loop branches moved");
+
 namespace {
   class CodePlacementOpt : public MachineFunctionPass {
     const MachineLoopInfo *MLI;
+    const TargetInstrInfo *TII;
+    const TargetLowering  *TLI;
+
+    /// ChangedMBBs - BBs which are modified by OptimizeIntraLoopEdges.
+    SmallPtrSet<MachineBasicBlock*, 8> ChangedMBBs;
+
+    /// UncondJmpMBBs - A list of BBs which are in loops and end with
+    /// unconditional branches.
+    SmallVector<std::pair<MachineBasicBlock*,MachineBasicBlock*>, 4>
+    UncondJmpMBBs;
+
+    /// LoopHeaders - A list of BBs which are loop headers.
+    SmallVector<MachineBasicBlock*, 4> LoopHeaders;
 
   public:
     static char ID;
@@ -37,12 +62,12 @@
 
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.addRequired<MachineLoopInfo>();
-      AU.addPreserved<MachineLoopInfo>();
       AU.addPreservedID(MachineDominatorsID);
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
   private:
+    bool OptimizeIntraLoopEdges();
     bool AlignLoops(MachineFunction &MF);
   };
 
@@ -53,32 +78,199 @@
   return new CodePlacementOpt();
 }
 
+/// OptimizeBackEdges - Place loop back edges to move unconditional branches
+/// out of the loop.
+///
+///       A:
+///       ...
+///       <fallthrough to B>
+///
+///       B:  --> loop header
+///       ...
+///       jcc <cond> C, [exit]
+///
+///       C:
+///       ...
+///       jmp B
+///
+/// ==>
+///
+///       A:
+///       ...
+///       jmp B
+///
+///       C:  --> new loop header
+///       ...
+///       <fallthough to B>
+///       
+///       B:
+///       ...
+///       jcc <cond> C, [exit]
+///
+bool CodePlacementOpt::OptimizeIntraLoopEdges() {
+  if (!OptLoopBBPlacement)
+    return false;
+
+  bool Changed = false;
+  for (unsigned i = 0, e = UncondJmpMBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = UncondJmpMBBs[i].first;
+    MachineBasicBlock *SuccMBB = UncondJmpMBBs[i].second;
+    MachineLoop *L = MLI->getLoopFor(MBB);
+    assert(L && "BB is expected to be in a loop!");
+
+    if (ChangedMBBs.count(MBB)) {
+      // BB has been modified, re-analyze.
+      MachineBasicBlock *TBB = 0, *FBB = 0;
+      SmallVector<MachineOperand, 4> Cond;
+      if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond) || !Cond.empty())
+        continue;
+      if (MLI->getLoopFor(TBB) != L || TBB->isLandingPad())
+        continue;
+      SuccMBB = TBB;
+    } else {
+      assert(MLI->getLoopFor(SuccMBB) == L &&
+             "Successor is not in the same loop!");
+    }
+
+    if (MBB->isLayoutSuccessor(SuccMBB)) {
+      // Successor is right after MBB, just eliminate the unconditional jmp.
+      // Can this happen?
+      TII->RemoveBranch(*MBB);
+      ChangedMBBs.insert(MBB);
+      ++NumIntraElim;
+      continue;
+    }
+
+    // Now check if the predecessor is fallthrough from any BB. If there is,
+    // that BB should be from outside the loop since edge will become a jmp.
+    bool OkToMove = true;
+    MachineBasicBlock *FtMBB = 0, *FtTBB = 0, *FtFBB = 0;
+    SmallVector<MachineOperand, 4> FtCond;    
+    for (MachineBasicBlock::pred_iterator PI = SuccMBB->pred_begin(),
+           PE = SuccMBB->pred_end(); PI != PE; ++PI) {
+      MachineBasicBlock *PredMBB = *PI;
+      if (PredMBB->isLayoutSuccessor(SuccMBB)) {
+        if (TII->AnalyzeBranch(*PredMBB, FtTBB, FtFBB, FtCond)) {
+          OkToMove = false;
+          break;
+        }
+        if (!FtTBB)
+          FtTBB = SuccMBB;
+        else if (!FtFBB) {
+          assert(FtFBB != SuccMBB && "Unexpected control flow!");
+          FtFBB = SuccMBB;
+        }
+        
+        // A fallthrough.
+        FtMBB = PredMBB;
+        MachineLoop *PL = MLI->getLoopFor(PredMBB);
+        if (PL && (PL == L || PL->getLoopDepth() >= L->getLoopDepth())) {
+          OkToMove = false;
+          break;
+        }
+      }
+    }
+
+    if (!OkToMove)
+      continue;
+
+    // Is it profitable? If SuccMBB can fallthrough itself, that can be changed
+    // into a jmp.
+    MachineBasicBlock *TBB = 0, *FBB = 0;
+    SmallVector<MachineOperand, 4> Cond;
+    if (TII->AnalyzeBranch(*SuccMBB, TBB, FBB, Cond))
+      continue;
+    if (!TBB && Cond.empty())
+      TBB = next(MachineFunction::iterator(SuccMBB));
+    else if (!FBB && !Cond.empty())
+      FBB = next(MachineFunction::iterator(SuccMBB));
+
+    // This calculate the cost of the transformation. Also, it finds the *only*
+    // intra-loop edge if there is one.
+    int Cost = 0;
+    bool HasOneIntraSucc = true;
+    MachineBasicBlock *IntraSucc = 0;
+    for (MachineBasicBlock::succ_iterator SI = SuccMBB->succ_begin(),
+           SE = SuccMBB->succ_end(); SI != SE; ++SI) {
+      MachineBasicBlock *SSMBB = *SI;
+      if (MLI->getLoopFor(SSMBB) == L) {
+        if (!IntraSucc)
+          IntraSucc = SSMBB;
+        else
+          HasOneIntraSucc = false;
+      }
+
+      if (SuccMBB->isLayoutSuccessor(SSMBB))
+        // This will become a jmp.
+        ++Cost;
+      else if (MBB->isLayoutSuccessor(SSMBB))
+        // One of the successor will become the new fallthrough.
+        if (SSMBB == FBB) {
+          FBB = 0;
+          --Cost;
+        } else if (!FBB && SSMBB == TBB && Cond.empty()) {
+          TBB = 0;
+          --Cost;
+        } else if (!TII->ReverseBranchCondition(Cond)) {
+          TBB = FBB;
+          FBB = 0;
+          --Cost;
+        }
+    }
+    if (Cost)
+      continue;
+
+    // Now, let's move the successor to below the BB to eliminate the jmp.
+    SuccMBB->moveAfter(MBB);
+    TII->RemoveBranch(*MBB);
+    TII->RemoveBranch(*SuccMBB);
+    if (TBB)
+      TII->InsertBranch(*SuccMBB, TBB, FBB, Cond);
+    ChangedMBBs.insert(MBB);
+    ChangedMBBs.insert(SuccMBB);
+    if (FtMBB) {
+      TII->RemoveBranch(*FtMBB);
+      TII->InsertBranch(*FtMBB, FtTBB, FtFBB, FtCond);
+      ChangedMBBs.insert(FtMBB);
+    }
+
+    // If BB is the loop latch, we may have a new loop headr.
+    if (MBB == L->getLoopLatch()) {
+      assert(MLI->isLoopHeader(SuccMBB) &&
+             "Only succ of loop latch is not the header?");
+      if (HasOneIntraSucc && IntraSucc)
+        std::replace(LoopHeaders.begin(),LoopHeaders.end(), SuccMBB, IntraSucc);
+    }
+  }
+
+  ++NumIntraMoved;
+  return Changed;
+}
+
 /// AlignLoops - Align loop headers to target preferred alignments.
 ///
 bool CodePlacementOpt::AlignLoops(MachineFunction &MF) {
-  const TargetLowering *TLI = MF.getTarget().getTargetLowering();
-  if (!TLI)
+  const Function *F = MF.getFunction();
+  if (F->hasFnAttr(Attribute::OptimizeForSize))
     return false;
 
   unsigned Align = TLI->getPrefLoopAlignment();
   if (!Align)
     return false;  // Don't care about loop alignment.
 
-  const Function *F = MF.getFunction();
-  if (F->hasFnAttr(Attribute::OptimizeForSize))
-    return false;
+  // Make sure blocks are numbered in order
+  MF.RenumberBlocks();
 
   bool Changed = false;
-  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
-    MachineBasicBlock *MBB = I;
-    if (MLI->isLoopHeader(MBB)) {
-      MachineBasicBlock *PredBB = prior(I);
-      if (MLI->getLoopFor(MBB) == MLI->getLoopFor(PredBB))
-        // If previously BB is in the same loop, don't align this BB. We want
-        // to prevent adding noop's inside a loop.
-        continue;
-      MBB->setAlignment(Align);
+  for (unsigned i = 0, e = LoopHeaders.size(); i != e; ++i) {
+    MachineBasicBlock *HeaderMBB = LoopHeaders[i];
+    MachineBasicBlock *PredMBB = prior(MachineFunction::iterator(HeaderMBB));
+    if (MLI->getLoopFor(HeaderMBB) != MLI->getLoopFor(PredMBB)) {
+      // If previously BB is in the same loop, don't align this BB. We want
+      // to prevent adding noop's inside a loop.
+      HeaderMBB->setAlignment(Align);
       Changed = true;
+      ++NumHeaderAligned;
     }
   }
 
@@ -90,8 +282,36 @@
   if (MLI->empty())
     return false;  // No loops.
 
-  bool Changed = false;
+  TLI = MF.getTarget().getTargetLowering();
+  TII = MF.getTarget().getInstrInfo();
+
+  // Analyze the BBs first and keep track of loop headers and BBs that
+  // end with an unconditional jmp to another block in the same loop.
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = I;
+    if (MBB->isLandingPad())
+      continue;
+    MachineLoop *L = MLI->getLoopFor(MBB);
+    if (!L)
+      continue;
+    if (MLI->isLoopHeader(MBB))
+      LoopHeaders.push_back(MBB);
+
+    MachineBasicBlock *TBB = 0, *FBB = 0;
+    SmallVector<MachineOperand, 4> Cond;
+    if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond) || !Cond.empty())
+      continue;
+    if (MLI->getLoopFor(TBB) == L && !TBB->isLandingPad())
+      UncondJmpMBBs.push_back(std::make_pair(MBB, TBB));
+  }
+
+  bool Changed = OptimizeIntraLoopEdges();
+
   Changed |= AlignLoops(MF);
 
+  ChangedMBBs.clear();
+  UncondJmpMBBs.clear();
+  LoopHeaders.clear();
+
   return Changed;
 }
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 50115c2..369c4c6 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -1508,7 +1508,7 @@
     if (I->getOpcode() == X86::JMP) {
       if (!AllowModify) {
         TBB = I->getOperand(0).getMBB();
-        return false;
+        continue;
       }
 
       // If the block has any instructions after a JMP, delete them.
diff --git a/test/CodeGen/X86/code_placement.ll b/test/CodeGen/X86/code_placement.ll
new file mode 100644
index 0000000..3ffb6ba
--- /dev/null
+++ b/test/CodeGen/X86/code_placement.ll
@@ -0,0 +1,134 @@
+; RUN: llvm-as < %s | llc -march=x86 -opt-loop-bb-placement | %prcontext jmp 1 | grep align
+
+@Te0 = external global [256 x i32]		; <[256 x i32]*> [#uses=5]
+@Te1 = external global [256 x i32]		; <[256 x i32]*> [#uses=4]
+@Te3 = external global [256 x i32]		; <[256 x i32]*> [#uses=2]
+
+define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r) nounwind ssp {
+entry:
+	%0 = load i32* %rk, align 4		; <i32> [#uses=1]
+	%1 = getelementptr i32* %rk, i64 1		; <i32*> [#uses=1]
+	%2 = load i32* %1, align 4		; <i32> [#uses=1]
+	%tmp15 = add i32 %r, -1		; <i32> [#uses=1]
+	%tmp.16 = zext i32 %tmp15 to i64		; <i64> [#uses=2]
+	br label %bb
+
+bb:		; preds = %bb1, %entry
+	%indvar = phi i64 [ 0, %entry ], [ %indvar.next, %bb1 ]		; <i64> [#uses=3]
+	%s1.0 = phi i32 [ %2, %entry ], [ %56, %bb1 ]		; <i32> [#uses=2]
+	%s0.0 = phi i32 [ %0, %entry ], [ %43, %bb1 ]		; <i32> [#uses=2]
+	%tmp18 = shl i64 %indvar, 4		; <i64> [#uses=4]
+	%rk26 = bitcast i32* %rk to i8*		; <i8*> [#uses=6]
+	%3 = lshr i32 %s0.0, 24		; <i32> [#uses=1]
+	%4 = zext i32 %3 to i64		; <i64> [#uses=1]
+	%5 = getelementptr [256 x i32]* @Te0, i64 0, i64 %4		; <i32*> [#uses=1]
+	%6 = load i32* %5, align 4		; <i32> [#uses=1]
+	%7 = lshr i32 %s1.0, 16		; <i32> [#uses=1]
+	%8 = and i32 %7, 255		; <i32> [#uses=1]
+	%9 = zext i32 %8 to i64		; <i64> [#uses=1]
+	%10 = getelementptr [256 x i32]* @Te1, i64 0, i64 %9		; <i32*> [#uses=1]
+	%11 = load i32* %10, align 4		; <i32> [#uses=1]
+	%ctg2.sum2728 = or i64 %tmp18, 8		; <i64> [#uses=1]
+	%12 = getelementptr i8* %rk26, i64 %ctg2.sum2728		; <i8*> [#uses=1]
+	%13 = bitcast i8* %12 to i32*		; <i32*> [#uses=1]
+	%14 = load i32* %13, align 4		; <i32> [#uses=1]
+	%15 = xor i32 %11, %6		; <i32> [#uses=1]
+	%16 = xor i32 %15, %14		; <i32> [#uses=3]
+	%17 = lshr i32 %s1.0, 24		; <i32> [#uses=1]
+	%18 = zext i32 %17 to i64		; <i64> [#uses=1]
+	%19 = getelementptr [256 x i32]* @Te0, i64 0, i64 %18		; <i32*> [#uses=1]
+	%20 = load i32* %19, align 4		; <i32> [#uses=1]
+	%21 = and i32 %s0.0, 255		; <i32> [#uses=1]
+	%22 = zext i32 %21 to i64		; <i64> [#uses=1]
+	%23 = getelementptr [256 x i32]* @Te3, i64 0, i64 %22		; <i32*> [#uses=1]
+	%24 = load i32* %23, align 4		; <i32> [#uses=1]
+	%ctg2.sum2930 = or i64 %tmp18, 12		; <i64> [#uses=1]
+	%25 = getelementptr i8* %rk26, i64 %ctg2.sum2930		; <i8*> [#uses=1]
+	%26 = bitcast i8* %25 to i32*		; <i32*> [#uses=1]
+	%27 = load i32* %26, align 4		; <i32> [#uses=1]
+	%28 = xor i32 %24, %20		; <i32> [#uses=1]
+	%29 = xor i32 %28, %27		; <i32> [#uses=4]
+	%30 = lshr i32 %16, 24		; <i32> [#uses=1]
+	%31 = zext i32 %30 to i64		; <i64> [#uses=1]
+	%32 = getelementptr [256 x i32]* @Te0, i64 0, i64 %31		; <i32*> [#uses=1]
+	%33 = load i32* %32, align 4		; <i32> [#uses=2]
+	%exitcond = icmp eq i64 %indvar, %tmp.16		; <i1> [#uses=1]
+	br i1 %exitcond, label %bb2, label %bb1
+
+bb1:		; preds = %bb
+	%ctg2.sum31 = add i64 %tmp18, 16		; <i64> [#uses=1]
+	%34 = getelementptr i8* %rk26, i64 %ctg2.sum31		; <i8*> [#uses=1]
+	%35 = bitcast i8* %34 to i32*		; <i32*> [#uses=1]
+	%36 = lshr i32 %29, 16		; <i32> [#uses=1]
+	%37 = and i32 %36, 255		; <i32> [#uses=1]
+	%38 = zext i32 %37 to i64		; <i64> [#uses=1]
+	%39 = getelementptr [256 x i32]* @Te1, i64 0, i64 %38		; <i32*> [#uses=1]
+	%40 = load i32* %39, align 4		; <i32> [#uses=1]
+	%41 = load i32* %35, align 4		; <i32> [#uses=1]
+	%42 = xor i32 %40, %33		; <i32> [#uses=1]
+	%43 = xor i32 %42, %41		; <i32> [#uses=1]
+	%44 = lshr i32 %29, 24		; <i32> [#uses=1]
+	%45 = zext i32 %44 to i64		; <i64> [#uses=1]
+	%46 = getelementptr [256 x i32]* @Te0, i64 0, i64 %45		; <i32*> [#uses=1]
+	%47 = load i32* %46, align 4		; <i32> [#uses=1]
+	%48 = and i32 %16, 255		; <i32> [#uses=1]
+	%49 = zext i32 %48 to i64		; <i64> [#uses=1]
+	%50 = getelementptr [256 x i32]* @Te3, i64 0, i64 %49		; <i32*> [#uses=1]
+	%51 = load i32* %50, align 4		; <i32> [#uses=1]
+	%ctg2.sum32 = add i64 %tmp18, 20		; <i64> [#uses=1]
+	%52 = getelementptr i8* %rk26, i64 %ctg2.sum32		; <i8*> [#uses=1]
+	%53 = bitcast i8* %52 to i32*		; <i32*> [#uses=1]
+	%54 = load i32* %53, align 4		; <i32> [#uses=1]
+	%55 = xor i32 %51, %47		; <i32> [#uses=1]
+	%56 = xor i32 %55, %54		; <i32> [#uses=1]
+	%indvar.next = add i64 %indvar, 1		; <i64> [#uses=1]
+	br label %bb
+
+bb2:		; preds = %bb
+	%tmp10 = shl i64 %tmp.16, 4		; <i64> [#uses=2]
+	%ctg2.sum = add i64 %tmp10, 16		; <i64> [#uses=1]
+	%tmp1213 = getelementptr i8* %rk26, i64 %ctg2.sum		; <i8*> [#uses=1]
+	%57 = bitcast i8* %tmp1213 to i32*		; <i32*> [#uses=1]
+	%58 = and i32 %33, -16777216		; <i32> [#uses=1]
+	%59 = lshr i32 %29, 16		; <i32> [#uses=1]
+	%60 = and i32 %59, 255		; <i32> [#uses=1]
+	%61 = zext i32 %60 to i64		; <i64> [#uses=1]
+	%62 = getelementptr [256 x i32]* @Te1, i64 0, i64 %61		; <i32*> [#uses=1]
+	%63 = load i32* %62, align 4		; <i32> [#uses=1]
+	%64 = and i32 %63, 16711680		; <i32> [#uses=1]
+	%65 = or i32 %64, %58		; <i32> [#uses=1]
+	%66 = load i32* %57, align 4		; <i32> [#uses=1]
+	%67 = xor i32 %65, %66		; <i32> [#uses=2]
+	%68 = lshr i32 %29, 8		; <i32> [#uses=1]
+	%69 = zext i32 %68 to i64		; <i64> [#uses=1]
+	%70 = getelementptr [256 x i32]* @Te0, i64 0, i64 %69		; <i32*> [#uses=1]
+	%71 = load i32* %70, align 4		; <i32> [#uses=1]
+	%72 = and i32 %71, -16777216		; <i32> [#uses=1]
+	%73 = and i32 %16, 255		; <i32> [#uses=1]
+	%74 = zext i32 %73 to i64		; <i64> [#uses=1]
+	%75 = getelementptr [256 x i32]* @Te1, i64 0, i64 %74		; <i32*> [#uses=1]
+	%76 = load i32* %75, align 4		; <i32> [#uses=1]
+	%77 = and i32 %76, 16711680		; <i32> [#uses=1]
+	%78 = or i32 %77, %72		; <i32> [#uses=1]
+	%ctg2.sum25 = add i64 %tmp10, 20		; <i64> [#uses=1]
+	%79 = getelementptr i8* %rk26, i64 %ctg2.sum25		; <i8*> [#uses=1]
+	%80 = bitcast i8* %79 to i32*		; <i32*> [#uses=1]
+	%81 = load i32* %80, align 4		; <i32> [#uses=1]
+	%82 = xor i32 %78, %81		; <i32> [#uses=2]
+	%83 = lshr i32 %67, 24		; <i32> [#uses=1]
+	%84 = trunc i32 %83 to i8		; <i8> [#uses=1]
+	store i8 %84, i8* %out, align 1
+	%85 = lshr i32 %67, 16		; <i32> [#uses=1]
+	%86 = trunc i32 %85 to i8		; <i8> [#uses=1]
+	%87 = getelementptr i8* %out, i64 1		; <i8*> [#uses=1]
+	store i8 %86, i8* %87, align 1
+	%88 = getelementptr i8* %out, i64 4		; <i8*> [#uses=1]
+	%89 = lshr i32 %82, 24		; <i32> [#uses=1]
+	%90 = trunc i32 %89 to i8		; <i8> [#uses=1]
+	store i8 %90, i8* %88, align 1
+	%91 = lshr i32 %82, 16		; <i32> [#uses=1]
+	%92 = trunc i32 %91 to i8		; <i8> [#uses=1]
+	%93 = getelementptr i8* %out, i64 5		; <i8*> [#uses=1]
+	store i8 %92, i8* %93, align 1
+	ret void
+}