LoopRotate: Also rotate loops with multiple exits.

The old PHI updating code in loop-rotate was replaced with SSAUpdater a while
ago, it has no problems with comples PHIs. What had to be fixed is detecting
whether a loop was already rotated and updating dominators when multiple exits
were present.

This change increases overall code size a bit, mostly due to additional loop
unrolling opportunities. Passes test-suite and selfhost with -verify-dom-info.
Fixes PR7447.

Thanks to Andy for the input on the domtree updating code.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@162912 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 7eeb152..2c76706 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/ADT/Statistic.h"
 using namespace llvm;
@@ -256,6 +257,7 @@
     return false;
 
   BasicBlock *OrigHeader = L->getHeader();
+  BasicBlock *OrigLatch = L->getLoopLatch();
 
   BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
   if (BI == 0 || BI->isUnconditional())
@@ -267,13 +269,9 @@
   if (!L->isLoopExiting(OrigHeader))
     return false;
 
-  // Updating PHInodes in loops with multiple exits adds complexity.
-  // Keep it simple, and restrict loop rotation to loops with one exit only.
-  // In future, lift this restriction and support for multiple exits if
-  // required.
-  SmallVector<BasicBlock*, 8> ExitBlocks;
-  L->getExitBlocks(ExitBlocks);
-  if (ExitBlocks.size() > 1)
+  // If the loop latch already contains a branch that leaves the loop then the
+  // loop is already rotated.
+  if (OrigLatch == 0 || L->isLoopExiting(OrigLatch))
     return false;
 
   // Check size of original header and reject loop if it is very big.
@@ -286,11 +284,10 @@
 
   // Now, this loop is suitable for rotation.
   BasicBlock *OrigPreheader = L->getLoopPreheader();
-  BasicBlock *OrigLatch = L->getLoopLatch();
 
   // If the loop could not be converted to canonical form, it must have an
   // indirectbr in it, just give up.
-  if (OrigPreheader == 0 || OrigLatch == 0)
+  if (OrigPreheader == 0)
     return false;
 
   // Anything ScalarEvolution may know about this loop or the PHI nodes
@@ -298,6 +295,8 @@
   if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>())
     SE->forgetLoop(L);
 
+  DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+
   // Find new Loop header. NewHeader is a Header's one and only successor
   // that is inside loop.  Header's other successor is outside the
   // loop.  Otherwise loop is not suitable for rotation.
@@ -408,10 +407,16 @@
     // Update DominatorTree to reflect the CFG change we just made.  Then split
     // edges as necessary to preserve LoopSimplify form.
     if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) {
-      // Since OrigPreheader now has the conditional branch to Exit block, it is
-      // the dominator of Exit.
-      DT->changeImmediateDominator(Exit, OrigPreheader);
-      DT->changeImmediateDominator(NewHeader, OrigPreheader);
+      // Everything that was dominated by the old loop header is now dominated
+      // by the original loop preheader. Conceptually the header was merged
+      // into the preheader, even though we reuse the actual block as a new
+      // loop latch.
+      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+      SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
+                                                   OrigHeaderNode->end());
+      DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
+      for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
+        DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
 
       // Update OrigHeader to be dominated by the new header block.
       DT->changeImmediateDominator(OrigHeader, OrigLatch);
@@ -440,6 +445,46 @@
       // Update OrigHeader to be dominated by the new header block.
       DT->changeImmediateDominator(NewHeader, OrigPreheader);
       DT->changeImmediateDominator(OrigHeader, OrigLatch);
+
+      // Brute force incremental dominator tree update. Call
+      // findNearestCommonDominator on all CFG predecessors of each child of the
+      // original header.
+      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+      SmallVector<DomTreeNode *, 8> WorkList(OrigHeaderNode->begin(),
+                                             OrigHeaderNode->end());
+      while (!WorkList.empty()) {
+        DomTreeNode *Node = WorkList.pop_back_val();
+        BasicBlock *BB = Node->getBlock();
+        BasicBlock *NearestDom = 0;
+        for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;
+             ++PI) {
+          BasicBlock *Pred = *PI;
+
+          // We have to process predecessors of a node before we touch the
+          // actual node. If one of the predecessors is in our worklist, put it
+          // and the currently processed node on the worklist and go processing
+          // the predecessor.
+          SmallVectorImpl<DomTreeNode *>::iterator I =
+            std::find(WorkList.begin(), WorkList.end(), DT->getNode(Pred));
+          if (I != WorkList.end()) {
+            WorkList.push_back(Node);
+            std::swap(*I, WorkList.back());
+            // The predecessor is now at the end of the worklist.
+            NearestDom = 0;
+            break;
+          }
+
+          // On the first iteration start with Pred, on the other iterations we
+          // narrow it down to the nearest common dominator.
+          if (!NearestDom)
+            NearestDom = Pred;
+          else
+            NearestDom = DT->findNearestCommonDominator(NearestDom, Pred);
+        }
+
+        if (NearestDom)
+          DT->changeImmediateDominator(BB, NearestDom);
+      }
     }
   }
 
@@ -452,6 +497,8 @@
   // emitted code isn't too gross in this common case.
   MergeBlockIntoPredecessor(OrigHeader, this);
 
+  DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+
   ++NumRotated;
   return true;
 }