[TailCallElim] Preserve DT and PDT

Summary:
Previously, in the NewPM pipeline, TailCallElim recalculates the DomTree when it modifies any instruction in the Function.
For example,
```
CallInst *CI = dyn_cast<CallInst>(&I);
...
CI->setTailCall();
Modified = true;
...
if (!Modified || ...)
  return PreservedAnalyses::all();
```
After applying this patch, the DomTree only recalculates if needed (plus an extra insertEdge() + an extra deleteEdge() call).

When optimizing SQLite with `-passes="default<O3>"` pipeline of the newPM, the number of DomTree recalculation decreases by 6.2%, the number of nodes visited by DFS decreases by 2.9%. The time used by DomTree will decrease approximately 1%~2.5% after applying the patch.
 
Statistics:
```
Before the patch:
 23010 dom-tree-stats               - Number of DomTree recalculations
489264 dom-tree-stats               - Number of nodes visited by DFS -- DomTree
After the patch:
 21581 dom-tree-stats               - Number of DomTree recalculations
475088 dom-tree-stats               - Number of nodes visited by DFS -- DomTree
```

Reviewers: kuhar, dmgreen, brzycki, grosser, davide

Reviewed By: kuhar, brzycki

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D49982

llvm-svn: 338954
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 818e837..0b22f56 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -61,6 +61,7 @@
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/CallSite.h"
@@ -68,6 +69,8 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DomTreeUpdater.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
@@ -488,12 +491,10 @@
   return CI;
 }
 
-static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
-                                       BasicBlock *&OldEntry,
-                                       bool &TailCallsAreMarkedTail,
-                                       SmallVectorImpl<PHINode *> &ArgumentPHIs,
-                                       AliasAnalysis *AA,
-                                       OptimizationRemarkEmitter *ORE) {
+static bool eliminateRecursiveTailCall(
+    CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry,
+    bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs,
+    AliasAnalysis *AA, OptimizationRemarkEmitter *ORE, DomTreeUpdater &DTU) {
   // If we are introducing accumulator recursion to eliminate operations after
   // the call instruction that are both associative and commutative, the initial
   // value for the accumulator is placed in this variable.  If this value is set
@@ -593,6 +594,10 @@
       PN->addIncoming(&*I, NewEntry);
       ArgumentPHIs.push_back(PN);
     }
+    // The entry block was changed from OldEntry to NewEntry.
+    // The forward DominatorTree needs to be recalculated when the EntryBB is
+    // changed. In this corner-case we recalculate the entire tree.
+    DTU.recalculate(*NewEntry->getParent());
   }
 
   // If this function has self recursive calls in the tail position where some
@@ -668,6 +673,7 @@
 
   BB->getInstList().erase(Ret);  // Remove return.
   BB->getInstList().erase(CI);   // Remove call.
+  DTU.insertEdge(BB, OldEntry);
   ++NumEliminated;
   return true;
 }
@@ -676,7 +682,7 @@
     BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry,
     bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs,
     bool CannotTailCallElimCallsMarkedTail, const TargetTransformInfo *TTI,
-    AliasAnalysis *AA, OptimizationRemarkEmitter *ORE) {
+    AliasAnalysis *AA, OptimizationRemarkEmitter *ORE, DomTreeUpdater &DTU) {
   bool Change = false;
 
   // Make sure this block is a trivial return block.
@@ -702,17 +708,17 @@
     if (CallInst *CI = findTRECandidate(BI, CannotTailCallElimCallsMarkedTail, TTI)){
       LLVM_DEBUG(dbgs() << "FOLDING: " << *BB
                         << "INTO UNCOND BRANCH PRED: " << *Pred);
-      ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
+      ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred, &DTU);
 
       // Cleanup: if all predecessors of BB have been eliminated by
       // FoldReturnIntoUncondBranch, delete it.  It is important to empty it,
       // because the ret instruction in there is still using a value which
       // eliminateRecursiveTailCall will attempt to remove.
       if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
-        BB->eraseFromParent();
+        DTU.deleteBB(BB);
 
       eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
-                                 ArgumentPHIs, AA, ORE);
+                                 ArgumentPHIs, AA, ORE, DTU);
       ++NumRetDuped;
       Change = true;
     }
@@ -721,24 +727,23 @@
   return Change;
 }
 
-static bool processReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
-                                  bool &TailCallsAreMarkedTail,
-                                  SmallVectorImpl<PHINode *> &ArgumentPHIs,
-                                  bool CannotTailCallElimCallsMarkedTail,
-                                  const TargetTransformInfo *TTI,
-                                  AliasAnalysis *AA,
-                                  OptimizationRemarkEmitter *ORE) {
+static bool processReturningBlock(
+    ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail,
+    SmallVectorImpl<PHINode *> &ArgumentPHIs,
+    bool CannotTailCallElimCallsMarkedTail, const TargetTransformInfo *TTI,
+    AliasAnalysis *AA, OptimizationRemarkEmitter *ORE, DomTreeUpdater &DTU) {
   CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI);
   if (!CI)
     return false;
 
   return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail,
-                                    ArgumentPHIs, AA, ORE);
+                                    ArgumentPHIs, AA, ORE, DTU);
 }
 
 static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
                                    AliasAnalysis *AA,
-                                   OptimizationRemarkEmitter *ORE) {
+                                   OptimizationRemarkEmitter *ORE,
+                                   DomTreeUpdater &DTU) {
   if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
     return false;
 
@@ -773,11 +778,11 @@
     if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
       bool Change = processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
                                           ArgumentPHIs, !CanTRETailMarkedCall,
-                                          TTI, AA, ORE);
+                                          TTI, AA, ORE, DTU);
       if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
-        Change = foldReturnAndProcessPred(BB, Ret, OldEntry,
-                                          TailCallsAreMarkedTail, ArgumentPHIs,
-                                          !CanTRETailMarkedCall, TTI, AA, ORE);
+        Change = foldReturnAndProcessPred(
+            BB, Ret, OldEntry, TailCallsAreMarkedTail, ArgumentPHIs,
+            !CanTRETailMarkedCall, TTI, AA, ORE, DTU);
       MadeChange |= Change;
     }
   }
@@ -810,16 +815,27 @@
     AU.addRequired<AAResultsWrapperPass>();
     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<PostDominatorTreeWrapperPass>();
   }
 
   bool runOnFunction(Function &F) override {
     if (skipFunction(F))
       return false;
 
+    auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+    auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+    auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+    auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+    // There is no noticable performance difference here between Lazy and Eager
+    // UpdateStrategy based on some test results. It is feasible to switch the
+    // UpdateStrategy to Lazy if we find it profitable later.
+    DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
+
     return eliminateTailRecursion(
         F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
         &getAnalysis<AAResultsWrapperPass>().getAAResults(),
-        &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE());
+        &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(), DTU);
   }
 };
 }
@@ -843,12 +859,19 @@
   TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
   AliasAnalysis &AA = AM.getResult<AAManager>(F);
   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-
-  bool Changed = eliminateTailRecursion(F, &TTI, &AA, &ORE);
+  auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+  auto *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
+  // There is no noticable performance difference here between Lazy and Eager
+  // UpdateStrategy based on some test results. It is feasible to switch the
+  // UpdateStrategy to Lazy if we find it profitable later.
+  DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
+  bool Changed = eliminateTailRecursion(F, &TTI, &AA, &ORE, DTU);
 
   if (!Changed)
     return PreservedAnalyses::all();
   PreservedAnalyses PA;
   PA.preserve<GlobalsAA>();
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<PostDominatorTreeAnalysis>();
   return PA;
 }
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 7f61477..94f37da 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -646,7 +646,8 @@
 }
 
 ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
-                                             BasicBlock *Pred) {
+                                             BasicBlock *Pred,
+                                             DomTreeUpdater *DTU) {
   Instruction *UncondBranch = Pred->getTerminator();
   // Clone the return and add it to the end of the predecessor.
   Instruction *NewRet = RI->clone();
@@ -680,6 +681,10 @@
   // longer branch to them.
   BB->removePredecessor(Pred);
   UncondBranch->eraseFromParent();
+
+  if (DTU)
+    DTU->deleteEdge(Pred, BB);
+
   return cast<ReturnInst>(NewRet);
 }