[SimpleLoopUnswitch] Unswitch by experimental.guard intrinsics

This patch adds support of `llvm.experimental.guard` intrinsics to non-trivial
simple loop unswitching. These intrinsics represent implicit control flow which
has pretty much the same semantics as usual conditional branches. The
algorithm of dealing with them is following:

- Consider guards as unswitching candidates;
- If a guard is considered the best candidate, turn it into a branch;
- Apply normal unswitching algorithm on this branch.

The patch has no compile time effect on code that does not contain any guards.

Differential Revision: https://reviews.llvm.org/D53744
Reviewed By: chandlerc

llvm-svn: 345387
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index e8f67a6..8b6935f 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/GuardUtils.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -59,6 +60,7 @@
 
 STATISTIC(NumBranches, "Number of branches unswitched");
 STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumGuards, "Number of guards turned into branches for unswitching");
 STATISTIC(NumTrivial, "Number of unswitches that are trivial");
 
 static cl::opt<bool> EnableNonTrivialUnswitch(
@@ -70,6 +72,11 @@
     UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
                       cl::desc("The cost threshold for unswitching a loop."));
 
+static cl::opt<bool> UnswitchGuards(
+    "simple-loop-unswitch-guards", cl::init(true), cl::Hidden,
+    cl::desc("If enabled, simple loop unswitching will also consider "
+             "llvm.experimental.guard intrinsics as unswitch candidates."));
+
 /// Collect all of the loop invariant input values transitively used by the
 /// homogeneous instruction graph from a given root.
 ///
@@ -2169,6 +2176,77 @@
   return Cost;
 }
 
+/// Turns a llvm.experimental.guard intrinsic into implicit control flow branch,
+/// making the following replacement:
+///
+///   <code before guard>
+///   call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+///   <code after guard>
+///
+/// into
+///
+///   <code before guard>
+///   br i1 %cond, label %guarded, label %deopt
+///
+/// guarded:
+///   <code after guard>
+///
+/// deopt:
+///   call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+///   unreachable
+///
+/// It also makes all relevant DT and LI updates, so that all structures are in
+/// valid state after this transform.
+static BranchInst *
+turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
+                    SmallVectorImpl<BasicBlock *> &ExitBlocks,
+                    DominatorTree &DT, LoopInfo &LI) {
+  SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+  LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n");
+  BasicBlock *CheckBB = GI->getParent();
+
+  // Remove all CheckBB's successors from DomTree. A block can be seen among
+  // successors more than once, but for DomTree it should be added only once.
+  SmallPtrSet<BasicBlock *, 4> Successors;
+  for (auto *Succ : successors(CheckBB))
+    if (Successors.insert(Succ).second)
+      DTUpdates.push_back({DominatorTree::Delete, CheckBB, Succ});
+
+  Instruction *DeoptBlockTerm =
+      SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true);
+  BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
+  // SplitBlockAndInsertIfThen inserts control flow that branches to
+  // DeoptBlockTerm if the condition is true.  We want the opposite.
+  CheckBI->swapSuccessors();
+
+  BasicBlock *GuardedBlock = CheckBI->getSuccessor(0);
+  GuardedBlock->setName("guarded");
+  CheckBI->getSuccessor(1)->setName("deopt");
+
+  // We now have a new exit block.
+  ExitBlocks.push_back(CheckBI->getSuccessor(1));
+
+  GI->moveBefore(DeoptBlockTerm);
+  GI->setArgOperand(0, ConstantInt::getFalse(GI->getContext()));
+
+  // Add new successors of CheckBB into DomTree.
+  for (auto *Succ : successors(CheckBB))
+    DTUpdates.push_back({DominatorTree::Insert, CheckBB, Succ});
+
+  // Now the blocks that used to be CheckBB's successors are GuardedBlock's
+  // successors.
+  for (auto *Succ : Successors)
+    DTUpdates.push_back({DominatorTree::Insert, GuardedBlock, Succ});
+
+  // Make proper changes to DT.
+  DT.applyUpdates(DTUpdates);
+  // Inform LI of a new loop block.
+  L.addBasicBlockToLoop(GuardedBlock, LI);
+
+  ++NumGuards;
+  return CheckBI;
+}
+
 static bool
 unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
                       AssumptionCache &AC, TargetTransformInfo &TTI,
@@ -2178,10 +2256,29 @@
   // loop which would be handled when visiting that inner loop).
   SmallVector<std::pair<Instruction *, TinyPtrVector<Value *>>, 4>
       UnswitchCandidates;
+
+  // Whether or not we should also collect guards in the loop.
+  bool CollectGuards = false;
+  if (UnswitchGuards) {
+    auto *GuardDecl = L.getHeader()->getParent()->getParent()->getFunction(
+        Intrinsic::getName(Intrinsic::experimental_guard));
+    if (GuardDecl && !GuardDecl->use_empty())
+      CollectGuards = true;
+  }
+
   for (auto *BB : L.blocks()) {
     if (LI.getLoopFor(BB) != &L)
       continue;
 
+    if (CollectGuards)
+      for (auto &I : *BB)
+        if (isGuard(&I)) {
+          auto *Cond = cast<IntrinsicInst>(&I)->getArgOperand(0);
+          // TODO: Support AND, OR conditions and partial unswitching.
+          if (!isa<Constant>(Cond) && L.isLoopInvariant(Cond))
+            UnswitchCandidates.push_back({&I, {Cond}});
+        }
+
     if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
       // We can only consider fully loop-invariant switch conditions as we need
       // to completely eliminate the switch after unswitching.
@@ -2346,9 +2443,12 @@
     // Now scale the cost by the number of unique successors minus one. We
     // subtract one because there is already at least one copy of the entire
     // loop. This is computing the new cost of unswitching a condition.
-    assert(Visited.size() > 1 &&
+    // Note that guards always have 2 unique successors that are implicit and
+    // will be materialized if we decide to unswitch it.
+    int SuccessorsCount = isGuard(&TI) ? 2 : Visited.size();
+    assert(SuccessorsCount > 1 &&
            "Cannot unswitch a condition without multiple distinct successors!");
-    return Cost * (Visited.size() - 1);
+    return Cost * (SuccessorsCount - 1);
   };
   Instruction *BestUnswitchTI = nullptr;
   int BestUnswitchCost;
@@ -2375,6 +2475,11 @@
     return false;
   }
 
+  // If the best candidate is a guard, turn it into a branch.
+  if (isGuard(BestUnswitchTI))
+    BestUnswitchTI = turnGuardIntoBranch(cast<IntrinsicInst>(BestUnswitchTI), L,
+                                         ExitBlocks, DT, LI);
+
   LLVM_DEBUG(dbgs() << "  Unswitching non-trivial (cost = "
                     << BestUnswitchCost << ") terminator: " << *BestUnswitchTI
                     << "\n");