[LoopUnroll] Respect the convergent attribute.

Summary:
Specifically, when we perform runtime loop unrolling of a loop that
contains a convergent op, we can only unroll k times, where k divides
the loop trip multiple.

Without this change, we'll happily unroll e.g. the following loop

  for (int i = 0; i < N; ++i) {
    if (i == 0) convergent_op();
    foo();
  }

into

  int i = 0;
  if (N % 2 == 1) {
    convergent_op();
    foo();
    ++i;
  }
  for (; i < N - 1; i += 2) {
    if (i == 0) convergent_op();
    foo();
    foo();
  }.

This is unsafe, because we've just added a control-flow dependency to
the convergent op in the prelude.

In general, runtime unrolling loops that contain convergent ops is safe
only if we don't have emit a prelude, which occurs when the unroll count
divides the trip multiple.

Reviewers: resistor

Subscribers: llvm-commits, mzolotukhin

Differential Revision: http://reviews.llvm.org/D17526

llvm-svn: 263509
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 5afdaae..e478722 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -362,7 +362,7 @@
 
 /// ApproximateLoopSize - Approximate the size of the loop.
 static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
-                                    bool &NotDuplicatable,
+                                    bool &NotDuplicatable, bool &Convergent,
                                     const TargetTransformInfo &TTI,
                                     AssumptionCache *AC) {
   SmallPtrSet<const Value *, 32> EphValues;
@@ -373,6 +373,7 @@
     Metrics.analyzeBasicBlock(BB, TTI, EphValues);
   NumCalls = Metrics.NumInlineCandidates;
   NotDuplicatable = Metrics.notDuplicatable;
+  Convergent = Metrics.convergent;
 
   unsigned LoopSize = Metrics.NumInsts;
 
@@ -568,8 +569,9 @@
 
   unsigned NumInlineCandidates;
   bool NotDuplicatable;
-  unsigned LoopSize =
-      ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, TTI, &AC);
+  bool Convergent;
+  unsigned LoopSize = ApproximateLoopSize(
+      L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC);
   DEBUG(dbgs() << "  Loop Size = " << LoopSize << "\n");
 
   // When computing the unrolled size, note that the conditional branch on the
@@ -623,6 +625,7 @@
   if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) {
     AllowRuntime = false;
   }
+  bool DecreasedCountDueToConvergence = false;
   if (Unrolling == Partial) {
     bool AllowPartial = PragmaEnableUnroll || UP.Partial;
     if (!AllowPartial && !CountSetExplicitly) {
@@ -643,14 +646,40 @@
                    << "-unroll-runtime not given\n");
       return false;
     }
+
     // Reduce unroll count to be the largest power-of-two factor of
     // the original count which satisfies the threshold limit.
     while (Count != 0 && UnrolledSize > UP.PartialThreshold) {
       Count >>= 1;
       UnrolledSize = (LoopSize-2) * Count + 2;
     }
+
     if (Count > UP.MaxCount)
       Count = UP.MaxCount;
+
+    // If the loop contains a convergent operation, the prelude we'd add
+    // to do the first few instructions before we hit the unrolled loop
+    // is unsafe -- it adds a control-flow dependency to the convergent
+    // operation.  Therefore Count must divide TripMultiple.
+    //
+    // TODO: This is quite conservative.  In practice, convergent_op()
+    // is likely to be called unconditionally in the loop.  In this
+    // case, the program would be ill-formed (on most architectures)
+    // unless n were the same on all threads in a thread group.
+    // Assuming n is the same on all threads, any kind of unrolling is
+    // safe.  But currently llvm's notion of convergence isn't powerful
+    // enough to express this.
+    unsigned OrigCount = Count;
+    while (Convergent && Count != 0 && TripMultiple % Count != 0) {
+      DecreasedCountDueToConvergence = true;
+      Count >>= 1;
+    }
+    if (OrigCount > Count) {
+      DEBUG(dbgs() << "  loop contains a convergent instruction, so unroll "
+                      "count must divide the trip multiple, "
+                   << TripMultiple << ".  Reducing unroll count from "
+                   << OrigCount << " to " << Count << ".\n");
+    }
     DEBUG(dbgs() << "  partially unrolling with count: " << Count << "\n");
   }
 
@@ -665,7 +694,16 @@
     DebugLoc LoopLoc = L->getStartLoc();
     Function *F = Header->getParent();
     LLVMContext &Ctx = F->getContext();
-    if ((PragmaCount > 0) && Count != OriginalCount) {
+    if (PragmaCount > 0 && DecreasedCountDueToConvergence) {
+      emitOptimizationRemarkMissed(
+          Ctx, DEBUG_TYPE, *F, LoopLoc,
+          Twine("Unable to unroll loop the number of times directed by "
+                "unroll_count pragma because the loop contains a convergent "
+                "instruction, and so must have an unroll count that divides "
+                "the loop trip multiple of ") +
+              Twine(TripMultiple) + ".  Unrolling instead " + Twine(Count) +
+              " time(s).");
+    } else if ((PragmaCount > 0) && Count != OriginalCount) {
       emitOptimizationRemarkMissed(
           Ctx, DEBUG_TYPE, *F, LoopLoc,
           "Unable to unroll loop the number of times directed by "