[LoopUnroll] Respect the convergent attribute.
Summary:
Specifically, when we perform runtime loop unrolling of a loop that
contains a convergent op, we can only unroll k times, where k divides
the loop trip multiple.
Without this change, we'll happily unroll e.g. the following loop
for (int i = 0; i < N; ++i) {
if (i == 0) convergent_op();
foo();
}
into
int i = 0;
if (N % 2 == 1) {
convergent_op();
foo();
++i;
}
for (; i < N - 1; i += 2) {
if (i == 0) convergent_op();
foo();
foo();
}.
This is unsafe, because we've just added a control-flow dependency to
the convergent op in the prelude.
In general, runtime unrolling loops that contain convergent ops is safe
only if we don't have emit a prelude, which occurs when the unroll count
divides the trip multiple.
Reviewers: resistor
Subscribers: llvm-commits, mzolotukhin
Differential Revision: http://reviews.llvm.org/D17526
llvm-svn: 263509
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 5afdaae..e478722 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -362,7 +362,7 @@
/// ApproximateLoopSize - Approximate the size of the loop.
static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
- bool &NotDuplicatable,
+ bool &NotDuplicatable, bool &Convergent,
const TargetTransformInfo &TTI,
AssumptionCache *AC) {
SmallPtrSet<const Value *, 32> EphValues;
@@ -373,6 +373,7 @@
Metrics.analyzeBasicBlock(BB, TTI, EphValues);
NumCalls = Metrics.NumInlineCandidates;
NotDuplicatable = Metrics.notDuplicatable;
+ Convergent = Metrics.convergent;
unsigned LoopSize = Metrics.NumInsts;
@@ -568,8 +569,9 @@
unsigned NumInlineCandidates;
bool NotDuplicatable;
- unsigned LoopSize =
- ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, TTI, &AC);
+ bool Convergent;
+ unsigned LoopSize = ApproximateLoopSize(
+ L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC);
DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
// When computing the unrolled size, note that the conditional branch on the
@@ -623,6 +625,7 @@
if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) {
AllowRuntime = false;
}
+ bool DecreasedCountDueToConvergence = false;
if (Unrolling == Partial) {
bool AllowPartial = PragmaEnableUnroll || UP.Partial;
if (!AllowPartial && !CountSetExplicitly) {
@@ -643,14 +646,40 @@
<< "-unroll-runtime not given\n");
return false;
}
+
// Reduce unroll count to be the largest power-of-two factor of
// the original count which satisfies the threshold limit.
while (Count != 0 && UnrolledSize > UP.PartialThreshold) {
Count >>= 1;
UnrolledSize = (LoopSize-2) * Count + 2;
}
+
if (Count > UP.MaxCount)
Count = UP.MaxCount;
+
+ // If the loop contains a convergent operation, the prelude we'd add
+ // to do the first few instructions before we hit the unrolled loop
+ // is unsafe -- it adds a control-flow dependency to the convergent
+ // operation. Therefore Count must divide TripMultiple.
+ //
+ // TODO: This is quite conservative. In practice, convergent_op()
+ // is likely to be called unconditionally in the loop. In this
+ // case, the program would be ill-formed (on most architectures)
+ // unless n were the same on all threads in a thread group.
+ // Assuming n is the same on all threads, any kind of unrolling is
+ // safe. But currently llvm's notion of convergence isn't powerful
+ // enough to express this.
+ unsigned OrigCount = Count;
+ while (Convergent && Count != 0 && TripMultiple % Count != 0) {
+ DecreasedCountDueToConvergence = true;
+ Count >>= 1;
+ }
+ if (OrigCount > Count) {
+ DEBUG(dbgs() << " loop contains a convergent instruction, so unroll "
+ "count must divide the trip multiple, "
+ << TripMultiple << ". Reducing unroll count from "
+ << OrigCount << " to " << Count << ".\n");
+ }
DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n");
}
@@ -665,7 +694,16 @@
DebugLoc LoopLoc = L->getStartLoc();
Function *F = Header->getParent();
LLVMContext &Ctx = F->getContext();
- if ((PragmaCount > 0) && Count != OriginalCount) {
+ if (PragmaCount > 0 && DecreasedCountDueToConvergence) {
+ emitOptimizationRemarkMissed(
+ Ctx, DEBUG_TYPE, *F, LoopLoc,
+ Twine("Unable to unroll loop the number of times directed by "
+ "unroll_count pragma because the loop contains a convergent "
+ "instruction, and so must have an unroll count that divides "
+ "the loop trip multiple of ") +
+ Twine(TripMultiple) + ". Unrolling instead " + Twine(Count) +
+ " time(s).");
+ } else if ((PragmaCount > 0) && Count != OriginalCount) {
emitOptimizationRemarkMissed(
Ctx, DEBUG_TYPE, *F, LoopLoc,
"Unable to unroll loop the number of times directed by "