[LoopUnroll] Enable option to peel remainder loop

On some targets, the penalty of executing runtime unrolling checks
and then not the unrolled loop can be significantly detrimental to
performance. This results in the need to be more conservative with
the unroll count, keeping a trip count of 2 reduces the overhead as
well as increasing the chance of the unrolled body being executed. But
being conservative leaves performance gains on the table.

This patch enables the unrolling of the remainder loop introduced by
runtime unrolling. This can help reduce the overhead of misunrolled
loops because the cost of non-taken branches is much less than the
cost of the backedge that would normally be executed in the remainder
loop. This allows larger unroll factors to be used without suffering
performance loses with smaller iteration counts.

Differential Revision: https://reviews.llvm.org/D36309

llvm-svn: 310824
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index a6d7849..fb50a35 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -115,6 +115,10 @@
                        cl::desc("Allows loops to be peeled when the dynamic "
                                 "trip count is known to be low."));
 
+static cl::opt<bool> UnrollUnrollRemainder(
+  "unroll-remainder", cl::Hidden,
+  cl::desc("Allow the loop remainder to be unrolled."));
+
 // This option isn't ever intended to be enabled, it serves to allow
 // experiments to check the assumptions about when this kind of revisit is
 // necessary.
@@ -153,6 +157,7 @@
   UP.Partial = false;
   UP.Runtime = false;
   UP.AllowRemainder = true;
+  UP.UnrollRemainder = false;
   UP.AllowExpensiveTripCount = false;
   UP.Force = false;
   UP.UpperBound = false;
@@ -188,6 +193,8 @@
     UP.UpperBound = false;
   if (UnrollAllowPeeling.getNumOccurrences() > 0)
     UP.AllowPeeling = UnrollAllowPeeling;
+  if (UnrollUnrollRemainder.getNumOccurrences() > 0)
+    UP.UnrollRemainder = UnrollUnrollRemainder;
 
   // Apply user values provided by argument
   if (UserThreshold.hasValue()) {
@@ -1034,7 +1041,8 @@
   // Unroll the loop.
   if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime,
                   UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero,
-                  TripMultiple, UP.PeelCount, LI, &SE, &DT, &AC, &ORE,
+                  TripMultiple, UP.PeelCount, UP.UnrollRemainder,
+                  LI, &SE, &DT, &AC, &ORE,
                   PreserveLCSSA))
     return false;
 
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index f2527f8..835f439 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -295,7 +295,8 @@
 bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
                       bool AllowRuntime, bool AllowExpensiveTripCount,
                       bool PreserveCondBr, bool PreserveOnlyFirst,
-                      unsigned TripMultiple, unsigned PeelCount, LoopInfo *LI,
+                      unsigned TripMultiple, unsigned PeelCount,
+                      bool UnrollRemainder, LoopInfo *LI,
                       ScalarEvolution *SE, DominatorTree *DT,
                       AssumptionCache *AC, OptimizationRemarkEmitter *ORE,
                       bool PreserveLCSSA) {
@@ -418,7 +419,8 @@
 
   if (RuntimeTripCount && TripMultiple % Count != 0 &&
       !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
-                                  EpilogProfitability, LI, SE, DT,
+                                  EpilogProfitability, UnrollRemainder,
+                                  LI, SE, DT, AC, ORE,
                                   PreserveLCSSA)) {
     if (Force)
       RuntimeTripCount = false;
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 2631255..cd5e977 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -294,7 +294,8 @@
 /// Return the new cloned loop that is created when CreateRemainderLoop is true.
 static Loop *
 CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
-                const bool UseEpilogRemainder, BasicBlock *InsertTop,
+                const bool UseEpilogRemainder, const bool UnrollRemainder,
+                BasicBlock *InsertTop,
                 BasicBlock *InsertBot, BasicBlock *Preheader,
                 std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
                 ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) {
@@ -413,10 +414,13 @@
     }
 
     LLVMContext &Context = NewLoop->getHeader()->getContext();
-    SmallVector<Metadata *, 1> DisableOperands;
-    DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
-    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
-    MDs.push_back(DisableNode);
+    if (!UnrollRemainder) {
+      SmallVector<Metadata *, 1> DisableOperands;
+      DisableOperands.push_back(MDString::get(Context,
+                                              "llvm.loop.unroll.disable"));
+      MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+      MDs.push_back(DisableNode);
+    }
 
     MDNode *NewLoopID = MDNode::get(Context, MDs);
     // Set operand 0 to refer to the loop id itself.
@@ -525,8 +529,11 @@
 bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
                                       bool AllowExpensiveTripCount,
                                       bool UseEpilogRemainder,
+                                      bool UnrollRemainder,
                                       LoopInfo *LI, ScalarEvolution *SE,
-                                      DominatorTree *DT, bool PreserveLCSSA) {
+                                      DominatorTree *DT, AssumptionCache *AC,
+                                      OptimizationRemarkEmitter *ORE,
+                                      bool PreserveLCSSA) {
   DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
   DEBUG(L->dump());
 
@@ -739,7 +746,8 @@
   BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
   BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
   Loop *remainderLoop = CloneLoopBlocks(
-      L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, InsertBot,
+      L, ModVal, CreateRemainderLoop, UseEpilogRemainder, UnrollRemainder,
+      InsertTop, InsertBot,
       NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);
 
   // Insert the cloned blocks into the function.
@@ -883,6 +891,15 @@
       formDedicatedExitBlocks(remainderLoop, DT, LI, PreserveLCSSA);
   }
 
+  if (remainderLoop && UnrollRemainder) {
+    UnrollLoop(remainderLoop, /*Count*/Count - 1, /*TripCount*/Count - 1,
+               /*Force*/false, /*AllowRuntime*/false,
+               /*AllowExpensiveTripCount*/false, /*PreserveCondBr*/true,
+               /*PreserveOnlyFirst*/false, /*TripMultiple*/1,
+               /*PeelCount*/0, /*UnrollRemainder*/false, LI, SE, DT, AC, ORE,
+               PreserveLCSSA);
+  }
+
   NumRuntimeUnrolled++;
   return true;
 }