blob: dc526a20c903293b3f0e32bfbd6e631668f083bb [file] [log] [blame]
Michael Kupersteinb151a642016-11-30 21:13:57 +00001//===-- UnrollLoopPeel.cpp - Loop peeling utilities -----------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements some loop unrolling utilities for peeling loops
11// with dynamically inferred (from PGO) trip counts. See LoopUnroll.cpp for
12// unrolling loops with compile-time constant trip counts.
13//
14//===----------------------------------------------------------------------===//
15
16#include "llvm/ADT/Statistic.h"
17#include "llvm/Analysis/LoopIterator.h"
18#include "llvm/Analysis/LoopPass.h"
19#include "llvm/Analysis/ScalarEvolution.h"
20#include "llvm/Analysis/TargetTransformInfo.h"
21#include "llvm/IR/BasicBlock.h"
22#include "llvm/IR/Dominators.h"
23#include "llvm/IR/MDBuilder.h"
24#include "llvm/IR/Metadata.h"
25#include "llvm/IR/Module.h"
26#include "llvm/Support/Debug.h"
27#include "llvm/Support/raw_ostream.h"
28#include "llvm/Transforms/Scalar.h"
29#include "llvm/Transforms/Utils/BasicBlockUtils.h"
30#include "llvm/Transforms/Utils/Cloning.h"
31#include "llvm/Transforms/Utils/LoopUtils.h"
32#include "llvm/Transforms/Utils/UnrollLoop.h"
33#include <algorithm>
34
35using namespace llvm;
36
37#define DEBUG_TYPE "loop-unroll"
38STATISTIC(NumPeeled, "Number of loops peeled");
39
40static cl::opt<unsigned> UnrollPeelMaxCount(
41 "unroll-peel-max-count", cl::init(7), cl::Hidden,
42 cl::desc("Max average trip count which will cause loop peeling."));
43
44static cl::opt<unsigned> UnrollForcePeelCount(
45 "unroll-force-peel-count", cl::init(0), cl::Hidden,
46 cl::desc("Force a peel count regardless of profiling information."));
47
48// Check whether we are capable of peeling this loop.
49static bool canPeel(Loop *L) {
50 // Make sure the loop is in simplified form
51 if (!L->isLoopSimplifyForm())
52 return false;
53
54 // Only peel loops that contain a single exit
55 if (!L->getExitingBlock() || !L->getUniqueExitBlock())
56 return false;
57
58 return true;
59}
60
61// Return the number of iterations we want to peel off.
62void llvm::computePeelCount(Loop *L, unsigned LoopSize,
63 TargetTransformInfo::UnrollingPreferences &UP) {
64 UP.PeelCount = 0;
65 if (!canPeel(L))
66 return;
67
68 // Only try to peel innermost loops.
69 if (!L->empty())
70 return;
71
72 // If the user provided a peel count, use that.
73 bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0;
74 if (UserPeelCount) {
75 DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
76 << " iterations.\n");
77 UP.PeelCount = UnrollForcePeelCount;
78 return;
79 }
80
81 // If we don't know the trip count, but have reason to believe the average
82 // trip count is low, peeling should be beneficial, since we will usually
83 // hit the peeled section.
84 // We only do this in the presence of profile information, since otherwise
85 // our estimates of the trip count are not reliable enough.
86 if (UP.AllowPeeling && L->getHeader()->getParent()->getEntryCount()) {
87 Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L);
88 if (!PeelCount)
89 return;
90
91 DEBUG(dbgs() << "Profile-based estimated trip count is " << *PeelCount
92 << "\n");
93
94 if (*PeelCount) {
95 if ((*PeelCount <= UnrollPeelMaxCount) &&
96 (LoopSize * (*PeelCount + 1) <= UP.Threshold)) {
97 DEBUG(dbgs() << "Peeling first " << *PeelCount << " iterations.\n");
98 UP.PeelCount = *PeelCount;
99 return;
100 }
101 DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n");
102 DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
103 DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1) << "\n");
104 DEBUG(dbgs() << "Max peel cost: " << UP.Threshold << "\n");
105 }
106 }
107
108 return;
109}
110
111/// \brief Update the branch weights of the latch of a peeled-off loop
112/// iteration.
113/// This sets the branch weights for the latch of the recently peeled off loop
114/// iteration correctly.
115/// Our goal is to make sure that:
116/// a) The total weight of all the copies of the loop body is preserved.
117/// b) The total weight of the loop exit is preserved.
118/// c) The body weight is reasonably distributed between the peeled iterations.
119///
120/// \param Header The copy of the header block that belongs to next iteration.
121/// \param LatchBR The copy of the latch branch that belongs to this iteration.
122/// \param IterNumber The serial number of the iteration that was just
123/// peeled off.
124/// \param AvgIters The average number of iterations we expect the loop to have.
125/// \param[in,out] PeeledHeaderWeight The total number of dynamic loop
126/// iterations that are unaccounted for. As an input, it represents the number
127/// of times we expect to enter the header of the iteration currently being
128/// peeled off. The output is the number of times we expect to enter the
129/// header of the next iteration.
130static void updateBranchWeights(BasicBlock *Header, BranchInst *LatchBR,
131 unsigned IterNumber, unsigned AvgIters,
132 uint64_t &PeeledHeaderWeight) {
133
134 // FIXME: Pick a more realistic distribution.
135 // Currently the proportion of weight we assign to the fall-through
136 // side of the branch drops linearly with the iteration number, and we use
137 // a 0.9 fudge factor to make the drop-off less sharp...
138 if (PeeledHeaderWeight) {
139 uint64_t FallThruWeight =
140 PeeledHeaderWeight * ((float)(AvgIters - IterNumber) / AvgIters * 0.9);
141 uint64_t ExitWeight = PeeledHeaderWeight - FallThruWeight;
142 PeeledHeaderWeight -= ExitWeight;
143
144 unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
145 MDBuilder MDB(LatchBR->getContext());
146 MDNode *WeightNode =
147 HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThruWeight)
148 : MDB.createBranchWeights(FallThruWeight, ExitWeight);
149 LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode);
150 }
151}
152
153/// \brief Clones the body of the loop L, putting it between \p InsertTop and \p
154/// InsertBot.
155/// \param IterNumber The serial number of the iteration currently being
156/// peeled off.
157/// \param Exit The exit block of the original loop.
158/// \param[out] NewBlocks A list of the the blocks in the newly created clone
159/// \param[out] VMap The value map between the loop and the new clone.
160/// \param LoopBlocks A helper for DFS-traversal of the loop.
161/// \param LVMap A value-map that maps instructions from the original loop to
162/// instructions in the last peeled-off iteration.
163static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,
164 BasicBlock *InsertBot, BasicBlock *Exit,
165 SmallVectorImpl<BasicBlock *> &NewBlocks,
166 LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
167 ValueToValueMapTy &LVMap, LoopInfo *LI) {
168
169 BasicBlock *Header = L->getHeader();
170 BasicBlock *Latch = L->getLoopLatch();
171 BasicBlock *PreHeader = L->getLoopPreheader();
172
173 Function *F = Header->getParent();
174 LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
175 LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
176 Loop *ParentLoop = L->getParentLoop();
177
178 // For each block in the original loop, create a new copy,
179 // and update the value map with the newly created values.
180 for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
181 BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".peel", F);
182 NewBlocks.push_back(NewBB);
183
184 if (ParentLoop)
185 ParentLoop->addBasicBlockToLoop(NewBB, *LI);
186
187 VMap[*BB] = NewBB;
188 }
189
190 // Hook-up the control flow for the newly inserted blocks.
191 // The new header is hooked up directly to the "top", which is either
192 // the original loop preheader (for the first iteration) or the previous
193 // iteration's exiting block (for every other iteration)
194 InsertTop->getTerminator()->setSuccessor(0, cast<BasicBlock>(VMap[Header]));
195
196 // Similarly, for the latch:
197 // The original exiting edge is still hooked up to the loop exit.
198 // The backedge now goes to the "bottom", which is either the loop's real
199 // header (for the last peeled iteration) or the copied header of the next
200 // iteration (for every other iteration)
201 BranchInst *LatchBR =
202 cast<BranchInst>(cast<BasicBlock>(VMap[Latch])->getTerminator());
203 unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
204 LatchBR->setSuccessor(HeaderIdx, InsertBot);
205 LatchBR->setSuccessor(1 - HeaderIdx, Exit);
206
207 // The new copy of the loop body starts with a bunch of PHI nodes
208 // that pick an incoming value from either the preheader, or the previous
209 // loop iteration. Since this copy is no longer part of the loop, we
210 // resolve this statically:
211 // For the first iteration, we use the value from the preheader directly.
212 // For any other iteration, we replace the phi with the value generated by
213 // the immediately preceding clone of the loop body (which represents
214 // the previous iteration).
215 for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
216 PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
217 if (IterNumber == 0) {
218 VMap[&*I] = NewPHI->getIncomingValueForBlock(PreHeader);
219 } else {
220 Value *LatchVal = NewPHI->getIncomingValueForBlock(Latch);
221 Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);
222 if (LatchInst && L->contains(LatchInst))
223 VMap[&*I] = LVMap[LatchInst];
224 else
225 VMap[&*I] = LatchVal;
226 }
227 cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
228 }
229
230 // Fix up the outgoing values - we need to add a value for the iteration
231 // we've just created. Note that this must happen *after* the incoming
232 // values are adjusted, since the value going out of the latch may also be
233 // a value coming into the header.
234 for (BasicBlock::iterator I = Exit->begin(); isa<PHINode>(I); ++I) {
235 PHINode *PHI = cast<PHINode>(I);
236 Value *LatchVal = PHI->getIncomingValueForBlock(Latch);
237 Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);
238 if (LatchInst && L->contains(LatchInst))
239 LatchVal = VMap[LatchVal];
240 PHI->addIncoming(LatchVal, cast<BasicBlock>(VMap[Latch]));
241 }
242
243 // LastValueMap is updated with the values for the current loop
244 // which are used the next time this function is called.
245 for (const auto &KV : VMap)
246 LVMap[KV.first] = KV.second;
247}
248
249/// \brief Peel off the first \p PeelCount iterations of loop \p L.
250///
251/// Note that this does not peel them off as a single straight-line block.
252/// Rather, each iteration is peeled off separately, and needs to check the
253/// exit condition.
254/// For loops that dynamically execute \p PeelCount iterations or less
255/// this provides a benefit, since the peeled off iterations, which account
256/// for the bulk of dynamic execution, can be further simplified by scalar
257/// optimizations.
258bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
259 ScalarEvolution *SE, DominatorTree *DT,
260 bool PreserveLCSSA) {
261 if (!canPeel(L))
262 return false;
263
264 LoopBlocksDFS LoopBlocks(L);
265 LoopBlocks.perform(LI);
266
267 BasicBlock *Header = L->getHeader();
268 BasicBlock *PreHeader = L->getLoopPreheader();
269 BasicBlock *Latch = L->getLoopLatch();
270 BasicBlock *Exit = L->getUniqueExitBlock();
271
272 Function *F = Header->getParent();
273
274 // Set up all the necessary basic blocks. It is convenient to split the
275 // preheader into 3 parts - two blocks to anchor the peeled copy of the loop
276 // body, and a new preheader for the "real" loop.
277
278 // Peeling the first iteration transforms.
279 //
280 // PreHeader:
281 // ...
282 // Header:
283 // LoopBody
284 // If (cond) goto Header
285 // Exit:
286 //
287 // into
288 //
289 // InsertTop:
290 // LoopBody
291 // If (!cond) goto Exit
292 // InsertBot:
293 // NewPreHeader:
294 // ...
295 // Header:
296 // LoopBody
297 // If (cond) goto Header
298 // Exit:
299 //
300 // Each following iteration will split the current bottom anchor in two,
301 // and put the new copy of the loop body between these two blocks. That is,
302 // after peeling another iteration from the example above, we'll split
303 // InsertBot, and get:
304 //
305 // InsertTop:
306 // LoopBody
307 // If (!cond) goto Exit
308 // InsertBot:
309 // LoopBody
310 // If (!cond) goto Exit
311 // InsertBot.next:
312 // NewPreHeader:
313 // ...
314 // Header:
315 // LoopBody
316 // If (cond) goto Header
317 // Exit:
318
319 BasicBlock *InsertTop = SplitEdge(PreHeader, Header, DT, LI);
320 BasicBlock *InsertBot =
321 SplitBlock(InsertTop, InsertTop->getTerminator(), DT, LI);
322 BasicBlock *NewPreHeader =
323 SplitBlock(InsertBot, InsertBot->getTerminator(), DT, LI);
324
325 InsertTop->setName(Header->getName() + ".peel.begin");
326 InsertBot->setName(Header->getName() + ".peel.next");
327 NewPreHeader->setName(PreHeader->getName() + ".peel.newph");
328
329 ValueToValueMapTy LVMap;
330
331 // If we have branch weight information, we'll want to update it for the
332 // newly created branches.
333 BranchInst *LatchBR =
334 cast<BranchInst>(cast<BasicBlock>(Latch)->getTerminator());
335 unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
336
337 uint64_t TrueWeight, FalseWeight;
338 uint64_t ExitWeight = 0, BackEdgeWeight = 0;
339 if (LatchBR->extractProfMetadata(TrueWeight, FalseWeight)) {
340 ExitWeight = HeaderIdx ? TrueWeight : FalseWeight;
341 BackEdgeWeight = HeaderIdx ? FalseWeight : TrueWeight;
342 }
343
344 // For each peeled-off iteration, make a copy of the loop.
345 for (unsigned Iter = 0; Iter < PeelCount; ++Iter) {
346 SmallVector<BasicBlock *, 8> NewBlocks;
347 ValueToValueMapTy VMap;
348
349 // The exit weight of the previous iteration is the header entry weight
350 // of the current iteration. So this is exactly how many dynamic iterations
351 // the current peeled-off static iteration uses up.
352 // FIXME: due to the way the distribution is constructed, we need a
353 // guard here to make sure we don't end up with non-positive weights.
354 if (ExitWeight < BackEdgeWeight)
355 BackEdgeWeight -= ExitWeight;
356 else
357 BackEdgeWeight = 1;
358
359 cloneLoopBlocks(L, Iter, InsertTop, InsertBot, Exit,
360 NewBlocks, LoopBlocks, VMap, LVMap, LI);
361 updateBranchWeights(InsertBot, cast<BranchInst>(VMap[LatchBR]), Iter,
362 PeelCount, ExitWeight);
363
364 InsertTop = InsertBot;
365 InsertBot = SplitBlock(InsertBot, InsertBot->getTerminator(), DT, LI);
366 InsertBot->setName(Header->getName() + ".peel.next");
367
368 F->getBasicBlockList().splice(InsertTop->getIterator(),
369 F->getBasicBlockList(),
370 NewBlocks[0]->getIterator(), F->end());
371
372 // Remap to use values from the current iteration instead of the
373 // previous one.
374 remapInstructionsInBlocks(NewBlocks, VMap);
375 }
376
377 // Now adjust the phi nodes in the loop header to get their initial values
378 // from the last peeled-off iteration instead of the preheader.
379 for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
380 PHINode *PHI = cast<PHINode>(I);
381 Value *NewVal = PHI->getIncomingValueForBlock(Latch);
382 Instruction *LatchInst = dyn_cast<Instruction>(NewVal);
383 if (LatchInst && L->contains(LatchInst))
384 NewVal = LVMap[LatchInst];
385
386 PHI->setIncomingValue(PHI->getBasicBlockIndex(NewPreHeader), NewVal);
387 }
388
389 // Adjust the branch weights on the loop exit.
390 if (ExitWeight) {
391 MDBuilder MDB(LatchBR->getContext());
392 MDNode *WeightNode =
393 HeaderIdx ? MDB.createBranchWeights(ExitWeight, BackEdgeWeight)
394 : MDB.createBranchWeights(BackEdgeWeight, ExitWeight);
395 LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode);
396 }
397
398 // If the loop is nested, we changed the parent loop, update SE.
399 if (Loop *ParentLoop = L->getParentLoop())
400 SE->forgetLoop(ParentLoop);
401
402 NumPeeled++;
403
404 return true;
405}