blob: 4d0d6f4b7cf6d2a58836840fc5c96b8590382671 [file] [log] [blame]
David Green963401d2018-07-01 12:47:30 +00001//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements loop unroll and jam as a routine, much like
11// LoopUnroll.cpp implements loop unroll.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/ADT/SmallPtrSet.h"
16#include "llvm/ADT/Statistic.h"
17#include "llvm/Analysis/AssumptionCache.h"
18#include "llvm/Analysis/DependenceAnalysis.h"
19#include "llvm/Analysis/InstructionSimplify.h"
20#include "llvm/Analysis/LoopAnalysisManager.h"
21#include "llvm/Analysis/LoopIterator.h"
22#include "llvm/Analysis/LoopPass.h"
23#include "llvm/Analysis/OptimizationRemarkEmitter.h"
24#include "llvm/Analysis/ScalarEvolution.h"
25#include "llvm/Analysis/ScalarEvolutionExpander.h"
26#include "llvm/Analysis/Utils/Local.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/DataLayout.h"
29#include "llvm/IR/DebugInfoMetadata.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/IntrinsicInst.h"
32#include "llvm/IR/LLVMContext.h"
33#include "llvm/Support/Debug.h"
34#include "llvm/Support/raw_ostream.h"
35#include "llvm/Transforms/Utils/BasicBlockUtils.h"
36#include "llvm/Transforms/Utils/Cloning.h"
37#include "llvm/Transforms/Utils/LoopSimplify.h"
38#include "llvm/Transforms/Utils/LoopUtils.h"
39#include "llvm/Transforms/Utils/SimplifyIndVar.h"
40#include "llvm/Transforms/Utils/UnrollLoop.h"
41using namespace llvm;
42
43#define DEBUG_TYPE "loop-unroll-and-jam"
44
45STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
46STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
47
David Green2557e432018-07-12 10:44:47 +000048typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
David Green963401d2018-07-01 12:47:30 +000049
50// Partition blocks in an outer/inner loop pair into blocks before and after
51// the loop
52static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
David Green2557e432018-07-12 10:44:47 +000053 BasicBlockSet &ForeBlocks,
54 BasicBlockSet &SubLoopBlocks,
55 BasicBlockSet &AftBlocks,
David Green963401d2018-07-01 12:47:30 +000056 DominatorTree *DT) {
57 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
David Green2557e432018-07-12 10:44:47 +000058 SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
David Green963401d2018-07-01 12:47:30 +000059
60 for (BasicBlock *BB : L->blocks()) {
61 if (!SubLoop->contains(BB)) {
62 if (DT->dominates(SubLoopLatch, BB))
David Green2557e432018-07-12 10:44:47 +000063 AftBlocks.insert(BB);
David Green963401d2018-07-01 12:47:30 +000064 else
David Green2557e432018-07-12 10:44:47 +000065 ForeBlocks.insert(BB);
David Green963401d2018-07-01 12:47:30 +000066 }
67 }
68
69 // Check that all blocks in ForeBlocks together dominate the subloop
70 // TODO: This might ideally be done better with a dominator/postdominators.
71 BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
72 for (BasicBlock *BB : ForeBlocks) {
73 if (BB == SubLoopPreHeader)
74 continue;
75 TerminatorInst *TI = BB->getTerminator();
76 for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
David Green2557e432018-07-12 10:44:47 +000077 if (!ForeBlocks.count(TI->getSuccessor(i)))
David Green963401d2018-07-01 12:47:30 +000078 return false;
79 }
80
81 return true;
82}
83
84// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
David Green2557e432018-07-12 10:44:47 +000085static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
86 BasicBlock *Latch,
87 Instruction *InsertLoc,
88 BasicBlockSet &AftBlocks) {
David Green963401d2018-07-01 12:47:30 +000089 // We need to ensure we move the instructions in the correct order,
90 // starting with the earliest required instruction and moving forward.
91 std::vector<Instruction *> Worklist;
92 std::vector<Instruction *> Visited;
93 for (auto &Phi : Header->phis()) {
94 Value *V = Phi.getIncomingValueForBlock(Latch);
95 if (Instruction *I = dyn_cast<Instruction>(V))
96 Worklist.push_back(I);
97 }
98
99 while (!Worklist.empty()) {
100 Instruction *I = Worklist.back();
101 Worklist.pop_back();
David Green2557e432018-07-12 10:44:47 +0000102 if (!AftBlocks.count(I->getParent()))
David Green963401d2018-07-01 12:47:30 +0000103 continue;
104
105 Visited.push_back(I);
106 for (auto &U : I->operands())
107 if (Instruction *II = dyn_cast<Instruction>(U))
108 Worklist.push_back(II);
109 }
110
111 // Move all instructions in program order to before the InsertLoc
112 BasicBlock *InsertLocBB = InsertLoc->getParent();
113 for (Instruction *I : reverse(Visited)) {
114 if (I->getParent() != InsertLocBB)
115 I->moveBefore(InsertLoc);
116 }
117}
118
119/*
120 This method performs Unroll and Jam. For a simple loop like:
121 for (i = ..)
122 Fore(i)
123 for (j = ..)
124 SubLoop(i, j)
125 Aft(i)
126
127 Instead of doing normal inner or outer unrolling, we do:
128 for (i = .., i+=2)
129 Fore(i)
130 Fore(i+1)
131 for (j = ..)
132 SubLoop(i, j)
133 SubLoop(i+1, j)
134 Aft(i)
135 Aft(i+1)
136
137 So the outer loop is essetially unrolled and then the inner loops are fused
138 ("jammed") together into a single loop. This can increase speed when there
139 are loads in SubLoop that are invariant to i, as they become shared between
140 the now jammed inner loops.
141
142 We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
143 Fore blocks are those before the inner loop, Aft are those after. Normal
144 Unroll code is used to copy each of these sets of blocks and the results are
145 combined together into the final form above.
146
147 isSafeToUnrollAndJam should be used prior to calling this to make sure the
148 unrolling will be valid. Checking profitablility is also advisable.
149*/
150LoopUnrollResult
151llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
152 unsigned TripMultiple, bool UnrollRemainder,
153 LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
154 AssumptionCache *AC, OptimizationRemarkEmitter *ORE) {
155
156 // When we enter here we should have already checked that it is safe
157 BasicBlock *Header = L->getHeader();
158 assert(L->getSubLoops().size() == 1);
159 Loop *SubLoop = *L->begin();
160
161 // Don't enter the unroll code if there is nothing to do.
162 if (TripCount == 0 && Count < 2) {
163 LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
164 return LoopUnrollResult::Unmodified;
165 }
166
167 assert(Count > 0);
168 assert(TripMultiple > 0);
169 assert(TripCount == 0 || TripCount % TripMultiple == 0);
170
171 // Are we eliminating the loop control altogether?
172 bool CompletelyUnroll = (Count == TripCount);
173
174 // We use the runtime remainder in cases where we don't know trip multiple
175 if (TripMultiple == 1 || TripMultiple % Count != 0) {
176 if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
177 /*UseEpilogRemainder*/ true,
178 UnrollRemainder, LI, SE, DT, AC, true)) {
179 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
180 "generated when assuming runtime trip count\n");
181 return LoopUnrollResult::Unmodified;
182 }
183 }
184
185 // Notify ScalarEvolution that the loop will be substantially changed,
186 // if not outright eliminated.
187 if (SE) {
188 SE->forgetLoop(L);
189 SE->forgetLoop(SubLoop);
190 }
191
192 using namespace ore;
193 // Report the unrolling decision.
194 if (CompletelyUnroll) {
195 LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
196 << Header->getName() << " with trip count " << TripCount
197 << "!\n");
198 ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
199 L->getHeader())
200 << "completely unroll and jammed loop with "
201 << NV("UnrollCount", TripCount) << " iterations");
202 } else {
203 auto DiagBuilder = [&]() {
204 OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
205 L->getHeader());
206 return Diag << "unroll and jammed loop by a factor of "
207 << NV("UnrollCount", Count);
208 };
209
210 LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
211 << " by " << Count);
212 if (TripMultiple != 1) {
213 LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
214 ORE->emit([&]() {
215 return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
216 << " trips per branch";
217 });
218 } else {
219 LLVM_DEBUG(dbgs() << " with run-time trip count");
220 ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
221 }
222 LLVM_DEBUG(dbgs() << "!\n");
223 }
224
225 BasicBlock *Preheader = L->getLoopPreheader();
226 BasicBlock *LatchBlock = L->getLoopLatch();
227 BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
228 assert(Preheader && LatchBlock && Header);
229 assert(BI && !BI->isUnconditional());
230 bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
231 BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
232 bool SubLoopContinueOnTrue = SubLoop->contains(
233 SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
234
235 // Partition blocks in an outer/inner loop pair into blocks before and after
236 // the loop
David Green2557e432018-07-12 10:44:47 +0000237 BasicBlockSet SubLoopBlocks;
238 BasicBlockSet ForeBlocks;
239 BasicBlockSet AftBlocks;
David Green963401d2018-07-01 12:47:30 +0000240 partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
241 DT);
242
243 // We keep track of the entering/first and exiting/last block of each of
244 // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
245 // blocks easier.
246 std::vector<BasicBlock *> ForeBlocksFirst;
247 std::vector<BasicBlock *> ForeBlocksLast;
248 std::vector<BasicBlock *> SubLoopBlocksFirst;
249 std::vector<BasicBlock *> SubLoopBlocksLast;
250 std::vector<BasicBlock *> AftBlocksFirst;
251 std::vector<BasicBlock *> AftBlocksLast;
252 ForeBlocksFirst.push_back(Header);
253 ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
254 SubLoopBlocksFirst.push_back(SubLoop->getHeader());
255 SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
256 AftBlocksFirst.push_back(SubLoop->getExitBlock());
257 AftBlocksLast.push_back(L->getExitingBlock());
258 // Maps Blocks[0] -> Blocks[It]
259 ValueToValueMapTy LastValueMap;
260
261 // Move any instructions from fore phi operands from AftBlocks into Fore.
262 moveHeaderPhiOperandsToForeBlocks(
263 Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
264 AftBlocks);
265
266 // The current on-the-fly SSA update requires blocks to be processed in
267 // reverse postorder so that LastValueMap contains the correct value at each
268 // exit.
269 LoopBlocksDFS DFS(L);
270 DFS.perform(LI);
271 // Stash the DFS iterators before adding blocks to the loop.
272 LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
273 LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
274
275 if (Header->getParent()->isDebugInfoForProfiling())
276 for (BasicBlock *BB : L->getBlocks())
277 for (Instruction &I : *BB)
278 if (!isa<DbgInfoIntrinsic>(&I))
279 if (const DILocation *DIL = I.getDebugLoc())
280 I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));
281
282 // Copy all blocks
283 for (unsigned It = 1; It != Count; ++It) {
284 std::vector<BasicBlock *> NewBlocks;
285 // Maps Blocks[It] -> Blocks[It-1]
286 DenseMap<Value *, Value *> PrevItValueMap;
287
288 for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
289 ValueToValueMapTy VMap;
290 BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
291 Header->getParent()->getBasicBlockList().push_back(New);
292
David Green2557e432018-07-12 10:44:47 +0000293 if (ForeBlocks.count(*BB)) {
David Green963401d2018-07-01 12:47:30 +0000294 L->addBasicBlockToLoop(New, *LI);
295
296 if (*BB == ForeBlocksFirst[0])
297 ForeBlocksFirst.push_back(New);
298 if (*BB == ForeBlocksLast[0])
299 ForeBlocksLast.push_back(New);
David Green2557e432018-07-12 10:44:47 +0000300 } else if (SubLoopBlocks.count(*BB)) {
David Green963401d2018-07-01 12:47:30 +0000301 SubLoop->addBasicBlockToLoop(New, *LI);
302
303 if (*BB == SubLoopBlocksFirst[0])
304 SubLoopBlocksFirst.push_back(New);
305 if (*BB == SubLoopBlocksLast[0])
306 SubLoopBlocksLast.push_back(New);
David Green2557e432018-07-12 10:44:47 +0000307 } else if (AftBlocks.count(*BB)) {
David Green963401d2018-07-01 12:47:30 +0000308 L->addBasicBlockToLoop(New, *LI);
309
310 if (*BB == AftBlocksFirst[0])
311 AftBlocksFirst.push_back(New);
312 if (*BB == AftBlocksLast[0])
313 AftBlocksLast.push_back(New);
314 } else {
315 llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
316 }
317
318 // Update our running maps of newest clones
319 PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
320 LastValueMap[*BB] = New;
321 for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
322 VI != VE; ++VI) {
323 PrevItValueMap[VI->second] =
324 const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
325 LastValueMap[VI->first] = VI->second;
326 }
327
328 NewBlocks.push_back(New);
329
330 // Update DomTree:
331 if (*BB == ForeBlocksFirst[0])
332 DT->addNewBlock(New, ForeBlocksLast[It - 1]);
333 else if (*BB == SubLoopBlocksFirst[0])
334 DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
335 else if (*BB == AftBlocksFirst[0])
336 DT->addNewBlock(New, AftBlocksLast[It - 1]);
337 else {
338 // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
339 // structure.
340 auto BBDomNode = DT->getNode(*BB);
341 auto BBIDom = BBDomNode->getIDom();
342 BasicBlock *OriginalBBIDom = BBIDom->getBlock();
343 assert(OriginalBBIDom);
344 assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
345 DT->addNewBlock(
346 New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
347 }
348 }
349
350 // Remap all instructions in the most recent iteration
351 for (BasicBlock *NewBlock : NewBlocks) {
352 for (Instruction &I : *NewBlock) {
353 ::remapInstruction(&I, LastValueMap);
354 if (auto *II = dyn_cast<IntrinsicInst>(&I))
355 if (II->getIntrinsicID() == Intrinsic::assume)
356 AC->registerAssumption(II);
357 }
358 }
359
360 // Alter the ForeBlocks phi's, pointing them at the latest version of the
361 // value from the previous iteration's phis
362 for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
363 Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
364 assert(OldValue && "should have incoming edge from Aft[It]");
365 Value *NewValue = OldValue;
366 if (Value *PrevValue = PrevItValueMap[OldValue])
367 NewValue = PrevValue;
368
369 assert(Phi.getNumOperands() == 2);
370 Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
371 Phi.setIncomingValue(0, NewValue);
372 Phi.removeIncomingValue(1);
373 }
374 }
375
376 // Now that all the basic blocks for the unrolled iterations are in place,
377 // finish up connecting the blocks and phi nodes. At this point LastValueMap
378 // is the last unrolled iterations values.
379
380 // Update Phis in BB from OldBB to point to NewBB
381 auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
382 BasicBlock *NewBB) {
383 for (PHINode &Phi : BB->phis()) {
384 int I = Phi.getBasicBlockIndex(OldBB);
385 Phi.setIncomingBlock(I, NewBB);
386 }
387 };
388 // Update Phis in BB from OldBB to point to NewBB and use the latest value
389 // from LastValueMap
390 auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
391 BasicBlock *NewBB,
392 ValueToValueMapTy &LastValueMap) {
393 for (PHINode &Phi : BB->phis()) {
394 for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
395 if (Phi.getIncomingBlock(b) == OldBB) {
396 Value *OldValue = Phi.getIncomingValue(b);
397 if (Value *LastValue = LastValueMap[OldValue])
398 Phi.setIncomingValue(b, LastValue);
399 Phi.setIncomingBlock(b, NewBB);
400 break;
401 }
402 }
403 }
404 };
405 // Move all the phis from Src into Dest
406 auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
407 Instruction *insertPoint = Dest->getFirstNonPHI();
408 while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
409 Phi->moveBefore(insertPoint);
410 };
411
412 // Update the PHI values outside the loop to point to the last block
413 updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
414 LastValueMap);
415
416 // Update ForeBlocks successors and phi nodes
417 BranchInst *ForeTerm =
418 cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
419 BasicBlock *Dest = SubLoopBlocksFirst[0];
420 ForeTerm->setSuccessor(0, Dest);
421
422 if (CompletelyUnroll) {
423 while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
424 Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
425 Phi->getParent()->getInstList().erase(Phi);
426 }
427 } else {
428 // Update the PHI values to point to the last aft block
429 updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
430 AftBlocksLast.back(), LastValueMap);
431 }
432
433 for (unsigned It = 1; It != Count; It++) {
434 // Remap ForeBlock successors from previous iteration to this
435 BranchInst *ForeTerm =
436 cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
437 BasicBlock *Dest = ForeBlocksFirst[It];
438 ForeTerm->setSuccessor(0, Dest);
439 }
440
441 // Subloop successors and phis
442 BranchInst *SubTerm =
443 cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
444 SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
445 SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
446 updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
447 ForeBlocksLast.back());
448 updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
449 SubLoopBlocksLast.back());
450
451 for (unsigned It = 1; It != Count; It++) {
452 // Replace the conditional branch of the previous iteration subloop with an
453 // unconditional one to this one
454 BranchInst *SubTerm =
455 cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
456 BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
457 SubTerm->eraseFromParent();
458
459 updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
460 ForeBlocksLast.back());
461 updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
462 SubLoopBlocksLast.back());
463 movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
464 }
465
466 // Aft blocks successors and phis
467 BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
468 if (CompletelyUnroll) {
469 BranchInst::Create(LoopExit, Term);
470 Term->eraseFromParent();
471 } else {
472 Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
473 }
474 updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
475 SubLoopBlocksLast.back());
476
477 for (unsigned It = 1; It != Count; It++) {
478 // Replace the conditional branch of the previous iteration subloop with an
479 // unconditional one to this one
480 BranchInst *AftTerm =
481 cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
482 BranchInst::Create(AftBlocksFirst[It], AftTerm);
483 AftTerm->eraseFromParent();
484
485 updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
486 SubLoopBlocksLast.back());
487 movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
488 }
489
490 // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
491 // new ones required.
492 if (Count != 1) {
493 SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
494 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
495 SubLoopBlocksFirst[0]);
496 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
497 SubLoopBlocksLast[0], AftBlocksFirst[0]);
498
499 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
500 ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
501 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
502 SubLoopBlocksLast.back(), AftBlocksFirst[0]);
503 DT->applyUpdates(DTUpdates);
504 }
505
506 // Merge adjacent basic blocks, if possible.
507 SmallPtrSet<BasicBlock *, 16> MergeBlocks;
508 MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
509 MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
510 MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
511 while (!MergeBlocks.empty()) {
512 BasicBlock *BB = *MergeBlocks.begin();
513 BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
514 if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
515 BasicBlock *Dest = Term->getSuccessor(0);
516 if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
517 // Don't remove BB and add Fold as they are the same BB
518 assert(Fold == BB);
519 (void)Fold;
520 MergeBlocks.erase(Dest);
521 } else
522 MergeBlocks.erase(BB);
523 } else
524 MergeBlocks.erase(BB);
525 }
526
527 // At this point, the code is well formed. We now do a quick sweep over the
528 // inserted code, doing constant propagation and dead code elimination as we
529 // go.
530 simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
531 simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
532
533 NumCompletelyUnrolledAndJammed += CompletelyUnroll;
534 ++NumUnrolledAndJammed;
535
536#ifndef NDEBUG
537 // We shouldn't have done anything to break loop simplify form or LCSSA.
538 Loop *OuterL = L->getParentLoop();
539 Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
540 assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
541 if (!CompletelyUnroll)
542 assert(L->isLoopSimplifyForm());
543 assert(SubLoop->isLoopSimplifyForm());
544 assert(DT->verify());
545#endif
546
547 // Update LoopInfo if the loop is completely removed.
548 if (CompletelyUnroll)
549 LI->erase(L);
550
551 return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
552 : LoopUnrollResult::PartiallyUnrolled;
553}
554
David Green2557e432018-07-12 10:44:47 +0000555static bool getLoadsAndStores(BasicBlockSet &Blocks,
David Green963401d2018-07-01 12:47:30 +0000556 SmallVector<Value *, 4> &MemInstr) {
557 // Scan the BBs and collect legal loads and stores.
558 // Returns false if non-simple loads/stores are found.
559 for (BasicBlock *BB : Blocks) {
560 for (Instruction &I : *BB) {
561 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
562 if (!Ld->isSimple())
563 return false;
564 MemInstr.push_back(&I);
565 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
566 if (!St->isSimple())
567 return false;
568 MemInstr.push_back(&I);
569 } else if (I.mayReadOrWriteMemory()) {
570 return false;
571 }
572 }
573 }
574 return true;
575}
576
577static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
578 SmallVector<Value *, 4> &Later,
579 unsigned LoopDepth, bool InnerLoop,
580 DependenceInfo &DI) {
581 // Use DA to check for dependencies between loads and stores that make unroll
582 // and jam invalid
583 for (Value *I : Earlier) {
584 for (Value *J : Later) {
585 Instruction *Src = cast<Instruction>(I);
586 Instruction *Dst = cast<Instruction>(J);
587 if (Src == Dst)
588 continue;
589 // Ignore Input dependencies.
590 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
591 continue;
592
593 // Track dependencies, and if we find them take a conservative approach
594 // by allowing only = or < (not >), altough some > would be safe
595 // (depending upon unroll width).
596 // For the inner loop, we need to disallow any (> <) dependencies
597 // FIXME: Allow > so long as distance is less than unroll width
598 if (auto D = DI.depends(Src, Dst, true)) {
599 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
600
601 if (D->isConfused())
602 return false;
603 if (!InnerLoop) {
604 if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT)
605 return false;
606 } else {
607 assert(LoopDepth + 1 <= D->getLevels());
608 if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
609 D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT)
610 return false;
611 }
612 }
613 }
614 }
615 return true;
616}
617
David Green2557e432018-07-12 10:44:47 +0000618static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
619 BasicBlockSet &SubLoopBlocks,
620 BasicBlockSet &AftBlocks, DependenceInfo &DI) {
David Green963401d2018-07-01 12:47:30 +0000621 // Get all loads/store pairs for each blocks
622 SmallVector<Value *, 4> ForeMemInstr;
623 SmallVector<Value *, 4> SubLoopMemInstr;
624 SmallVector<Value *, 4> AftMemInstr;
625 if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
626 !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
627 !getLoadsAndStores(AftBlocks, AftMemInstr))
628 return false;
629
630 // Check for dependencies between any blocks that may change order
631 unsigned LoopDepth = L->getLoopDepth();
632 return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
633 DI) &&
634 checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
635 checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
636 DI) &&
637 checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
638 DI);
639}
640
// Returns true if it is legal to apply unroll-and-jam to the given loop nest.
// This performs structural, IV-consistency, exception-safety, instruction-
// movement and memory-dependence checks; profitability is checked elsewhere.
bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
                                DependenceInfo &DI) {
  /* We currently handle outer loops like this:
        |
    ForeFirst    <----\        }
     Blocks           |        } ForeBlocks
    ForeLast          |        }
        |             |
    SubLoopFirst  <\  |        }
     Blocks        |  |        } SubLoopBlocks
    SubLoopLast   -/  |        }
        |             |
    AftFirst          |        }
     Blocks           |        } AftBlocks
    AftLast     ------/        }
        |

    There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
    and AftBlocks, providing that there is one edge from Fores to SubLoops,
    one edge from SubLoops to Afts and a single outer loop exit (from Afts).
    In practice we currently limit Aft blocks to a single block, and limit
    things further in the profitability checks of the unroll and jam pass.

    Because of the way we rearrange basic blocks, we also require that
    the Fore blocks on all unrolled iterations are safe to move before the
    SubLoop blocks of all iterations. So we require that the phi node looping
    operands of ForeHeader can be moved to at least the end of ForeEnd, so that
    we can arrange cloned Fore Blocks before the subloop and match up Phi's
    correctly.

    i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
    It needs to be safe to transform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.

    There are then a number of checks along the lines of no calls, no
    exceptions, inner loop IV is consistent, etc. Note that for loops requiring
    runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
    UnrollAndJamLoop if the trip count cannot be easily calculated.
  */

  // Structural requirements: simplified form, exactly one inner loop.
  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
    return false;
  Loop *SubLoop = L->getSubLoops()[0];
  if (!SubLoop->isLoopSimplifyForm())
    return false;

  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  BasicBlock *Exit = L->getExitingBlock();
  BasicBlock *SubLoopHeader = SubLoop->getHeader();
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();

  // Each loop must exit from its latch.
  if (Latch != Exit)
    return false;
  if (SubLoopLatch != SubLoopExit)
    return false;

  // Blockaddress-taken headers cannot be safely cloned/rewired.
  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken())
    return false;

  // Split blocks into Fore/SubLoop/Aft based on dominators
  BasicBlockSet SubLoopBlocks;
  BasicBlockSet ForeBlocks;
  BasicBlockSet AftBlocks;
  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
                                AftBlocks, &DT))
    return false;

  // Aft blocks may need to move instructions to fore blocks, which becomes more
  // difficult if there are multiple (potentially conditionally executed)
  // blocks. For now we just exclude loops with multiple aft blocks.
  if (AftBlocks.size() != 1)
    return false;

  // Check inner loop IV is consistent between all iterations: its backedge
  // count must be computable and invariant in the outer loop, so the jammed
  // inner loops all run the same number of times.
  const SCEV *SubLoopBECountSC = SE.getExitCount(SubLoop, SubLoopLatch);
  if (isa<SCEVCouldNotCompute>(SubLoopBECountSC) ||
      !SubLoopBECountSC->getType()->isIntegerTy())
    return false;
  ScalarEvolution::LoopDisposition LD =
      SE.getLoopDisposition(SubLoopBECountSC, L);
  if (LD != ScalarEvolution::LoopInvariant)
    return false;

  // Check the loop safety info for exceptions.
  LoopSafetyInfo LSI;
  computeLoopSafetyInfo(&LSI, L);
  if (LSI.MayThrow)
    return false;

  // We've ruled out the easy stuff and now need to check that there are no
  // interdependencies which may prevent us from moving the:
  //   ForeBlocks before Subloop and AftBlocks.
  //   Subloop before AftBlocks.
  //   ForeBlock phi operands before the subloop

  // Make sure we can move all instructions we need to before the subloop
  // (mirrors the walk done by moveHeaderPhiOperandsToForeBlocks).
  SmallVector<Instruction *, 8> Worklist;
  SmallPtrSet<Instruction *, 8> Visited;
  for (auto &Phi : Header->phis()) {
    Value *V = Phi.getIncomingValueForBlock(Latch);
    if (Instruction *I = dyn_cast<Instruction>(V))
      Worklist.push_back(I);
  }
  while (!Worklist.empty()) {
    Instruction *I = Worklist.back();
    Worklist.pop_back();
    if (Visited.insert(I).second) {
      if (SubLoop->contains(I->getParent()))
        return false;
      if (AftBlocks.count(I->getParent())) {
        // If we hit a phi node in afts we know we are done (probably LCSSA)
        if (isa<PHINode>(I))
          return false;
        // Side-effecting or memory-touching instructions cannot be hoisted
        // past the subloop.
        if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
          return false;
        for (auto &U : I->operands())
          if (Instruction *II = dyn_cast<Instruction>(U))
            Worklist.push_back(II);
      }
    }
  }

  // Check for memory dependencies which prohibit the unrolling we are doing.
  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI))
    return false;

  return true;
}