blob: e26762639c13f7199070b4418810bc2230d3822f [file] [log] [blame]
David Green963401d2018-07-01 12:47:30 +00001//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements loop unroll and jam as a routine, much like
11// LoopUnroll.cpp implements loop unroll.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/ADT/SmallPtrSet.h"
16#include "llvm/ADT/Statistic.h"
17#include "llvm/Analysis/AssumptionCache.h"
18#include "llvm/Analysis/DependenceAnalysis.h"
19#include "llvm/Analysis/InstructionSimplify.h"
20#include "llvm/Analysis/LoopAnalysisManager.h"
21#include "llvm/Analysis/LoopIterator.h"
22#include "llvm/Analysis/LoopPass.h"
23#include "llvm/Analysis/OptimizationRemarkEmitter.h"
24#include "llvm/Analysis/ScalarEvolution.h"
25#include "llvm/Analysis/ScalarEvolutionExpander.h"
26#include "llvm/Analysis/Utils/Local.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/DataLayout.h"
29#include "llvm/IR/DebugInfoMetadata.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/IntrinsicInst.h"
32#include "llvm/IR/LLVMContext.h"
33#include "llvm/Support/Debug.h"
34#include "llvm/Support/raw_ostream.h"
35#include "llvm/Transforms/Utils/BasicBlockUtils.h"
36#include "llvm/Transforms/Utils/Cloning.h"
37#include "llvm/Transforms/Utils/LoopSimplify.h"
38#include "llvm/Transforms/Utils/LoopUtils.h"
39#include "llvm/Transforms/Utils/SimplifyIndVar.h"
40#include "llvm/Transforms/Utils/UnrollLoop.h"
41using namespace llvm;
42
43#define DEBUG_TYPE "loop-unroll-and-jam"
44
45STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
46STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
47
David Green2557e432018-07-12 10:44:47 +000048typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
David Green963401d2018-07-01 12:47:30 +000049
50// Partition blocks in an outer/inner loop pair into blocks before and after
51// the loop
52static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
David Green2557e432018-07-12 10:44:47 +000053 BasicBlockSet &ForeBlocks,
54 BasicBlockSet &SubLoopBlocks,
55 BasicBlockSet &AftBlocks,
David Green963401d2018-07-01 12:47:30 +000056 DominatorTree *DT) {
57 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
David Green2557e432018-07-12 10:44:47 +000058 SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
David Green963401d2018-07-01 12:47:30 +000059
60 for (BasicBlock *BB : L->blocks()) {
61 if (!SubLoop->contains(BB)) {
62 if (DT->dominates(SubLoopLatch, BB))
David Green2557e432018-07-12 10:44:47 +000063 AftBlocks.insert(BB);
David Green963401d2018-07-01 12:47:30 +000064 else
David Green2557e432018-07-12 10:44:47 +000065 ForeBlocks.insert(BB);
David Green963401d2018-07-01 12:47:30 +000066 }
67 }
68
69 // Check that all blocks in ForeBlocks together dominate the subloop
70 // TODO: This might ideally be done better with a dominator/postdominators.
71 BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
72 for (BasicBlock *BB : ForeBlocks) {
73 if (BB == SubLoopPreHeader)
74 continue;
Chandler Carruthedb12a82018-10-15 10:04:59 +000075 Instruction *TI = BB->getTerminator();
David Green963401d2018-07-01 12:47:30 +000076 for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
David Green2557e432018-07-12 10:44:47 +000077 if (!ForeBlocks.count(TI->getSuccessor(i)))
David Green963401d2018-07-01 12:47:30 +000078 return false;
79 }
80
81 return true;
82}
83
David Greeneda3c9e2018-07-26 15:19:07 +000084// Looks at the phi nodes in Header for values coming from Latch. For these
85// instructions and all their operands calls Visit on them, keeping going for
86// all the operands in AftBlocks. Returns false if Visit returns false,
87// otherwise returns true. This is used to process the instructions in the
88// Aft blocks that need to be moved before the subloop. It is used in two
89// places. One to check that the required set of instructions can be moved
90// before the loop. Then to collect the instructions to actually move in
91// moveHeaderPhiOperandsToForeBlocks.
92template <typename T>
93static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
94 BasicBlockSet &AftBlocks, T Visit) {
95 SmallVector<Instruction *, 8> Worklist;
David Green963401d2018-07-01 12:47:30 +000096 for (auto &Phi : Header->phis()) {
97 Value *V = Phi.getIncomingValueForBlock(Latch);
98 if (Instruction *I = dyn_cast<Instruction>(V))
99 Worklist.push_back(I);
100 }
101
102 while (!Worklist.empty()) {
103 Instruction *I = Worklist.back();
104 Worklist.pop_back();
David Greeneda3c9e2018-07-26 15:19:07 +0000105 if (!Visit(I))
106 return false;
David Green963401d2018-07-01 12:47:30 +0000107
David Greeneda3c9e2018-07-26 15:19:07 +0000108 if (AftBlocks.count(I->getParent()))
109 for (auto &U : I->operands())
110 if (Instruction *II = dyn_cast<Instruction>(U))
111 Worklist.push_back(II);
David Green963401d2018-07-01 12:47:30 +0000112 }
113
David Greeneda3c9e2018-07-26 15:19:07 +0000114 return true;
115}
116
117// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
118static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
119 BasicBlock *Latch,
120 Instruction *InsertLoc,
121 BasicBlockSet &AftBlocks) {
122 // We need to ensure we move the instructions in the correct order,
123 // starting with the earliest required instruction and moving forward.
124 std::vector<Instruction *> Visited;
125 processHeaderPhiOperands(Header, Latch, AftBlocks,
126 [&Visited, &AftBlocks](Instruction *I) {
127 if (AftBlocks.count(I->getParent()))
128 Visited.push_back(I);
129 return true;
130 });
131
David Green963401d2018-07-01 12:47:30 +0000132 // Move all instructions in program order to before the InsertLoc
133 BasicBlock *InsertLocBB = InsertLoc->getParent();
134 for (Instruction *I : reverse(Visited)) {
135 if (I->getParent() != InsertLocBB)
136 I->moveBefore(InsertLoc);
137 }
138}
139
140/*
141 This method performs Unroll and Jam. For a simple loop like:
142 for (i = ..)
143 Fore(i)
144 for (j = ..)
145 SubLoop(i, j)
146 Aft(i)
147
148 Instead of doing normal inner or outer unrolling, we do:
149 for (i = .., i+=2)
150 Fore(i)
151 Fore(i+1)
152 for (j = ..)
153 SubLoop(i, j)
154 SubLoop(i+1, j)
155 Aft(i)
156 Aft(i+1)
157
158 So the outer loop is essetially unrolled and then the inner loops are fused
159 ("jammed") together into a single loop. This can increase speed when there
160 are loads in SubLoop that are invariant to i, as they become shared between
161 the now jammed inner loops.
162
163 We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
164 Fore blocks are those before the inner loop, Aft are those after. Normal
165 Unroll code is used to copy each of these sets of blocks and the results are
166 combined together into the final form above.
167
168 isSafeToUnrollAndJam should be used prior to calling this to make sure the
169 unrolling will be valid. Checking profitablility is also advisable.
Michael Kruse72448522018-12-12 17:32:52 +0000170
171 If EpilogueLoop is non-null, it receives the epilogue loop (if it was
172 necessary to create one and not fully unrolled).
David Green963401d2018-07-01 12:47:30 +0000173*/
Michael Kruse72448522018-12-12 17:32:52 +0000174LoopUnrollResult llvm::UnrollAndJamLoop(
175 Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
176 bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
177 AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
David Green963401d2018-07-01 12:47:30 +0000178
179 // When we enter here we should have already checked that it is safe
180 BasicBlock *Header = L->getHeader();
181 assert(L->getSubLoops().size() == 1);
182 Loop *SubLoop = *L->begin();
183
184 // Don't enter the unroll code if there is nothing to do.
185 if (TripCount == 0 && Count < 2) {
David Greenbc2e1c32018-08-02 07:30:53 +0000186 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
David Green963401d2018-07-01 12:47:30 +0000187 return LoopUnrollResult::Unmodified;
188 }
189
190 assert(Count > 0);
191 assert(TripMultiple > 0);
192 assert(TripCount == 0 || TripCount % TripMultiple == 0);
193
194 // Are we eliminating the loop control altogether?
195 bool CompletelyUnroll = (Count == TripCount);
196
197 // We use the runtime remainder in cases where we don't know trip multiple
198 if (TripMultiple == 1 || TripMultiple % Count != 0) {
199 if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
200 /*UseEpilogRemainder*/ true,
Michael Kruse72448522018-12-12 17:32:52 +0000201 UnrollRemainder, LI, SE, DT, AC, true,
202 EpilogueLoop)) {
David Green963401d2018-07-01 12:47:30 +0000203 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
204 "generated when assuming runtime trip count\n");
205 return LoopUnrollResult::Unmodified;
206 }
207 }
208
209 // Notify ScalarEvolution that the loop will be substantially changed,
210 // if not outright eliminated.
211 if (SE) {
212 SE->forgetLoop(L);
213 SE->forgetLoop(SubLoop);
214 }
215
216 using namespace ore;
217 // Report the unrolling decision.
218 if (CompletelyUnroll) {
219 LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
220 << Header->getName() << " with trip count " << TripCount
221 << "!\n");
222 ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
223 L->getHeader())
224 << "completely unroll and jammed loop with "
225 << NV("UnrollCount", TripCount) << " iterations");
226 } else {
227 auto DiagBuilder = [&]() {
228 OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
229 L->getHeader());
230 return Diag << "unroll and jammed loop by a factor of "
231 << NV("UnrollCount", Count);
232 };
233
234 LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
235 << " by " << Count);
236 if (TripMultiple != 1) {
237 LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
238 ORE->emit([&]() {
239 return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
240 << " trips per branch";
241 });
242 } else {
243 LLVM_DEBUG(dbgs() << " with run-time trip count");
244 ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
245 }
246 LLVM_DEBUG(dbgs() << "!\n");
247 }
248
249 BasicBlock *Preheader = L->getLoopPreheader();
250 BasicBlock *LatchBlock = L->getLoopLatch();
251 BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
252 assert(Preheader && LatchBlock && Header);
253 assert(BI && !BI->isUnconditional());
254 bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
255 BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
256 bool SubLoopContinueOnTrue = SubLoop->contains(
257 SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
258
259 // Partition blocks in an outer/inner loop pair into blocks before and after
260 // the loop
David Green2557e432018-07-12 10:44:47 +0000261 BasicBlockSet SubLoopBlocks;
262 BasicBlockSet ForeBlocks;
263 BasicBlockSet AftBlocks;
David Green963401d2018-07-01 12:47:30 +0000264 partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
265 DT);
266
267 // We keep track of the entering/first and exiting/last block of each of
268 // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
269 // blocks easier.
270 std::vector<BasicBlock *> ForeBlocksFirst;
271 std::vector<BasicBlock *> ForeBlocksLast;
272 std::vector<BasicBlock *> SubLoopBlocksFirst;
273 std::vector<BasicBlock *> SubLoopBlocksLast;
274 std::vector<BasicBlock *> AftBlocksFirst;
275 std::vector<BasicBlock *> AftBlocksLast;
276 ForeBlocksFirst.push_back(Header);
277 ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
278 SubLoopBlocksFirst.push_back(SubLoop->getHeader());
279 SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
280 AftBlocksFirst.push_back(SubLoop->getExitBlock());
281 AftBlocksLast.push_back(L->getExitingBlock());
282 // Maps Blocks[0] -> Blocks[It]
283 ValueToValueMapTy LastValueMap;
284
285 // Move any instructions from fore phi operands from AftBlocks into Fore.
286 moveHeaderPhiOperandsToForeBlocks(
287 Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
288 AftBlocks);
289
290 // The current on-the-fly SSA update requires blocks to be processed in
291 // reverse postorder so that LastValueMap contains the correct value at each
292 // exit.
293 LoopBlocksDFS DFS(L);
294 DFS.perform(LI);
295 // Stash the DFS iterators before adding blocks to the loop.
296 LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
297 LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
298
299 if (Header->getParent()->isDebugInfoForProfiling())
300 for (BasicBlock *BB : L->getBlocks())
301 for (Instruction &I : *BB)
302 if (!isa<DbgInfoIntrinsic>(&I))
Mircea Trofinb53eeb62018-12-21 22:48:50 +0000303 if (const DILocation *DIL = I.getDebugLoc()) {
304 auto NewDIL = DIL->cloneWithDuplicationFactor(Count);
305 if (NewDIL)
306 I.setDebugLoc(NewDIL.getValue());
307 else
308 LLVM_DEBUG(dbgs()
309 << "Failed to create new discriminator: "
310 << DIL->getFilename() << " Line: " << DIL->getLine());
311 }
David Green963401d2018-07-01 12:47:30 +0000312
313 // Copy all blocks
314 for (unsigned It = 1; It != Count; ++It) {
315 std::vector<BasicBlock *> NewBlocks;
316 // Maps Blocks[It] -> Blocks[It-1]
317 DenseMap<Value *, Value *> PrevItValueMap;
318
319 for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
320 ValueToValueMapTy VMap;
321 BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
322 Header->getParent()->getBasicBlockList().push_back(New);
323
David Green2557e432018-07-12 10:44:47 +0000324 if (ForeBlocks.count(*BB)) {
David Green963401d2018-07-01 12:47:30 +0000325 L->addBasicBlockToLoop(New, *LI);
326
327 if (*BB == ForeBlocksFirst[0])
328 ForeBlocksFirst.push_back(New);
329 if (*BB == ForeBlocksLast[0])
330 ForeBlocksLast.push_back(New);
David Green2557e432018-07-12 10:44:47 +0000331 } else if (SubLoopBlocks.count(*BB)) {
David Green963401d2018-07-01 12:47:30 +0000332 SubLoop->addBasicBlockToLoop(New, *LI);
333
334 if (*BB == SubLoopBlocksFirst[0])
335 SubLoopBlocksFirst.push_back(New);
336 if (*BB == SubLoopBlocksLast[0])
337 SubLoopBlocksLast.push_back(New);
David Green2557e432018-07-12 10:44:47 +0000338 } else if (AftBlocks.count(*BB)) {
David Green963401d2018-07-01 12:47:30 +0000339 L->addBasicBlockToLoop(New, *LI);
340
341 if (*BB == AftBlocksFirst[0])
342 AftBlocksFirst.push_back(New);
343 if (*BB == AftBlocksLast[0])
344 AftBlocksLast.push_back(New);
345 } else {
346 llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
347 }
348
349 // Update our running maps of newest clones
350 PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
351 LastValueMap[*BB] = New;
352 for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
353 VI != VE; ++VI) {
354 PrevItValueMap[VI->second] =
355 const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
356 LastValueMap[VI->first] = VI->second;
357 }
358
359 NewBlocks.push_back(New);
360
361 // Update DomTree:
362 if (*BB == ForeBlocksFirst[0])
363 DT->addNewBlock(New, ForeBlocksLast[It - 1]);
364 else if (*BB == SubLoopBlocksFirst[0])
365 DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
366 else if (*BB == AftBlocksFirst[0])
367 DT->addNewBlock(New, AftBlocksLast[It - 1]);
368 else {
369 // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
370 // structure.
371 auto BBDomNode = DT->getNode(*BB);
372 auto BBIDom = BBDomNode->getIDom();
373 BasicBlock *OriginalBBIDom = BBIDom->getBlock();
374 assert(OriginalBBIDom);
375 assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
376 DT->addNewBlock(
377 New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
378 }
379 }
380
381 // Remap all instructions in the most recent iteration
382 for (BasicBlock *NewBlock : NewBlocks) {
383 for (Instruction &I : *NewBlock) {
384 ::remapInstruction(&I, LastValueMap);
385 if (auto *II = dyn_cast<IntrinsicInst>(&I))
386 if (II->getIntrinsicID() == Intrinsic::assume)
387 AC->registerAssumption(II);
388 }
389 }
390
391 // Alter the ForeBlocks phi's, pointing them at the latest version of the
392 // value from the previous iteration's phis
393 for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
394 Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
395 assert(OldValue && "should have incoming edge from Aft[It]");
396 Value *NewValue = OldValue;
397 if (Value *PrevValue = PrevItValueMap[OldValue])
398 NewValue = PrevValue;
399
400 assert(Phi.getNumOperands() == 2);
401 Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
402 Phi.setIncomingValue(0, NewValue);
403 Phi.removeIncomingValue(1);
404 }
405 }
406
407 // Now that all the basic blocks for the unrolled iterations are in place,
408 // finish up connecting the blocks and phi nodes. At this point LastValueMap
409 // is the last unrolled iterations values.
410
411 // Update Phis in BB from OldBB to point to NewBB
412 auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
413 BasicBlock *NewBB) {
414 for (PHINode &Phi : BB->phis()) {
415 int I = Phi.getBasicBlockIndex(OldBB);
416 Phi.setIncomingBlock(I, NewBB);
417 }
418 };
419 // Update Phis in BB from OldBB to point to NewBB and use the latest value
420 // from LastValueMap
421 auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
422 BasicBlock *NewBB,
423 ValueToValueMapTy &LastValueMap) {
424 for (PHINode &Phi : BB->phis()) {
425 for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
426 if (Phi.getIncomingBlock(b) == OldBB) {
427 Value *OldValue = Phi.getIncomingValue(b);
428 if (Value *LastValue = LastValueMap[OldValue])
429 Phi.setIncomingValue(b, LastValue);
430 Phi.setIncomingBlock(b, NewBB);
431 break;
432 }
433 }
434 }
435 };
436 // Move all the phis from Src into Dest
437 auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
438 Instruction *insertPoint = Dest->getFirstNonPHI();
439 while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
440 Phi->moveBefore(insertPoint);
441 };
442
443 // Update the PHI values outside the loop to point to the last block
444 updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
445 LastValueMap);
446
447 // Update ForeBlocks successors and phi nodes
448 BranchInst *ForeTerm =
449 cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
450 BasicBlock *Dest = SubLoopBlocksFirst[0];
451 ForeTerm->setSuccessor(0, Dest);
452
453 if (CompletelyUnroll) {
454 while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
455 Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
456 Phi->getParent()->getInstList().erase(Phi);
457 }
458 } else {
459 // Update the PHI values to point to the last aft block
460 updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
461 AftBlocksLast.back(), LastValueMap);
462 }
463
464 for (unsigned It = 1; It != Count; It++) {
465 // Remap ForeBlock successors from previous iteration to this
466 BranchInst *ForeTerm =
467 cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
468 BasicBlock *Dest = ForeBlocksFirst[It];
469 ForeTerm->setSuccessor(0, Dest);
470 }
471
472 // Subloop successors and phis
473 BranchInst *SubTerm =
474 cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
475 SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
476 SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
477 updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
478 ForeBlocksLast.back());
479 updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
480 SubLoopBlocksLast.back());
481
482 for (unsigned It = 1; It != Count; It++) {
483 // Replace the conditional branch of the previous iteration subloop with an
484 // unconditional one to this one
485 BranchInst *SubTerm =
486 cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
487 BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
488 SubTerm->eraseFromParent();
489
490 updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
491 ForeBlocksLast.back());
492 updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
493 SubLoopBlocksLast.back());
494 movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
495 }
496
497 // Aft blocks successors and phis
498 BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
499 if (CompletelyUnroll) {
500 BranchInst::Create(LoopExit, Term);
501 Term->eraseFromParent();
502 } else {
503 Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
504 }
505 updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
506 SubLoopBlocksLast.back());
507
508 for (unsigned It = 1; It != Count; It++) {
509 // Replace the conditional branch of the previous iteration subloop with an
510 // unconditional one to this one
511 BranchInst *AftTerm =
512 cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
513 BranchInst::Create(AftBlocksFirst[It], AftTerm);
514 AftTerm->eraseFromParent();
515
516 updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
517 SubLoopBlocksLast.back());
518 movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
519 }
520
521 // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
522 // new ones required.
523 if (Count != 1) {
524 SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
525 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
526 SubLoopBlocksFirst[0]);
527 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
528 SubLoopBlocksLast[0], AftBlocksFirst[0]);
529
530 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
531 ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
532 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
533 SubLoopBlocksLast.back(), AftBlocksFirst[0]);
534 DT->applyUpdates(DTUpdates);
535 }
536
537 // Merge adjacent basic blocks, if possible.
538 SmallPtrSet<BasicBlock *, 16> MergeBlocks;
539 MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
540 MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
541 MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
542 while (!MergeBlocks.empty()) {
543 BasicBlock *BB = *MergeBlocks.begin();
544 BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
545 if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
546 BasicBlock *Dest = Term->getSuccessor(0);
547 if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
548 // Don't remove BB and add Fold as they are the same BB
549 assert(Fold == BB);
550 (void)Fold;
551 MergeBlocks.erase(Dest);
552 } else
553 MergeBlocks.erase(BB);
554 } else
555 MergeBlocks.erase(BB);
556 }
557
558 // At this point, the code is well formed. We now do a quick sweep over the
559 // inserted code, doing constant propagation and dead code elimination as we
560 // go.
561 simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
562 simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
563
564 NumCompletelyUnrolledAndJammed += CompletelyUnroll;
565 ++NumUnrolledAndJammed;
566
567#ifndef NDEBUG
568 // We shouldn't have done anything to break loop simplify form or LCSSA.
569 Loop *OuterL = L->getParentLoop();
570 Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
571 assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
572 if (!CompletelyUnroll)
573 assert(L->isLoopSimplifyForm());
574 assert(SubLoop->isLoopSimplifyForm());
575 assert(DT->verify());
576#endif
577
578 // Update LoopInfo if the loop is completely removed.
579 if (CompletelyUnroll)
580 LI->erase(L);
581
582 return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
583 : LoopUnrollResult::PartiallyUnrolled;
584}
585
David Green2557e432018-07-12 10:44:47 +0000586static bool getLoadsAndStores(BasicBlockSet &Blocks,
David Green963401d2018-07-01 12:47:30 +0000587 SmallVector<Value *, 4> &MemInstr) {
588 // Scan the BBs and collect legal loads and stores.
589 // Returns false if non-simple loads/stores are found.
590 for (BasicBlock *BB : Blocks) {
591 for (Instruction &I : *BB) {
592 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
593 if (!Ld->isSimple())
594 return false;
595 MemInstr.push_back(&I);
596 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
597 if (!St->isSimple())
598 return false;
599 MemInstr.push_back(&I);
600 } else if (I.mayReadOrWriteMemory()) {
601 return false;
602 }
603 }
604 }
605 return true;
606}
607
608static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
609 SmallVector<Value *, 4> &Later,
610 unsigned LoopDepth, bool InnerLoop,
611 DependenceInfo &DI) {
612 // Use DA to check for dependencies between loads and stores that make unroll
613 // and jam invalid
614 for (Value *I : Earlier) {
615 for (Value *J : Later) {
616 Instruction *Src = cast<Instruction>(I);
617 Instruction *Dst = cast<Instruction>(J);
618 if (Src == Dst)
619 continue;
620 // Ignore Input dependencies.
621 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
622 continue;
623
624 // Track dependencies, and if we find them take a conservative approach
625 // by allowing only = or < (not >), altough some > would be safe
626 // (depending upon unroll width).
627 // For the inner loop, we need to disallow any (> <) dependencies
628 // FIXME: Allow > so long as distance is less than unroll width
629 if (auto D = DI.depends(Src, Dst, true)) {
630 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
631
David Greenbc2e1c32018-08-02 07:30:53 +0000632 if (D->isConfused()) {
633 LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
634 << " " << *Src << "\n"
635 << " " << *Dst << "\n");
David Green963401d2018-07-01 12:47:30 +0000636 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000637 }
David Green963401d2018-07-01 12:47:30 +0000638 if (!InnerLoop) {
David Greenbc2e1c32018-08-02 07:30:53 +0000639 if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
640 LLVM_DEBUG(dbgs() << " > dependency between:\n"
641 << " " << *Src << "\n"
642 << " " << *Dst << "\n");
David Green963401d2018-07-01 12:47:30 +0000643 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000644 }
David Green963401d2018-07-01 12:47:30 +0000645 } else {
646 assert(LoopDepth + 1 <= D->getLevels());
647 if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
David Greenbc2e1c32018-08-02 07:30:53 +0000648 D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) {
649 LLVM_DEBUG(dbgs() << " < > dependency between:\n"
650 << " " << *Src << "\n"
651 << " " << *Dst << "\n");
David Green963401d2018-07-01 12:47:30 +0000652 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000653 }
David Green963401d2018-07-01 12:47:30 +0000654 }
655 }
656 }
657 }
658 return true;
659}
660
David Green2557e432018-07-12 10:44:47 +0000661static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
662 BasicBlockSet &SubLoopBlocks,
663 BasicBlockSet &AftBlocks, DependenceInfo &DI) {
David Green963401d2018-07-01 12:47:30 +0000664 // Get all loads/store pairs for each blocks
665 SmallVector<Value *, 4> ForeMemInstr;
666 SmallVector<Value *, 4> SubLoopMemInstr;
667 SmallVector<Value *, 4> AftMemInstr;
668 if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
669 !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
670 !getLoadsAndStores(AftBlocks, AftMemInstr))
671 return false;
672
673 // Check for dependencies between any blocks that may change order
674 unsigned LoopDepth = L->getLoopDepth();
675 return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
676 DI) &&
677 checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
678 checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
679 DI) &&
680 checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
681 DI);
682}
683
684bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
685 DependenceInfo &DI) {
686 /* We currently handle outer loops like this:
687 |
688 ForeFirst <----\ }
689 Blocks | } ForeBlocks
690 ForeLast | }
691 | |
692 SubLoopFirst <\ | }
693 Blocks | | } SubLoopBlocks
694 SubLoopLast -/ | }
695 | |
696 AftFirst | }
697 Blocks | } AftBlocks
698 AftLast ------/ }
699 |
700
701 There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
702 and AftBlocks, providing that there is one edge from Fores to SubLoops,
703 one edge from SubLoops to Afts and a single outer loop exit (from Afts).
704 In practice we currently limit Aft blocks to a single block, and limit
705 things further in the profitablility checks of the unroll and jam pass.
706
707 Because of the way we rearrange basic blocks, we also require that
708 the Fore blocks on all unrolled iterations are safe to move before the
709 SubLoop blocks of all iterations. So we require that the phi node looping
710 operands of ForeHeader can be moved to at least the end of ForeEnd, so that
711 we can arrange cloned Fore Blocks before the subloop and match up Phi's
712 correctly.
713
714 i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
715 It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
716
717 There are then a number of checks along the lines of no calls, no
718 exceptions, inner loop IV is consistent, etc. Note that for loops requiring
719 runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
720 UnrollAndJamLoop if the trip count cannot be easily calculated.
721 */
722
723 if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
724 return false;
725 Loop *SubLoop = L->getSubLoops()[0];
726 if (!SubLoop->isLoopSimplifyForm())
727 return false;
728
729 BasicBlock *Header = L->getHeader();
730 BasicBlock *Latch = L->getLoopLatch();
731 BasicBlock *Exit = L->getExitingBlock();
732 BasicBlock *SubLoopHeader = SubLoop->getHeader();
733 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
734 BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
735
736 if (Latch != Exit)
737 return false;
738 if (SubLoopLatch != SubLoopExit)
739 return false;
740
David Greenbc2e1c32018-08-02 07:30:53 +0000741 if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
742 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
David Green963401d2018-07-01 12:47:30 +0000743 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000744 }
David Green963401d2018-07-01 12:47:30 +0000745
746 // Split blocks into Fore/SubLoop/Aft based on dominators
David Green2557e432018-07-12 10:44:47 +0000747 BasicBlockSet SubLoopBlocks;
748 BasicBlockSet ForeBlocks;
749 BasicBlockSet AftBlocks;
David Green963401d2018-07-01 12:47:30 +0000750 if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
David Greenbc2e1c32018-08-02 07:30:53 +0000751 AftBlocks, &DT)) {
752 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
David Green963401d2018-07-01 12:47:30 +0000753 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000754 }
David Green963401d2018-07-01 12:47:30 +0000755
756 // Aft blocks may need to move instructions to fore blocks, which becomes more
757 // difficult if there are multiple (potentially conditionally executed)
758 // blocks. For now we just exclude loops with multiple aft blocks.
David Greenbc2e1c32018-08-02 07:30:53 +0000759 if (AftBlocks.size() != 1) {
760 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
761 "multiple blocks after the loop\n");
David Green963401d2018-07-01 12:47:30 +0000762 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000763 }
David Green963401d2018-07-01 12:47:30 +0000764
David Greenbc2e1c32018-08-02 07:30:53 +0000765 // Check inner loop backedge count is consistent on all iterations of the
766 // outer loop
David Green6cb64782018-08-15 10:59:41 +0000767 if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
David Greenbc2e1c32018-08-02 07:30:53 +0000768 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
769 "not consistent on each iteration\n");
David Green963401d2018-07-01 12:47:30 +0000770 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000771 }
David Green963401d2018-07-01 12:47:30 +0000772
773 // Check the loop safety info for exceptions.
Max Kazantsev9c90ec22018-10-16 08:31:05 +0000774 SimpleLoopSafetyInfo LSI;
Max Kazantsev530b8d12018-08-15 05:55:43 +0000775 LSI.computeLoopSafetyInfo(L);
776 if (LSI.anyBlockMayThrow()) {
David Greenbc2e1c32018-08-02 07:30:53 +0000777 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
David Green963401d2018-07-01 12:47:30 +0000778 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000779 }
David Green963401d2018-07-01 12:47:30 +0000780
781 // We've ruled out the easy stuff and now need to check that there are no
782 // interdependencies which may prevent us from moving the:
783 // ForeBlocks before Subloop and AftBlocks.
784 // Subloop before AftBlocks.
785 // ForeBlock phi operands before the subloop
786
787 // Make sure we can move all instructions we need to before the subloop
David Greeneda3c9e2018-07-26 15:19:07 +0000788 if (!processHeaderPhiOperands(
789 Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
790 if (SubLoop->contains(I->getParent()))
791 return false;
792 if (AftBlocks.count(I->getParent())) {
793 // If we hit a phi node in afts we know we are done (probably
794 // LCSSA)
795 if (isa<PHINode>(I))
796 return false;
797 // Can't move instructions with side effects or memory
798 // reads/writes
799 if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
800 return false;
801 }
802 // Keep going
803 return true;
David Greenbc2e1c32018-08-02 07:30:53 +0000804 })) {
805 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
806 "instructions after subloop to before it\n");
David Greeneda3c9e2018-07-26 15:19:07 +0000807 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000808 }
David Green963401d2018-07-01 12:47:30 +0000809
810 // Check for memory dependencies which prohibit the unrolling we are doing.
811 // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
812 // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
David Greenbc2e1c32018-08-02 07:30:53 +0000813 if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
814 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
David Green963401d2018-07-01 12:47:30 +0000815 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000816 }
David Green963401d2018-07-01 12:47:30 +0000817
818 return true;
819}