blob: b5d80f669fbf669eae07d12f0d3fb598f105b467 [file] [log] [blame]
David Green963401d2018-07-01 12:47:30 +00001//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements loop unroll and jam as a routine, much like
11// LoopUnroll.cpp implements loop unroll.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/ADT/SmallPtrSet.h"
16#include "llvm/ADT/Statistic.h"
17#include "llvm/Analysis/AssumptionCache.h"
18#include "llvm/Analysis/DependenceAnalysis.h"
19#include "llvm/Analysis/InstructionSimplify.h"
20#include "llvm/Analysis/LoopAnalysisManager.h"
21#include "llvm/Analysis/LoopIterator.h"
22#include "llvm/Analysis/LoopPass.h"
23#include "llvm/Analysis/OptimizationRemarkEmitter.h"
24#include "llvm/Analysis/ScalarEvolution.h"
25#include "llvm/Analysis/ScalarEvolutionExpander.h"
26#include "llvm/Analysis/Utils/Local.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/DataLayout.h"
29#include "llvm/IR/DebugInfoMetadata.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/IntrinsicInst.h"
32#include "llvm/IR/LLVMContext.h"
33#include "llvm/Support/Debug.h"
34#include "llvm/Support/raw_ostream.h"
35#include "llvm/Transforms/Utils/BasicBlockUtils.h"
36#include "llvm/Transforms/Utils/Cloning.h"
37#include "llvm/Transforms/Utils/LoopSimplify.h"
38#include "llvm/Transforms/Utils/LoopUtils.h"
39#include "llvm/Transforms/Utils/SimplifyIndVar.h"
40#include "llvm/Transforms/Utils/UnrollLoop.h"
41using namespace llvm;
42
43#define DEBUG_TYPE "loop-unroll-and-jam"
44
45STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
46STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
47
David Green2557e432018-07-12 10:44:47 +000048typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
David Green963401d2018-07-01 12:47:30 +000049
50// Partition blocks in an outer/inner loop pair into blocks before and after
51// the loop
52static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
David Green2557e432018-07-12 10:44:47 +000053 BasicBlockSet &ForeBlocks,
54 BasicBlockSet &SubLoopBlocks,
55 BasicBlockSet &AftBlocks,
David Green963401d2018-07-01 12:47:30 +000056 DominatorTree *DT) {
57 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
David Green2557e432018-07-12 10:44:47 +000058 SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
David Green963401d2018-07-01 12:47:30 +000059
60 for (BasicBlock *BB : L->blocks()) {
61 if (!SubLoop->contains(BB)) {
62 if (DT->dominates(SubLoopLatch, BB))
David Green2557e432018-07-12 10:44:47 +000063 AftBlocks.insert(BB);
David Green963401d2018-07-01 12:47:30 +000064 else
David Green2557e432018-07-12 10:44:47 +000065 ForeBlocks.insert(BB);
David Green963401d2018-07-01 12:47:30 +000066 }
67 }
68
69 // Check that all blocks in ForeBlocks together dominate the subloop
70 // TODO: This might ideally be done better with a dominator/postdominators.
71 BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
72 for (BasicBlock *BB : ForeBlocks) {
73 if (BB == SubLoopPreHeader)
74 continue;
Chandler Carruthedb12a82018-10-15 10:04:59 +000075 Instruction *TI = BB->getTerminator();
David Green963401d2018-07-01 12:47:30 +000076 for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
David Green2557e432018-07-12 10:44:47 +000077 if (!ForeBlocks.count(TI->getSuccessor(i)))
David Green963401d2018-07-01 12:47:30 +000078 return false;
79 }
80
81 return true;
82}
83
David Greeneda3c9e2018-07-26 15:19:07 +000084// Looks at the phi nodes in Header for values coming from Latch. For these
85// instructions and all their operands calls Visit on them, keeping going for
86// all the operands in AftBlocks. Returns false if Visit returns false,
87// otherwise returns true. This is used to process the instructions in the
88// Aft blocks that need to be moved before the subloop. It is used in two
89// places. One to check that the required set of instructions can be moved
90// before the loop. Then to collect the instructions to actually move in
91// moveHeaderPhiOperandsToForeBlocks.
92template <typename T>
93static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
94 BasicBlockSet &AftBlocks, T Visit) {
95 SmallVector<Instruction *, 8> Worklist;
David Green963401d2018-07-01 12:47:30 +000096 for (auto &Phi : Header->phis()) {
97 Value *V = Phi.getIncomingValueForBlock(Latch);
98 if (Instruction *I = dyn_cast<Instruction>(V))
99 Worklist.push_back(I);
100 }
101
102 while (!Worklist.empty()) {
103 Instruction *I = Worklist.back();
104 Worklist.pop_back();
David Greeneda3c9e2018-07-26 15:19:07 +0000105 if (!Visit(I))
106 return false;
David Green963401d2018-07-01 12:47:30 +0000107
David Greeneda3c9e2018-07-26 15:19:07 +0000108 if (AftBlocks.count(I->getParent()))
109 for (auto &U : I->operands())
110 if (Instruction *II = dyn_cast<Instruction>(U))
111 Worklist.push_back(II);
David Green963401d2018-07-01 12:47:30 +0000112 }
113
David Greeneda3c9e2018-07-26 15:19:07 +0000114 return true;
115}
116
117// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
118static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
119 BasicBlock *Latch,
120 Instruction *InsertLoc,
121 BasicBlockSet &AftBlocks) {
122 // We need to ensure we move the instructions in the correct order,
123 // starting with the earliest required instruction and moving forward.
124 std::vector<Instruction *> Visited;
125 processHeaderPhiOperands(Header, Latch, AftBlocks,
126 [&Visited, &AftBlocks](Instruction *I) {
127 if (AftBlocks.count(I->getParent()))
128 Visited.push_back(I);
129 return true;
130 });
131
David Green963401d2018-07-01 12:47:30 +0000132 // Move all instructions in program order to before the InsertLoc
133 BasicBlock *InsertLocBB = InsertLoc->getParent();
134 for (Instruction *I : reverse(Visited)) {
135 if (I->getParent() != InsertLocBB)
136 I->moveBefore(InsertLoc);
137 }
138}
139
140/*
141 This method performs Unroll and Jam. For a simple loop like:
142 for (i = ..)
143 Fore(i)
144 for (j = ..)
145 SubLoop(i, j)
146 Aft(i)
147
148 Instead of doing normal inner or outer unrolling, we do:
149 for (i = .., i+=2)
150 Fore(i)
151 Fore(i+1)
152 for (j = ..)
153 SubLoop(i, j)
154 SubLoop(i+1, j)
155 Aft(i)
156 Aft(i+1)
157
158 So the outer loop is essetially unrolled and then the inner loops are fused
159 ("jammed") together into a single loop. This can increase speed when there
160 are loads in SubLoop that are invariant to i, as they become shared between
161 the now jammed inner loops.
162
163 We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
164 Fore blocks are those before the inner loop, Aft are those after. Normal
165 Unroll code is used to copy each of these sets of blocks and the results are
166 combined together into the final form above.
167
168 isSafeToUnrollAndJam should be used prior to calling this to make sure the
169 unrolling will be valid. Checking profitablility is also advisable.
Michael Kruse72448522018-12-12 17:32:52 +0000170
171 If EpilogueLoop is non-null, it receives the epilogue loop (if it was
172 necessary to create one and not fully unrolled).
David Green963401d2018-07-01 12:47:30 +0000173*/
Michael Kruse72448522018-12-12 17:32:52 +0000174LoopUnrollResult llvm::UnrollAndJamLoop(
175 Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
176 bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
177 AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
David Green963401d2018-07-01 12:47:30 +0000178
179 // When we enter here we should have already checked that it is safe
180 BasicBlock *Header = L->getHeader();
181 assert(L->getSubLoops().size() == 1);
182 Loop *SubLoop = *L->begin();
183
184 // Don't enter the unroll code if there is nothing to do.
185 if (TripCount == 0 && Count < 2) {
David Greenbc2e1c32018-08-02 07:30:53 +0000186 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
David Green963401d2018-07-01 12:47:30 +0000187 return LoopUnrollResult::Unmodified;
188 }
189
190 assert(Count > 0);
191 assert(TripMultiple > 0);
192 assert(TripCount == 0 || TripCount % TripMultiple == 0);
193
194 // Are we eliminating the loop control altogether?
195 bool CompletelyUnroll = (Count == TripCount);
196
197 // We use the runtime remainder in cases where we don't know trip multiple
198 if (TripMultiple == 1 || TripMultiple % Count != 0) {
199 if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
200 /*UseEpilogRemainder*/ true,
Michael Kruse72448522018-12-12 17:32:52 +0000201 UnrollRemainder, LI, SE, DT, AC, true,
202 EpilogueLoop)) {
David Green963401d2018-07-01 12:47:30 +0000203 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
204 "generated when assuming runtime trip count\n");
205 return LoopUnrollResult::Unmodified;
206 }
207 }
208
209 // Notify ScalarEvolution that the loop will be substantially changed,
210 // if not outright eliminated.
211 if (SE) {
212 SE->forgetLoop(L);
213 SE->forgetLoop(SubLoop);
214 }
215
216 using namespace ore;
217 // Report the unrolling decision.
218 if (CompletelyUnroll) {
219 LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
220 << Header->getName() << " with trip count " << TripCount
221 << "!\n");
222 ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
223 L->getHeader())
224 << "completely unroll and jammed loop with "
225 << NV("UnrollCount", TripCount) << " iterations");
226 } else {
227 auto DiagBuilder = [&]() {
228 OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
229 L->getHeader());
230 return Diag << "unroll and jammed loop by a factor of "
231 << NV("UnrollCount", Count);
232 };
233
234 LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
235 << " by " << Count);
236 if (TripMultiple != 1) {
237 LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
238 ORE->emit([&]() {
239 return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
240 << " trips per branch";
241 });
242 } else {
243 LLVM_DEBUG(dbgs() << " with run-time trip count");
244 ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
245 }
246 LLVM_DEBUG(dbgs() << "!\n");
247 }
248
249 BasicBlock *Preheader = L->getLoopPreheader();
250 BasicBlock *LatchBlock = L->getLoopLatch();
251 BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
252 assert(Preheader && LatchBlock && Header);
253 assert(BI && !BI->isUnconditional());
254 bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
255 BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
256 bool SubLoopContinueOnTrue = SubLoop->contains(
257 SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
258
259 // Partition blocks in an outer/inner loop pair into blocks before and after
260 // the loop
David Green2557e432018-07-12 10:44:47 +0000261 BasicBlockSet SubLoopBlocks;
262 BasicBlockSet ForeBlocks;
263 BasicBlockSet AftBlocks;
David Green963401d2018-07-01 12:47:30 +0000264 partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
265 DT);
266
267 // We keep track of the entering/first and exiting/last block of each of
268 // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
269 // blocks easier.
270 std::vector<BasicBlock *> ForeBlocksFirst;
271 std::vector<BasicBlock *> ForeBlocksLast;
272 std::vector<BasicBlock *> SubLoopBlocksFirst;
273 std::vector<BasicBlock *> SubLoopBlocksLast;
274 std::vector<BasicBlock *> AftBlocksFirst;
275 std::vector<BasicBlock *> AftBlocksLast;
276 ForeBlocksFirst.push_back(Header);
277 ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
278 SubLoopBlocksFirst.push_back(SubLoop->getHeader());
279 SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
280 AftBlocksFirst.push_back(SubLoop->getExitBlock());
281 AftBlocksLast.push_back(L->getExitingBlock());
282 // Maps Blocks[0] -> Blocks[It]
283 ValueToValueMapTy LastValueMap;
284
285 // Move any instructions from fore phi operands from AftBlocks into Fore.
286 moveHeaderPhiOperandsToForeBlocks(
287 Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
288 AftBlocks);
289
290 // The current on-the-fly SSA update requires blocks to be processed in
291 // reverse postorder so that LastValueMap contains the correct value at each
292 // exit.
293 LoopBlocksDFS DFS(L);
294 DFS.perform(LI);
295 // Stash the DFS iterators before adding blocks to the loop.
296 LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
297 LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
298
299 if (Header->getParent()->isDebugInfoForProfiling())
300 for (BasicBlock *BB : L->getBlocks())
301 for (Instruction &I : *BB)
302 if (!isa<DbgInfoIntrinsic>(&I))
303 if (const DILocation *DIL = I.getDebugLoc())
304 I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));
305
306 // Copy all blocks
307 for (unsigned It = 1; It != Count; ++It) {
308 std::vector<BasicBlock *> NewBlocks;
309 // Maps Blocks[It] -> Blocks[It-1]
310 DenseMap<Value *, Value *> PrevItValueMap;
311
312 for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
313 ValueToValueMapTy VMap;
314 BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
315 Header->getParent()->getBasicBlockList().push_back(New);
316
David Green2557e432018-07-12 10:44:47 +0000317 if (ForeBlocks.count(*BB)) {
David Green963401d2018-07-01 12:47:30 +0000318 L->addBasicBlockToLoop(New, *LI);
319
320 if (*BB == ForeBlocksFirst[0])
321 ForeBlocksFirst.push_back(New);
322 if (*BB == ForeBlocksLast[0])
323 ForeBlocksLast.push_back(New);
David Green2557e432018-07-12 10:44:47 +0000324 } else if (SubLoopBlocks.count(*BB)) {
David Green963401d2018-07-01 12:47:30 +0000325 SubLoop->addBasicBlockToLoop(New, *LI);
326
327 if (*BB == SubLoopBlocksFirst[0])
328 SubLoopBlocksFirst.push_back(New);
329 if (*BB == SubLoopBlocksLast[0])
330 SubLoopBlocksLast.push_back(New);
David Green2557e432018-07-12 10:44:47 +0000331 } else if (AftBlocks.count(*BB)) {
David Green963401d2018-07-01 12:47:30 +0000332 L->addBasicBlockToLoop(New, *LI);
333
334 if (*BB == AftBlocksFirst[0])
335 AftBlocksFirst.push_back(New);
336 if (*BB == AftBlocksLast[0])
337 AftBlocksLast.push_back(New);
338 } else {
339 llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
340 }
341
342 // Update our running maps of newest clones
343 PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
344 LastValueMap[*BB] = New;
345 for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
346 VI != VE; ++VI) {
347 PrevItValueMap[VI->second] =
348 const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
349 LastValueMap[VI->first] = VI->second;
350 }
351
352 NewBlocks.push_back(New);
353
354 // Update DomTree:
355 if (*BB == ForeBlocksFirst[0])
356 DT->addNewBlock(New, ForeBlocksLast[It - 1]);
357 else if (*BB == SubLoopBlocksFirst[0])
358 DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
359 else if (*BB == AftBlocksFirst[0])
360 DT->addNewBlock(New, AftBlocksLast[It - 1]);
361 else {
362 // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
363 // structure.
364 auto BBDomNode = DT->getNode(*BB);
365 auto BBIDom = BBDomNode->getIDom();
366 BasicBlock *OriginalBBIDom = BBIDom->getBlock();
367 assert(OriginalBBIDom);
368 assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
369 DT->addNewBlock(
370 New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
371 }
372 }
373
374 // Remap all instructions in the most recent iteration
375 for (BasicBlock *NewBlock : NewBlocks) {
376 for (Instruction &I : *NewBlock) {
377 ::remapInstruction(&I, LastValueMap);
378 if (auto *II = dyn_cast<IntrinsicInst>(&I))
379 if (II->getIntrinsicID() == Intrinsic::assume)
380 AC->registerAssumption(II);
381 }
382 }
383
384 // Alter the ForeBlocks phi's, pointing them at the latest version of the
385 // value from the previous iteration's phis
386 for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
387 Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
388 assert(OldValue && "should have incoming edge from Aft[It]");
389 Value *NewValue = OldValue;
390 if (Value *PrevValue = PrevItValueMap[OldValue])
391 NewValue = PrevValue;
392
393 assert(Phi.getNumOperands() == 2);
394 Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
395 Phi.setIncomingValue(0, NewValue);
396 Phi.removeIncomingValue(1);
397 }
398 }
399
400 // Now that all the basic blocks for the unrolled iterations are in place,
401 // finish up connecting the blocks and phi nodes. At this point LastValueMap
402 // is the last unrolled iterations values.
403
404 // Update Phis in BB from OldBB to point to NewBB
405 auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
406 BasicBlock *NewBB) {
407 for (PHINode &Phi : BB->phis()) {
408 int I = Phi.getBasicBlockIndex(OldBB);
409 Phi.setIncomingBlock(I, NewBB);
410 }
411 };
412 // Update Phis in BB from OldBB to point to NewBB and use the latest value
413 // from LastValueMap
414 auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
415 BasicBlock *NewBB,
416 ValueToValueMapTy &LastValueMap) {
417 for (PHINode &Phi : BB->phis()) {
418 for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
419 if (Phi.getIncomingBlock(b) == OldBB) {
420 Value *OldValue = Phi.getIncomingValue(b);
421 if (Value *LastValue = LastValueMap[OldValue])
422 Phi.setIncomingValue(b, LastValue);
423 Phi.setIncomingBlock(b, NewBB);
424 break;
425 }
426 }
427 }
428 };
429 // Move all the phis from Src into Dest
430 auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
431 Instruction *insertPoint = Dest->getFirstNonPHI();
432 while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
433 Phi->moveBefore(insertPoint);
434 };
435
436 // Update the PHI values outside the loop to point to the last block
437 updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
438 LastValueMap);
439
440 // Update ForeBlocks successors and phi nodes
441 BranchInst *ForeTerm =
442 cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
443 BasicBlock *Dest = SubLoopBlocksFirst[0];
444 ForeTerm->setSuccessor(0, Dest);
445
446 if (CompletelyUnroll) {
447 while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
448 Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
449 Phi->getParent()->getInstList().erase(Phi);
450 }
451 } else {
452 // Update the PHI values to point to the last aft block
453 updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
454 AftBlocksLast.back(), LastValueMap);
455 }
456
457 for (unsigned It = 1; It != Count; It++) {
458 // Remap ForeBlock successors from previous iteration to this
459 BranchInst *ForeTerm =
460 cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
461 BasicBlock *Dest = ForeBlocksFirst[It];
462 ForeTerm->setSuccessor(0, Dest);
463 }
464
465 // Subloop successors and phis
466 BranchInst *SubTerm =
467 cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
468 SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
469 SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
470 updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
471 ForeBlocksLast.back());
472 updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
473 SubLoopBlocksLast.back());
474
475 for (unsigned It = 1; It != Count; It++) {
476 // Replace the conditional branch of the previous iteration subloop with an
477 // unconditional one to this one
478 BranchInst *SubTerm =
479 cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
480 BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
481 SubTerm->eraseFromParent();
482
483 updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
484 ForeBlocksLast.back());
485 updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
486 SubLoopBlocksLast.back());
487 movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
488 }
489
490 // Aft blocks successors and phis
491 BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
492 if (CompletelyUnroll) {
493 BranchInst::Create(LoopExit, Term);
494 Term->eraseFromParent();
495 } else {
496 Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
497 }
498 updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
499 SubLoopBlocksLast.back());
500
501 for (unsigned It = 1; It != Count; It++) {
502 // Replace the conditional branch of the previous iteration subloop with an
503 // unconditional one to this one
504 BranchInst *AftTerm =
505 cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
506 BranchInst::Create(AftBlocksFirst[It], AftTerm);
507 AftTerm->eraseFromParent();
508
509 updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
510 SubLoopBlocksLast.back());
511 movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
512 }
513
514 // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
515 // new ones required.
516 if (Count != 1) {
517 SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
518 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
519 SubLoopBlocksFirst[0]);
520 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
521 SubLoopBlocksLast[0], AftBlocksFirst[0]);
522
523 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
524 ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
525 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
526 SubLoopBlocksLast.back(), AftBlocksFirst[0]);
527 DT->applyUpdates(DTUpdates);
528 }
529
530 // Merge adjacent basic blocks, if possible.
531 SmallPtrSet<BasicBlock *, 16> MergeBlocks;
532 MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
533 MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
534 MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
535 while (!MergeBlocks.empty()) {
536 BasicBlock *BB = *MergeBlocks.begin();
537 BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
538 if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
539 BasicBlock *Dest = Term->getSuccessor(0);
540 if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
541 // Don't remove BB and add Fold as they are the same BB
542 assert(Fold == BB);
543 (void)Fold;
544 MergeBlocks.erase(Dest);
545 } else
546 MergeBlocks.erase(BB);
547 } else
548 MergeBlocks.erase(BB);
549 }
550
551 // At this point, the code is well formed. We now do a quick sweep over the
552 // inserted code, doing constant propagation and dead code elimination as we
553 // go.
554 simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
555 simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
556
557 NumCompletelyUnrolledAndJammed += CompletelyUnroll;
558 ++NumUnrolledAndJammed;
559
560#ifndef NDEBUG
561 // We shouldn't have done anything to break loop simplify form or LCSSA.
562 Loop *OuterL = L->getParentLoop();
563 Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
564 assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
565 if (!CompletelyUnroll)
566 assert(L->isLoopSimplifyForm());
567 assert(SubLoop->isLoopSimplifyForm());
568 assert(DT->verify());
569#endif
570
571 // Update LoopInfo if the loop is completely removed.
572 if (CompletelyUnroll)
573 LI->erase(L);
574
575 return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
576 : LoopUnrollResult::PartiallyUnrolled;
577}
578
David Green2557e432018-07-12 10:44:47 +0000579static bool getLoadsAndStores(BasicBlockSet &Blocks,
David Green963401d2018-07-01 12:47:30 +0000580 SmallVector<Value *, 4> &MemInstr) {
581 // Scan the BBs and collect legal loads and stores.
582 // Returns false if non-simple loads/stores are found.
583 for (BasicBlock *BB : Blocks) {
584 for (Instruction &I : *BB) {
585 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
586 if (!Ld->isSimple())
587 return false;
588 MemInstr.push_back(&I);
589 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
590 if (!St->isSimple())
591 return false;
592 MemInstr.push_back(&I);
593 } else if (I.mayReadOrWriteMemory()) {
594 return false;
595 }
596 }
597 }
598 return true;
599}
600
601static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
602 SmallVector<Value *, 4> &Later,
603 unsigned LoopDepth, bool InnerLoop,
604 DependenceInfo &DI) {
605 // Use DA to check for dependencies between loads and stores that make unroll
606 // and jam invalid
607 for (Value *I : Earlier) {
608 for (Value *J : Later) {
609 Instruction *Src = cast<Instruction>(I);
610 Instruction *Dst = cast<Instruction>(J);
611 if (Src == Dst)
612 continue;
613 // Ignore Input dependencies.
614 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
615 continue;
616
617 // Track dependencies, and if we find them take a conservative approach
618 // by allowing only = or < (not >), altough some > would be safe
619 // (depending upon unroll width).
620 // For the inner loop, we need to disallow any (> <) dependencies
621 // FIXME: Allow > so long as distance is less than unroll width
622 if (auto D = DI.depends(Src, Dst, true)) {
623 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
624
David Greenbc2e1c32018-08-02 07:30:53 +0000625 if (D->isConfused()) {
626 LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
627 << " " << *Src << "\n"
628 << " " << *Dst << "\n");
David Green963401d2018-07-01 12:47:30 +0000629 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000630 }
David Green963401d2018-07-01 12:47:30 +0000631 if (!InnerLoop) {
David Greenbc2e1c32018-08-02 07:30:53 +0000632 if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
633 LLVM_DEBUG(dbgs() << " > dependency between:\n"
634 << " " << *Src << "\n"
635 << " " << *Dst << "\n");
David Green963401d2018-07-01 12:47:30 +0000636 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000637 }
David Green963401d2018-07-01 12:47:30 +0000638 } else {
639 assert(LoopDepth + 1 <= D->getLevels());
640 if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
David Greenbc2e1c32018-08-02 07:30:53 +0000641 D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) {
642 LLVM_DEBUG(dbgs() << " < > dependency between:\n"
643 << " " << *Src << "\n"
644 << " " << *Dst << "\n");
David Green963401d2018-07-01 12:47:30 +0000645 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000646 }
David Green963401d2018-07-01 12:47:30 +0000647 }
648 }
649 }
650 }
651 return true;
652}
653
David Green2557e432018-07-12 10:44:47 +0000654static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
655 BasicBlockSet &SubLoopBlocks,
656 BasicBlockSet &AftBlocks, DependenceInfo &DI) {
David Green963401d2018-07-01 12:47:30 +0000657 // Get all loads/store pairs for each blocks
658 SmallVector<Value *, 4> ForeMemInstr;
659 SmallVector<Value *, 4> SubLoopMemInstr;
660 SmallVector<Value *, 4> AftMemInstr;
661 if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
662 !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
663 !getLoadsAndStores(AftBlocks, AftMemInstr))
664 return false;
665
666 // Check for dependencies between any blocks that may change order
667 unsigned LoopDepth = L->getLoopDepth();
668 return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
669 DI) &&
670 checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
671 checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
672 DI) &&
673 checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
674 DI);
675}
676
677bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
678 DependenceInfo &DI) {
679 /* We currently handle outer loops like this:
680 |
681 ForeFirst <----\ }
682 Blocks | } ForeBlocks
683 ForeLast | }
684 | |
685 SubLoopFirst <\ | }
686 Blocks | | } SubLoopBlocks
687 SubLoopLast -/ | }
688 | |
689 AftFirst | }
690 Blocks | } AftBlocks
691 AftLast ------/ }
692 |
693
694 There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
695 and AftBlocks, providing that there is one edge from Fores to SubLoops,
696 one edge from SubLoops to Afts and a single outer loop exit (from Afts).
697 In practice we currently limit Aft blocks to a single block, and limit
698 things further in the profitablility checks of the unroll and jam pass.
699
700 Because of the way we rearrange basic blocks, we also require that
701 the Fore blocks on all unrolled iterations are safe to move before the
702 SubLoop blocks of all iterations. So we require that the phi node looping
703 operands of ForeHeader can be moved to at least the end of ForeEnd, so that
704 we can arrange cloned Fore Blocks before the subloop and match up Phi's
705 correctly.
706
707 i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
708 It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
709
710 There are then a number of checks along the lines of no calls, no
711 exceptions, inner loop IV is consistent, etc. Note that for loops requiring
712 runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
713 UnrollAndJamLoop if the trip count cannot be easily calculated.
714 */
715
716 if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
717 return false;
718 Loop *SubLoop = L->getSubLoops()[0];
719 if (!SubLoop->isLoopSimplifyForm())
720 return false;
721
722 BasicBlock *Header = L->getHeader();
723 BasicBlock *Latch = L->getLoopLatch();
724 BasicBlock *Exit = L->getExitingBlock();
725 BasicBlock *SubLoopHeader = SubLoop->getHeader();
726 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
727 BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
728
729 if (Latch != Exit)
730 return false;
731 if (SubLoopLatch != SubLoopExit)
732 return false;
733
David Greenbc2e1c32018-08-02 07:30:53 +0000734 if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
735 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
David Green963401d2018-07-01 12:47:30 +0000736 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000737 }
David Green963401d2018-07-01 12:47:30 +0000738
739 // Split blocks into Fore/SubLoop/Aft based on dominators
David Green2557e432018-07-12 10:44:47 +0000740 BasicBlockSet SubLoopBlocks;
741 BasicBlockSet ForeBlocks;
742 BasicBlockSet AftBlocks;
David Green963401d2018-07-01 12:47:30 +0000743 if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
David Greenbc2e1c32018-08-02 07:30:53 +0000744 AftBlocks, &DT)) {
745 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
David Green963401d2018-07-01 12:47:30 +0000746 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000747 }
David Green963401d2018-07-01 12:47:30 +0000748
749 // Aft blocks may need to move instructions to fore blocks, which becomes more
750 // difficult if there are multiple (potentially conditionally executed)
751 // blocks. For now we just exclude loops with multiple aft blocks.
David Greenbc2e1c32018-08-02 07:30:53 +0000752 if (AftBlocks.size() != 1) {
753 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
754 "multiple blocks after the loop\n");
David Green963401d2018-07-01 12:47:30 +0000755 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000756 }
David Green963401d2018-07-01 12:47:30 +0000757
David Greenbc2e1c32018-08-02 07:30:53 +0000758 // Check inner loop backedge count is consistent on all iterations of the
759 // outer loop
David Green6cb64782018-08-15 10:59:41 +0000760 if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
David Greenbc2e1c32018-08-02 07:30:53 +0000761 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
762 "not consistent on each iteration\n");
David Green963401d2018-07-01 12:47:30 +0000763 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000764 }
David Green963401d2018-07-01 12:47:30 +0000765
766 // Check the loop safety info for exceptions.
Max Kazantsev9c90ec22018-10-16 08:31:05 +0000767 SimpleLoopSafetyInfo LSI;
Max Kazantsev530b8d12018-08-15 05:55:43 +0000768 LSI.computeLoopSafetyInfo(L);
769 if (LSI.anyBlockMayThrow()) {
David Greenbc2e1c32018-08-02 07:30:53 +0000770 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
David Green963401d2018-07-01 12:47:30 +0000771 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000772 }
David Green963401d2018-07-01 12:47:30 +0000773
774 // We've ruled out the easy stuff and now need to check that there are no
775 // interdependencies which may prevent us from moving the:
776 // ForeBlocks before Subloop and AftBlocks.
777 // Subloop before AftBlocks.
778 // ForeBlock phi operands before the subloop
779
780 // Make sure we can move all instructions we need to before the subloop
David Greeneda3c9e2018-07-26 15:19:07 +0000781 if (!processHeaderPhiOperands(
782 Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
783 if (SubLoop->contains(I->getParent()))
784 return false;
785 if (AftBlocks.count(I->getParent())) {
786 // If we hit a phi node in afts we know we are done (probably
787 // LCSSA)
788 if (isa<PHINode>(I))
789 return false;
790 // Can't move instructions with side effects or memory
791 // reads/writes
792 if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
793 return false;
794 }
795 // Keep going
796 return true;
David Greenbc2e1c32018-08-02 07:30:53 +0000797 })) {
798 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
799 "instructions after subloop to before it\n");
David Greeneda3c9e2018-07-26 15:19:07 +0000800 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000801 }
David Green963401d2018-07-01 12:47:30 +0000802
803 // Check for memory dependencies which prohibit the unrolling we are doing.
804 // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
805 // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
David Greenbc2e1c32018-08-02 07:30:53 +0000806 if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
807 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
David Green963401d2018-07-01 12:47:30 +0000808 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000809 }
David Green963401d2018-07-01 12:47:30 +0000810
811 return true;
812}