blob: ff49d83f25c54be2bbea39d8176117898b4bce87 [file] [log] [blame]
David Green963401d2018-07-01 12:47:30 +00001//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2//
Chandler Carruth2946cd72019-01-19 08:50:56 +00003// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
David Green963401d2018-07-01 12:47:30 +00006//
7//===----------------------------------------------------------------------===//
8//
9// This file implements loop unroll and jam as a routine, much like
10// LoopUnroll.cpp implements loop unroll.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/ADT/SmallPtrSet.h"
15#include "llvm/ADT/Statistic.h"
16#include "llvm/Analysis/AssumptionCache.h"
17#include "llvm/Analysis/DependenceAnalysis.h"
18#include "llvm/Analysis/InstructionSimplify.h"
19#include "llvm/Analysis/LoopAnalysisManager.h"
20#include "llvm/Analysis/LoopIterator.h"
21#include "llvm/Analysis/LoopPass.h"
22#include "llvm/Analysis/OptimizationRemarkEmitter.h"
23#include "llvm/Analysis/ScalarEvolution.h"
24#include "llvm/Analysis/ScalarEvolutionExpander.h"
25#include "llvm/Analysis/Utils/Local.h"
26#include "llvm/IR/BasicBlock.h"
27#include "llvm/IR/DataLayout.h"
28#include "llvm/IR/DebugInfoMetadata.h"
29#include "llvm/IR/Dominators.h"
30#include "llvm/IR/IntrinsicInst.h"
31#include "llvm/IR/LLVMContext.h"
32#include "llvm/Support/Debug.h"
33#include "llvm/Support/raw_ostream.h"
34#include "llvm/Transforms/Utils/BasicBlockUtils.h"
35#include "llvm/Transforms/Utils/Cloning.h"
36#include "llvm/Transforms/Utils/LoopSimplify.h"
37#include "llvm/Transforms/Utils/LoopUtils.h"
38#include "llvm/Transforms/Utils/SimplifyIndVar.h"
39#include "llvm/Transforms/Utils/UnrollLoop.h"
40using namespace llvm;
41
42#define DEBUG_TYPE "loop-unroll-and-jam"
43
44STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
45STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
46
David Green2557e432018-07-12 10:44:47 +000047typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
David Green963401d2018-07-01 12:47:30 +000048
49// Partition blocks in an outer/inner loop pair into blocks before and after
50// the loop
51static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
David Green2557e432018-07-12 10:44:47 +000052 BasicBlockSet &ForeBlocks,
53 BasicBlockSet &SubLoopBlocks,
54 BasicBlockSet &AftBlocks,
David Green963401d2018-07-01 12:47:30 +000055 DominatorTree *DT) {
56 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
David Green2557e432018-07-12 10:44:47 +000057 SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
David Green963401d2018-07-01 12:47:30 +000058
59 for (BasicBlock *BB : L->blocks()) {
60 if (!SubLoop->contains(BB)) {
61 if (DT->dominates(SubLoopLatch, BB))
David Green2557e432018-07-12 10:44:47 +000062 AftBlocks.insert(BB);
David Green963401d2018-07-01 12:47:30 +000063 else
David Green2557e432018-07-12 10:44:47 +000064 ForeBlocks.insert(BB);
David Green963401d2018-07-01 12:47:30 +000065 }
66 }
67
68 // Check that all blocks in ForeBlocks together dominate the subloop
69 // TODO: This might ideally be done better with a dominator/postdominators.
70 BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
71 for (BasicBlock *BB : ForeBlocks) {
72 if (BB == SubLoopPreHeader)
73 continue;
Chandler Carruthedb12a82018-10-15 10:04:59 +000074 Instruction *TI = BB->getTerminator();
David Green963401d2018-07-01 12:47:30 +000075 for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
David Green2557e432018-07-12 10:44:47 +000076 if (!ForeBlocks.count(TI->getSuccessor(i)))
David Green963401d2018-07-01 12:47:30 +000077 return false;
78 }
79
80 return true;
81}
82
David Greeneda3c9e2018-07-26 15:19:07 +000083// Looks at the phi nodes in Header for values coming from Latch. For these
84// instructions and all their operands calls Visit on them, keeping going for
85// all the operands in AftBlocks. Returns false if Visit returns false,
86// otherwise returns true. This is used to process the instructions in the
87// Aft blocks that need to be moved before the subloop. It is used in two
88// places. One to check that the required set of instructions can be moved
89// before the loop. Then to collect the instructions to actually move in
90// moveHeaderPhiOperandsToForeBlocks.
91template <typename T>
92static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
93 BasicBlockSet &AftBlocks, T Visit) {
94 SmallVector<Instruction *, 8> Worklist;
David Green963401d2018-07-01 12:47:30 +000095 for (auto &Phi : Header->phis()) {
96 Value *V = Phi.getIncomingValueForBlock(Latch);
97 if (Instruction *I = dyn_cast<Instruction>(V))
98 Worklist.push_back(I);
99 }
100
101 while (!Worklist.empty()) {
102 Instruction *I = Worklist.back();
103 Worklist.pop_back();
David Greeneda3c9e2018-07-26 15:19:07 +0000104 if (!Visit(I))
105 return false;
David Green963401d2018-07-01 12:47:30 +0000106
David Greeneda3c9e2018-07-26 15:19:07 +0000107 if (AftBlocks.count(I->getParent()))
108 for (auto &U : I->operands())
109 if (Instruction *II = dyn_cast<Instruction>(U))
110 Worklist.push_back(II);
David Green963401d2018-07-01 12:47:30 +0000111 }
112
David Greeneda3c9e2018-07-26 15:19:07 +0000113 return true;
114}
115
116// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
117static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
118 BasicBlock *Latch,
119 Instruction *InsertLoc,
120 BasicBlockSet &AftBlocks) {
121 // We need to ensure we move the instructions in the correct order,
122 // starting with the earliest required instruction and moving forward.
123 std::vector<Instruction *> Visited;
124 processHeaderPhiOperands(Header, Latch, AftBlocks,
125 [&Visited, &AftBlocks](Instruction *I) {
126 if (AftBlocks.count(I->getParent()))
127 Visited.push_back(I);
128 return true;
129 });
130
David Green963401d2018-07-01 12:47:30 +0000131 // Move all instructions in program order to before the InsertLoc
132 BasicBlock *InsertLocBB = InsertLoc->getParent();
133 for (Instruction *I : reverse(Visited)) {
134 if (I->getParent() != InsertLocBB)
135 I->moveBefore(InsertLoc);
136 }
137}
138
139/*
140 This method performs Unroll and Jam. For a simple loop like:
141 for (i = ..)
142 Fore(i)
143 for (j = ..)
144 SubLoop(i, j)
145 Aft(i)
146
147 Instead of doing normal inner or outer unrolling, we do:
148 for (i = .., i+=2)
149 Fore(i)
150 Fore(i+1)
151 for (j = ..)
152 SubLoop(i, j)
153 SubLoop(i+1, j)
154 Aft(i)
155 Aft(i+1)
156
157 So the outer loop is essetially unrolled and then the inner loops are fused
158 ("jammed") together into a single loop. This can increase speed when there
159 are loads in SubLoop that are invariant to i, as they become shared between
160 the now jammed inner loops.
161
162 We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
163 Fore blocks are those before the inner loop, Aft are those after. Normal
164 Unroll code is used to copy each of these sets of blocks and the results are
165 combined together into the final form above.
166
167 isSafeToUnrollAndJam should be used prior to calling this to make sure the
168 unrolling will be valid. Checking profitablility is also advisable.
Michael Kruse72448522018-12-12 17:32:52 +0000169
170 If EpilogueLoop is non-null, it receives the epilogue loop (if it was
171 necessary to create one and not fully unrolled).
David Green963401d2018-07-01 12:47:30 +0000172*/
Michael Kruse72448522018-12-12 17:32:52 +0000173LoopUnrollResult llvm::UnrollAndJamLoop(
174 Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
175 bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
176 AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
David Green963401d2018-07-01 12:47:30 +0000177
178 // When we enter here we should have already checked that it is safe
179 BasicBlock *Header = L->getHeader();
180 assert(L->getSubLoops().size() == 1);
181 Loop *SubLoop = *L->begin();
182
183 // Don't enter the unroll code if there is nothing to do.
184 if (TripCount == 0 && Count < 2) {
David Greenbc2e1c32018-08-02 07:30:53 +0000185 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
David Green963401d2018-07-01 12:47:30 +0000186 return LoopUnrollResult::Unmodified;
187 }
188
189 assert(Count > 0);
190 assert(TripMultiple > 0);
191 assert(TripCount == 0 || TripCount % TripMultiple == 0);
192
193 // Are we eliminating the loop control altogether?
194 bool CompletelyUnroll = (Count == TripCount);
195
196 // We use the runtime remainder in cases where we don't know trip multiple
197 if (TripMultiple == 1 || TripMultiple % Count != 0) {
198 if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
199 /*UseEpilogRemainder*/ true,
Alina Sbirlea2312a062019-04-12 19:16:07 +0000200 UnrollRemainder, /*ForgetAllSCEV*/ false,
201 LI, SE, DT, AC, true, EpilogueLoop)) {
David Green963401d2018-07-01 12:47:30 +0000202 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
203 "generated when assuming runtime trip count\n");
204 return LoopUnrollResult::Unmodified;
205 }
206 }
207
208 // Notify ScalarEvolution that the loop will be substantially changed,
209 // if not outright eliminated.
210 if (SE) {
211 SE->forgetLoop(L);
212 SE->forgetLoop(SubLoop);
213 }
214
215 using namespace ore;
216 // Report the unrolling decision.
217 if (CompletelyUnroll) {
218 LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
219 << Header->getName() << " with trip count " << TripCount
220 << "!\n");
221 ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
222 L->getHeader())
223 << "completely unroll and jammed loop with "
224 << NV("UnrollCount", TripCount) << " iterations");
225 } else {
226 auto DiagBuilder = [&]() {
227 OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
228 L->getHeader());
229 return Diag << "unroll and jammed loop by a factor of "
230 << NV("UnrollCount", Count);
231 };
232
233 LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
234 << " by " << Count);
235 if (TripMultiple != 1) {
236 LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
237 ORE->emit([&]() {
238 return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
239 << " trips per branch";
240 });
241 } else {
242 LLVM_DEBUG(dbgs() << " with run-time trip count");
243 ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
244 }
245 LLVM_DEBUG(dbgs() << "!\n");
246 }
247
248 BasicBlock *Preheader = L->getLoopPreheader();
249 BasicBlock *LatchBlock = L->getLoopLatch();
250 BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
251 assert(Preheader && LatchBlock && Header);
252 assert(BI && !BI->isUnconditional());
253 bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
254 BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
255 bool SubLoopContinueOnTrue = SubLoop->contains(
256 SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
257
258 // Partition blocks in an outer/inner loop pair into blocks before and after
259 // the loop
David Green2557e432018-07-12 10:44:47 +0000260 BasicBlockSet SubLoopBlocks;
261 BasicBlockSet ForeBlocks;
262 BasicBlockSet AftBlocks;
David Green963401d2018-07-01 12:47:30 +0000263 partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
264 DT);
265
266 // We keep track of the entering/first and exiting/last block of each of
267 // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
268 // blocks easier.
269 std::vector<BasicBlock *> ForeBlocksFirst;
270 std::vector<BasicBlock *> ForeBlocksLast;
271 std::vector<BasicBlock *> SubLoopBlocksFirst;
272 std::vector<BasicBlock *> SubLoopBlocksLast;
273 std::vector<BasicBlock *> AftBlocksFirst;
274 std::vector<BasicBlock *> AftBlocksLast;
275 ForeBlocksFirst.push_back(Header);
276 ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
277 SubLoopBlocksFirst.push_back(SubLoop->getHeader());
278 SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
279 AftBlocksFirst.push_back(SubLoop->getExitBlock());
280 AftBlocksLast.push_back(L->getExitingBlock());
281 // Maps Blocks[0] -> Blocks[It]
282 ValueToValueMapTy LastValueMap;
283
284 // Move any instructions from fore phi operands from AftBlocks into Fore.
285 moveHeaderPhiOperandsToForeBlocks(
286 Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
287 AftBlocks);
288
289 // The current on-the-fly SSA update requires blocks to be processed in
290 // reverse postorder so that LastValueMap contains the correct value at each
291 // exit.
292 LoopBlocksDFS DFS(L);
293 DFS.perform(LI);
294 // Stash the DFS iterators before adding blocks to the loop.
295 LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
296 LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
297
298 if (Header->getParent()->isDebugInfoForProfiling())
299 for (BasicBlock *BB : L->getBlocks())
300 for (Instruction &I : *BB)
301 if (!isa<DbgInfoIntrinsic>(&I))
Mircea Trofinb53eeb62018-12-21 22:48:50 +0000302 if (const DILocation *DIL = I.getDebugLoc()) {
Mircea Trofinec026302019-01-24 00:10:25 +0000303 auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
Mircea Trofinb53eeb62018-12-21 22:48:50 +0000304 if (NewDIL)
305 I.setDebugLoc(NewDIL.getValue());
306 else
307 LLVM_DEBUG(dbgs()
308 << "Failed to create new discriminator: "
309 << DIL->getFilename() << " Line: " << DIL->getLine());
310 }
David Green963401d2018-07-01 12:47:30 +0000311
312 // Copy all blocks
313 for (unsigned It = 1; It != Count; ++It) {
314 std::vector<BasicBlock *> NewBlocks;
315 // Maps Blocks[It] -> Blocks[It-1]
316 DenseMap<Value *, Value *> PrevItValueMap;
317
318 for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
319 ValueToValueMapTy VMap;
320 BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
321 Header->getParent()->getBasicBlockList().push_back(New);
322
David Green2557e432018-07-12 10:44:47 +0000323 if (ForeBlocks.count(*BB)) {
David Green963401d2018-07-01 12:47:30 +0000324 L->addBasicBlockToLoop(New, *LI);
325
326 if (*BB == ForeBlocksFirst[0])
327 ForeBlocksFirst.push_back(New);
328 if (*BB == ForeBlocksLast[0])
329 ForeBlocksLast.push_back(New);
David Green2557e432018-07-12 10:44:47 +0000330 } else if (SubLoopBlocks.count(*BB)) {
David Green963401d2018-07-01 12:47:30 +0000331 SubLoop->addBasicBlockToLoop(New, *LI);
332
333 if (*BB == SubLoopBlocksFirst[0])
334 SubLoopBlocksFirst.push_back(New);
335 if (*BB == SubLoopBlocksLast[0])
336 SubLoopBlocksLast.push_back(New);
David Green2557e432018-07-12 10:44:47 +0000337 } else if (AftBlocks.count(*BB)) {
David Green963401d2018-07-01 12:47:30 +0000338 L->addBasicBlockToLoop(New, *LI);
339
340 if (*BB == AftBlocksFirst[0])
341 AftBlocksFirst.push_back(New);
342 if (*BB == AftBlocksLast[0])
343 AftBlocksLast.push_back(New);
344 } else {
345 llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
346 }
347
348 // Update our running maps of newest clones
349 PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
350 LastValueMap[*BB] = New;
351 for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
352 VI != VE; ++VI) {
353 PrevItValueMap[VI->second] =
354 const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
355 LastValueMap[VI->first] = VI->second;
356 }
357
358 NewBlocks.push_back(New);
359
360 // Update DomTree:
361 if (*BB == ForeBlocksFirst[0])
362 DT->addNewBlock(New, ForeBlocksLast[It - 1]);
363 else if (*BB == SubLoopBlocksFirst[0])
364 DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
365 else if (*BB == AftBlocksFirst[0])
366 DT->addNewBlock(New, AftBlocksLast[It - 1]);
367 else {
368 // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
369 // structure.
370 auto BBDomNode = DT->getNode(*BB);
371 auto BBIDom = BBDomNode->getIDom();
372 BasicBlock *OriginalBBIDom = BBIDom->getBlock();
373 assert(OriginalBBIDom);
374 assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
375 DT->addNewBlock(
376 New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
377 }
378 }
379
380 // Remap all instructions in the most recent iteration
381 for (BasicBlock *NewBlock : NewBlocks) {
382 for (Instruction &I : *NewBlock) {
383 ::remapInstruction(&I, LastValueMap);
384 if (auto *II = dyn_cast<IntrinsicInst>(&I))
385 if (II->getIntrinsicID() == Intrinsic::assume)
386 AC->registerAssumption(II);
387 }
388 }
389
390 // Alter the ForeBlocks phi's, pointing them at the latest version of the
391 // value from the previous iteration's phis
392 for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
393 Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
394 assert(OldValue && "should have incoming edge from Aft[It]");
395 Value *NewValue = OldValue;
396 if (Value *PrevValue = PrevItValueMap[OldValue])
397 NewValue = PrevValue;
398
399 assert(Phi.getNumOperands() == 2);
400 Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
401 Phi.setIncomingValue(0, NewValue);
402 Phi.removeIncomingValue(1);
403 }
404 }
405
406 // Now that all the basic blocks for the unrolled iterations are in place,
407 // finish up connecting the blocks and phi nodes. At this point LastValueMap
408 // is the last unrolled iterations values.
409
410 // Update Phis in BB from OldBB to point to NewBB
411 auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
412 BasicBlock *NewBB) {
413 for (PHINode &Phi : BB->phis()) {
414 int I = Phi.getBasicBlockIndex(OldBB);
415 Phi.setIncomingBlock(I, NewBB);
416 }
417 };
418 // Update Phis in BB from OldBB to point to NewBB and use the latest value
419 // from LastValueMap
420 auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
421 BasicBlock *NewBB,
422 ValueToValueMapTy &LastValueMap) {
423 for (PHINode &Phi : BB->phis()) {
424 for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
425 if (Phi.getIncomingBlock(b) == OldBB) {
426 Value *OldValue = Phi.getIncomingValue(b);
427 if (Value *LastValue = LastValueMap[OldValue])
428 Phi.setIncomingValue(b, LastValue);
429 Phi.setIncomingBlock(b, NewBB);
430 break;
431 }
432 }
433 }
434 };
435 // Move all the phis from Src into Dest
436 auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
437 Instruction *insertPoint = Dest->getFirstNonPHI();
438 while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
439 Phi->moveBefore(insertPoint);
440 };
441
442 // Update the PHI values outside the loop to point to the last block
443 updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
444 LastValueMap);
445
446 // Update ForeBlocks successors and phi nodes
447 BranchInst *ForeTerm =
448 cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
449 BasicBlock *Dest = SubLoopBlocksFirst[0];
450 ForeTerm->setSuccessor(0, Dest);
451
452 if (CompletelyUnroll) {
453 while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
454 Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
455 Phi->getParent()->getInstList().erase(Phi);
456 }
457 } else {
458 // Update the PHI values to point to the last aft block
459 updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
460 AftBlocksLast.back(), LastValueMap);
461 }
462
463 for (unsigned It = 1; It != Count; It++) {
464 // Remap ForeBlock successors from previous iteration to this
465 BranchInst *ForeTerm =
466 cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
467 BasicBlock *Dest = ForeBlocksFirst[It];
468 ForeTerm->setSuccessor(0, Dest);
469 }
470
471 // Subloop successors and phis
472 BranchInst *SubTerm =
473 cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
474 SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
475 SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
476 updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
477 ForeBlocksLast.back());
478 updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
479 SubLoopBlocksLast.back());
480
481 for (unsigned It = 1; It != Count; It++) {
482 // Replace the conditional branch of the previous iteration subloop with an
483 // unconditional one to this one
484 BranchInst *SubTerm =
485 cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
486 BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
487 SubTerm->eraseFromParent();
488
489 updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
490 ForeBlocksLast.back());
491 updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
492 SubLoopBlocksLast.back());
493 movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
494 }
495
496 // Aft blocks successors and phis
497 BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
498 if (CompletelyUnroll) {
499 BranchInst::Create(LoopExit, Term);
500 Term->eraseFromParent();
501 } else {
502 Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
503 }
504 updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
505 SubLoopBlocksLast.back());
506
507 for (unsigned It = 1; It != Count; It++) {
508 // Replace the conditional branch of the previous iteration subloop with an
509 // unconditional one to this one
510 BranchInst *AftTerm =
511 cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
512 BranchInst::Create(AftBlocksFirst[It], AftTerm);
513 AftTerm->eraseFromParent();
514
515 updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
516 SubLoopBlocksLast.back());
517 movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
518 }
519
520 // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
521 // new ones required.
522 if (Count != 1) {
523 SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
524 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
525 SubLoopBlocksFirst[0]);
526 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
527 SubLoopBlocksLast[0], AftBlocksFirst[0]);
528
529 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
530 ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
531 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
532 SubLoopBlocksLast.back(), AftBlocksFirst[0]);
533 DT->applyUpdates(DTUpdates);
534 }
535
536 // Merge adjacent basic blocks, if possible.
537 SmallPtrSet<BasicBlock *, 16> MergeBlocks;
538 MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
539 MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
540 MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
Alina Sbirleabfceed42019-06-04 18:45:15 +0000541 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
David Green963401d2018-07-01 12:47:30 +0000542 while (!MergeBlocks.empty()) {
543 BasicBlock *BB = *MergeBlocks.begin();
544 BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
545 if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
546 BasicBlock *Dest = Term->getSuccessor(0);
Alina Sbirleabfceed42019-06-04 18:45:15 +0000547 BasicBlock *Fold = Dest->getUniquePredecessor();
548 if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
David Green963401d2018-07-01 12:47:30 +0000549 // Don't remove BB and add Fold as they are the same BB
550 assert(Fold == BB);
551 (void)Fold;
552 MergeBlocks.erase(Dest);
553 } else
554 MergeBlocks.erase(BB);
555 } else
556 MergeBlocks.erase(BB);
557 }
558
559 // At this point, the code is well formed. We now do a quick sweep over the
560 // inserted code, doing constant propagation and dead code elimination as we
561 // go.
562 simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
563 simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
564
565 NumCompletelyUnrolledAndJammed += CompletelyUnroll;
566 ++NumUnrolledAndJammed;
567
568#ifndef NDEBUG
569 // We shouldn't have done anything to break loop simplify form or LCSSA.
570 Loop *OuterL = L->getParentLoop();
571 Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
572 assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
573 if (!CompletelyUnroll)
574 assert(L->isLoopSimplifyForm());
575 assert(SubLoop->isLoopSimplifyForm());
576 assert(DT->verify());
577#endif
578
579 // Update LoopInfo if the loop is completely removed.
580 if (CompletelyUnroll)
581 LI->erase(L);
582
583 return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
584 : LoopUnrollResult::PartiallyUnrolled;
585}
586
David Green2557e432018-07-12 10:44:47 +0000587static bool getLoadsAndStores(BasicBlockSet &Blocks,
David Green963401d2018-07-01 12:47:30 +0000588 SmallVector<Value *, 4> &MemInstr) {
589 // Scan the BBs and collect legal loads and stores.
590 // Returns false if non-simple loads/stores are found.
591 for (BasicBlock *BB : Blocks) {
592 for (Instruction &I : *BB) {
593 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
594 if (!Ld->isSimple())
595 return false;
596 MemInstr.push_back(&I);
597 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
598 if (!St->isSimple())
599 return false;
600 MemInstr.push_back(&I);
601 } else if (I.mayReadOrWriteMemory()) {
602 return false;
603 }
604 }
605 }
606 return true;
607}
608
609static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
610 SmallVector<Value *, 4> &Later,
611 unsigned LoopDepth, bool InnerLoop,
612 DependenceInfo &DI) {
613 // Use DA to check for dependencies between loads and stores that make unroll
614 // and jam invalid
615 for (Value *I : Earlier) {
616 for (Value *J : Later) {
617 Instruction *Src = cast<Instruction>(I);
618 Instruction *Dst = cast<Instruction>(J);
619 if (Src == Dst)
620 continue;
621 // Ignore Input dependencies.
622 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
623 continue;
624
625 // Track dependencies, and if we find them take a conservative approach
626 // by allowing only = or < (not >), altough some > would be safe
627 // (depending upon unroll width).
628 // For the inner loop, we need to disallow any (> <) dependencies
629 // FIXME: Allow > so long as distance is less than unroll width
630 if (auto D = DI.depends(Src, Dst, true)) {
631 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
632
David Greenbc2e1c32018-08-02 07:30:53 +0000633 if (D->isConfused()) {
634 LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
635 << " " << *Src << "\n"
636 << " " << *Dst << "\n");
David Green963401d2018-07-01 12:47:30 +0000637 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000638 }
David Green963401d2018-07-01 12:47:30 +0000639 if (!InnerLoop) {
David Greenbc2e1c32018-08-02 07:30:53 +0000640 if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
641 LLVM_DEBUG(dbgs() << " > dependency between:\n"
642 << " " << *Src << "\n"
643 << " " << *Dst << "\n");
David Green963401d2018-07-01 12:47:30 +0000644 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000645 }
David Green963401d2018-07-01 12:47:30 +0000646 } else {
647 assert(LoopDepth + 1 <= D->getLevels());
648 if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
David Greenbc2e1c32018-08-02 07:30:53 +0000649 D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) {
650 LLVM_DEBUG(dbgs() << " < > dependency between:\n"
651 << " " << *Src << "\n"
652 << " " << *Dst << "\n");
David Green963401d2018-07-01 12:47:30 +0000653 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000654 }
David Green963401d2018-07-01 12:47:30 +0000655 }
656 }
657 }
658 }
659 return true;
660}
661
David Green2557e432018-07-12 10:44:47 +0000662static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
663 BasicBlockSet &SubLoopBlocks,
664 BasicBlockSet &AftBlocks, DependenceInfo &DI) {
David Green963401d2018-07-01 12:47:30 +0000665 // Get all loads/store pairs for each blocks
666 SmallVector<Value *, 4> ForeMemInstr;
667 SmallVector<Value *, 4> SubLoopMemInstr;
668 SmallVector<Value *, 4> AftMemInstr;
669 if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
670 !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
671 !getLoadsAndStores(AftBlocks, AftMemInstr))
672 return false;
673
674 // Check for dependencies between any blocks that may change order
675 unsigned LoopDepth = L->getLoopDepth();
676 return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
677 DI) &&
678 checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
679 checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
680 DI) &&
681 checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
682 DI);
683}
684
685bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
686 DependenceInfo &DI) {
687 /* We currently handle outer loops like this:
688 |
689 ForeFirst <----\ }
690 Blocks | } ForeBlocks
691 ForeLast | }
692 | |
693 SubLoopFirst <\ | }
694 Blocks | | } SubLoopBlocks
695 SubLoopLast -/ | }
696 | |
697 AftFirst | }
698 Blocks | } AftBlocks
699 AftLast ------/ }
700 |
701
702 There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
703 and AftBlocks, providing that there is one edge from Fores to SubLoops,
704 one edge from SubLoops to Afts and a single outer loop exit (from Afts).
705 In practice we currently limit Aft blocks to a single block, and limit
706 things further in the profitablility checks of the unroll and jam pass.
707
708 Because of the way we rearrange basic blocks, we also require that
709 the Fore blocks on all unrolled iterations are safe to move before the
710 SubLoop blocks of all iterations. So we require that the phi node looping
711 operands of ForeHeader can be moved to at least the end of ForeEnd, so that
712 we can arrange cloned Fore Blocks before the subloop and match up Phi's
713 correctly.
714
715 i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
716 It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
717
718 There are then a number of checks along the lines of no calls, no
719 exceptions, inner loop IV is consistent, etc. Note that for loops requiring
720 runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
721 UnrollAndJamLoop if the trip count cannot be easily calculated.
722 */
723
724 if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
725 return false;
726 Loop *SubLoop = L->getSubLoops()[0];
727 if (!SubLoop->isLoopSimplifyForm())
728 return false;
729
730 BasicBlock *Header = L->getHeader();
731 BasicBlock *Latch = L->getLoopLatch();
732 BasicBlock *Exit = L->getExitingBlock();
733 BasicBlock *SubLoopHeader = SubLoop->getHeader();
734 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
735 BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
736
737 if (Latch != Exit)
738 return false;
739 if (SubLoopLatch != SubLoopExit)
740 return false;
741
David Greenbc2e1c32018-08-02 07:30:53 +0000742 if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
743 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
David Green963401d2018-07-01 12:47:30 +0000744 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000745 }
David Green963401d2018-07-01 12:47:30 +0000746
747 // Split blocks into Fore/SubLoop/Aft based on dominators
David Green2557e432018-07-12 10:44:47 +0000748 BasicBlockSet SubLoopBlocks;
749 BasicBlockSet ForeBlocks;
750 BasicBlockSet AftBlocks;
David Green963401d2018-07-01 12:47:30 +0000751 if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
David Greenbc2e1c32018-08-02 07:30:53 +0000752 AftBlocks, &DT)) {
753 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
David Green963401d2018-07-01 12:47:30 +0000754 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000755 }
David Green963401d2018-07-01 12:47:30 +0000756
757 // Aft blocks may need to move instructions to fore blocks, which becomes more
758 // difficult if there are multiple (potentially conditionally executed)
759 // blocks. For now we just exclude loops with multiple aft blocks.
David Greenbc2e1c32018-08-02 07:30:53 +0000760 if (AftBlocks.size() != 1) {
761 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
762 "multiple blocks after the loop\n");
David Green963401d2018-07-01 12:47:30 +0000763 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000764 }
David Green963401d2018-07-01 12:47:30 +0000765
David Greenbc2e1c32018-08-02 07:30:53 +0000766 // Check inner loop backedge count is consistent on all iterations of the
767 // outer loop
David Green6cb64782018-08-15 10:59:41 +0000768 if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
David Greenbc2e1c32018-08-02 07:30:53 +0000769 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
770 "not consistent on each iteration\n");
David Green963401d2018-07-01 12:47:30 +0000771 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000772 }
David Green963401d2018-07-01 12:47:30 +0000773
774 // Check the loop safety info for exceptions.
Max Kazantsev9c90ec22018-10-16 08:31:05 +0000775 SimpleLoopSafetyInfo LSI;
Max Kazantsev530b8d12018-08-15 05:55:43 +0000776 LSI.computeLoopSafetyInfo(L);
777 if (LSI.anyBlockMayThrow()) {
David Greenbc2e1c32018-08-02 07:30:53 +0000778 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
David Green963401d2018-07-01 12:47:30 +0000779 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000780 }
David Green963401d2018-07-01 12:47:30 +0000781
782 // We've ruled out the easy stuff and now need to check that there are no
783 // interdependencies which may prevent us from moving the:
784 // ForeBlocks before Subloop and AftBlocks.
785 // Subloop before AftBlocks.
786 // ForeBlock phi operands before the subloop
787
788 // Make sure we can move all instructions we need to before the subloop
David Greeneda3c9e2018-07-26 15:19:07 +0000789 if (!processHeaderPhiOperands(
790 Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
791 if (SubLoop->contains(I->getParent()))
792 return false;
793 if (AftBlocks.count(I->getParent())) {
794 // If we hit a phi node in afts we know we are done (probably
795 // LCSSA)
796 if (isa<PHINode>(I))
797 return false;
798 // Can't move instructions with side effects or memory
799 // reads/writes
800 if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
801 return false;
802 }
803 // Keep going
804 return true;
David Greenbc2e1c32018-08-02 07:30:53 +0000805 })) {
806 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
807 "instructions after subloop to before it\n");
David Greeneda3c9e2018-07-26 15:19:07 +0000808 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000809 }
David Green963401d2018-07-01 12:47:30 +0000810
811 // Check for memory dependencies which prohibit the unrolling we are doing.
812 // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
813 // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
David Greenbc2e1c32018-08-02 07:30:53 +0000814 if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
815 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
David Green963401d2018-07-01 12:47:30 +0000816 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000817 }
David Green963401d2018-07-01 12:47:30 +0000818
819 return true;
820}