blob: b347b7b2f3bea708be8f4b3e31fd34d1b6fd3404 [file] [log] [blame]
David Green963401d2018-07-01 12:47:30 +00001//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2//
Chandler Carruth2946cd72019-01-19 08:50:56 +00003// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
David Green963401d2018-07-01 12:47:30 +00006//
7//===----------------------------------------------------------------------===//
8//
9// This file implements loop unroll and jam as a routine, much like
10// LoopUnroll.cpp implements loop unroll.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/ADT/SmallPtrSet.h"
15#include "llvm/ADT/Statistic.h"
16#include "llvm/Analysis/AssumptionCache.h"
17#include "llvm/Analysis/DependenceAnalysis.h"
18#include "llvm/Analysis/InstructionSimplify.h"
19#include "llvm/Analysis/LoopAnalysisManager.h"
20#include "llvm/Analysis/LoopIterator.h"
21#include "llvm/Analysis/LoopPass.h"
22#include "llvm/Analysis/OptimizationRemarkEmitter.h"
23#include "llvm/Analysis/ScalarEvolution.h"
24#include "llvm/Analysis/ScalarEvolutionExpander.h"
25#include "llvm/Analysis/Utils/Local.h"
26#include "llvm/IR/BasicBlock.h"
27#include "llvm/IR/DataLayout.h"
28#include "llvm/IR/DebugInfoMetadata.h"
29#include "llvm/IR/Dominators.h"
30#include "llvm/IR/IntrinsicInst.h"
31#include "llvm/IR/LLVMContext.h"
32#include "llvm/Support/Debug.h"
33#include "llvm/Support/raw_ostream.h"
34#include "llvm/Transforms/Utils/BasicBlockUtils.h"
35#include "llvm/Transforms/Utils/Cloning.h"
36#include "llvm/Transforms/Utils/LoopSimplify.h"
37#include "llvm/Transforms/Utils/LoopUtils.h"
38#include "llvm/Transforms/Utils/SimplifyIndVar.h"
39#include "llvm/Transforms/Utils/UnrollLoop.h"
40using namespace llvm;
41
42#define DEBUG_TYPE "loop-unroll-and-jam"
43
44STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
45STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
46
David Green2557e432018-07-12 10:44:47 +000047typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
David Green963401d2018-07-01 12:47:30 +000048
49// Partition blocks in an outer/inner loop pair into blocks before and after
50// the loop
51static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
David Green2557e432018-07-12 10:44:47 +000052 BasicBlockSet &ForeBlocks,
53 BasicBlockSet &SubLoopBlocks,
54 BasicBlockSet &AftBlocks,
David Green963401d2018-07-01 12:47:30 +000055 DominatorTree *DT) {
56 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
David Green2557e432018-07-12 10:44:47 +000057 SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
David Green963401d2018-07-01 12:47:30 +000058
59 for (BasicBlock *BB : L->blocks()) {
60 if (!SubLoop->contains(BB)) {
61 if (DT->dominates(SubLoopLatch, BB))
David Green2557e432018-07-12 10:44:47 +000062 AftBlocks.insert(BB);
David Green963401d2018-07-01 12:47:30 +000063 else
David Green2557e432018-07-12 10:44:47 +000064 ForeBlocks.insert(BB);
David Green963401d2018-07-01 12:47:30 +000065 }
66 }
67
68 // Check that all blocks in ForeBlocks together dominate the subloop
69 // TODO: This might ideally be done better with a dominator/postdominators.
70 BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
71 for (BasicBlock *BB : ForeBlocks) {
72 if (BB == SubLoopPreHeader)
73 continue;
Chandler Carruthedb12a82018-10-15 10:04:59 +000074 Instruction *TI = BB->getTerminator();
David Green963401d2018-07-01 12:47:30 +000075 for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
David Green2557e432018-07-12 10:44:47 +000076 if (!ForeBlocks.count(TI->getSuccessor(i)))
David Green963401d2018-07-01 12:47:30 +000077 return false;
78 }
79
80 return true;
81}
82
David Greeneda3c9e2018-07-26 15:19:07 +000083// Looks at the phi nodes in Header for values coming from Latch. For these
84// instructions and all their operands calls Visit on them, keeping going for
85// all the operands in AftBlocks. Returns false if Visit returns false,
86// otherwise returns true. This is used to process the instructions in the
87// Aft blocks that need to be moved before the subloop. It is used in two
88// places. One to check that the required set of instructions can be moved
89// before the loop. Then to collect the instructions to actually move in
90// moveHeaderPhiOperandsToForeBlocks.
91template <typename T>
92static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
93 BasicBlockSet &AftBlocks, T Visit) {
94 SmallVector<Instruction *, 8> Worklist;
David Green963401d2018-07-01 12:47:30 +000095 for (auto &Phi : Header->phis()) {
96 Value *V = Phi.getIncomingValueForBlock(Latch);
97 if (Instruction *I = dyn_cast<Instruction>(V))
98 Worklist.push_back(I);
99 }
100
101 while (!Worklist.empty()) {
102 Instruction *I = Worklist.back();
103 Worklist.pop_back();
David Greeneda3c9e2018-07-26 15:19:07 +0000104 if (!Visit(I))
105 return false;
David Green963401d2018-07-01 12:47:30 +0000106
David Greeneda3c9e2018-07-26 15:19:07 +0000107 if (AftBlocks.count(I->getParent()))
108 for (auto &U : I->operands())
109 if (Instruction *II = dyn_cast<Instruction>(U))
110 Worklist.push_back(II);
David Green963401d2018-07-01 12:47:30 +0000111 }
112
David Greeneda3c9e2018-07-26 15:19:07 +0000113 return true;
114}
115
116// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
117static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
118 BasicBlock *Latch,
119 Instruction *InsertLoc,
120 BasicBlockSet &AftBlocks) {
121 // We need to ensure we move the instructions in the correct order,
122 // starting with the earliest required instruction and moving forward.
123 std::vector<Instruction *> Visited;
124 processHeaderPhiOperands(Header, Latch, AftBlocks,
125 [&Visited, &AftBlocks](Instruction *I) {
126 if (AftBlocks.count(I->getParent()))
127 Visited.push_back(I);
128 return true;
129 });
130
David Green963401d2018-07-01 12:47:30 +0000131 // Move all instructions in program order to before the InsertLoc
132 BasicBlock *InsertLocBB = InsertLoc->getParent();
133 for (Instruction *I : reverse(Visited)) {
134 if (I->getParent() != InsertLocBB)
135 I->moveBefore(InsertLoc);
136 }
137}
138
139/*
140 This method performs Unroll and Jam. For a simple loop like:
141 for (i = ..)
142 Fore(i)
143 for (j = ..)
144 SubLoop(i, j)
145 Aft(i)
146
147 Instead of doing normal inner or outer unrolling, we do:
148 for (i = .., i+=2)
149 Fore(i)
150 Fore(i+1)
151 for (j = ..)
152 SubLoop(i, j)
153 SubLoop(i+1, j)
154 Aft(i)
155 Aft(i+1)
156
157 So the outer loop is essetially unrolled and then the inner loops are fused
158 ("jammed") together into a single loop. This can increase speed when there
159 are loads in SubLoop that are invariant to i, as they become shared between
160 the now jammed inner loops.
161
162 We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
163 Fore blocks are those before the inner loop, Aft are those after. Normal
164 Unroll code is used to copy each of these sets of blocks and the results are
165 combined together into the final form above.
166
167 isSafeToUnrollAndJam should be used prior to calling this to make sure the
168 unrolling will be valid. Checking profitablility is also advisable.
Michael Kruse72448522018-12-12 17:32:52 +0000169
170 If EpilogueLoop is non-null, it receives the epilogue loop (if it was
171 necessary to create one and not fully unrolled).
David Green963401d2018-07-01 12:47:30 +0000172*/
Michael Kruse72448522018-12-12 17:32:52 +0000173LoopUnrollResult llvm::UnrollAndJamLoop(
174 Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
175 bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
176 AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
David Green963401d2018-07-01 12:47:30 +0000177
178 // When we enter here we should have already checked that it is safe
179 BasicBlock *Header = L->getHeader();
Dávid Bolvanský60cb1932019-11-03 20:02:54 +0100180 assert(Header && "No header.");
David Green963401d2018-07-01 12:47:30 +0000181 assert(L->getSubLoops().size() == 1);
182 Loop *SubLoop = *L->begin();
183
184 // Don't enter the unroll code if there is nothing to do.
185 if (TripCount == 0 && Count < 2) {
David Greenbc2e1c32018-08-02 07:30:53 +0000186 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
David Green963401d2018-07-01 12:47:30 +0000187 return LoopUnrollResult::Unmodified;
188 }
189
190 assert(Count > 0);
191 assert(TripMultiple > 0);
192 assert(TripCount == 0 || TripCount % TripMultiple == 0);
193
194 // Are we eliminating the loop control altogether?
195 bool CompletelyUnroll = (Count == TripCount);
196
197 // We use the runtime remainder in cases where we don't know trip multiple
198 if (TripMultiple == 1 || TripMultiple % Count != 0) {
199 if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
200 /*UseEpilogRemainder*/ true,
Alina Sbirlea2312a062019-04-12 19:16:07 +0000201 UnrollRemainder, /*ForgetAllSCEV*/ false,
202 LI, SE, DT, AC, true, EpilogueLoop)) {
David Green963401d2018-07-01 12:47:30 +0000203 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
204 "generated when assuming runtime trip count\n");
205 return LoopUnrollResult::Unmodified;
206 }
207 }
208
209 // Notify ScalarEvolution that the loop will be substantially changed,
210 // if not outright eliminated.
211 if (SE) {
212 SE->forgetLoop(L);
213 SE->forgetLoop(SubLoop);
214 }
215
216 using namespace ore;
217 // Report the unrolling decision.
218 if (CompletelyUnroll) {
219 LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
220 << Header->getName() << " with trip count " << TripCount
221 << "!\n");
222 ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
223 L->getHeader())
224 << "completely unroll and jammed loop with "
225 << NV("UnrollCount", TripCount) << " iterations");
226 } else {
227 auto DiagBuilder = [&]() {
228 OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
229 L->getHeader());
230 return Diag << "unroll and jammed loop by a factor of "
231 << NV("UnrollCount", Count);
232 };
233
234 LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
235 << " by " << Count);
236 if (TripMultiple != 1) {
237 LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
238 ORE->emit([&]() {
239 return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
240 << " trips per branch";
241 });
242 } else {
243 LLVM_DEBUG(dbgs() << " with run-time trip count");
244 ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
245 }
246 LLVM_DEBUG(dbgs() << "!\n");
247 }
248
249 BasicBlock *Preheader = L->getLoopPreheader();
250 BasicBlock *LatchBlock = L->getLoopLatch();
Dávid Bolvanský60cb1932019-11-03 20:02:54 +0100251 assert(Preheader && "No preheader");
252 assert(LatchBlock && "No latch block");
David Green963401d2018-07-01 12:47:30 +0000253 BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
David Green963401d2018-07-01 12:47:30 +0000254 assert(BI && !BI->isUnconditional());
255 bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
256 BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
257 bool SubLoopContinueOnTrue = SubLoop->contains(
258 SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
259
260 // Partition blocks in an outer/inner loop pair into blocks before and after
261 // the loop
David Green2557e432018-07-12 10:44:47 +0000262 BasicBlockSet SubLoopBlocks;
263 BasicBlockSet ForeBlocks;
264 BasicBlockSet AftBlocks;
David Green963401d2018-07-01 12:47:30 +0000265 partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
266 DT);
267
268 // We keep track of the entering/first and exiting/last block of each of
269 // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
270 // blocks easier.
271 std::vector<BasicBlock *> ForeBlocksFirst;
272 std::vector<BasicBlock *> ForeBlocksLast;
273 std::vector<BasicBlock *> SubLoopBlocksFirst;
274 std::vector<BasicBlock *> SubLoopBlocksLast;
275 std::vector<BasicBlock *> AftBlocksFirst;
276 std::vector<BasicBlock *> AftBlocksLast;
277 ForeBlocksFirst.push_back(Header);
278 ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
279 SubLoopBlocksFirst.push_back(SubLoop->getHeader());
280 SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
281 AftBlocksFirst.push_back(SubLoop->getExitBlock());
282 AftBlocksLast.push_back(L->getExitingBlock());
283 // Maps Blocks[0] -> Blocks[It]
284 ValueToValueMapTy LastValueMap;
285
286 // Move any instructions from fore phi operands from AftBlocks into Fore.
287 moveHeaderPhiOperandsToForeBlocks(
288 Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
289 AftBlocks);
290
291 // The current on-the-fly SSA update requires blocks to be processed in
292 // reverse postorder so that LastValueMap contains the correct value at each
293 // exit.
294 LoopBlocksDFS DFS(L);
295 DFS.perform(LI);
296 // Stash the DFS iterators before adding blocks to the loop.
297 LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
298 LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
299
300 if (Header->getParent()->isDebugInfoForProfiling())
301 for (BasicBlock *BB : L->getBlocks())
302 for (Instruction &I : *BB)
303 if (!isa<DbgInfoIntrinsic>(&I))
Mircea Trofinb53eeb62018-12-21 22:48:50 +0000304 if (const DILocation *DIL = I.getDebugLoc()) {
Mircea Trofinec026302019-01-24 00:10:25 +0000305 auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
Mircea Trofinb53eeb62018-12-21 22:48:50 +0000306 if (NewDIL)
307 I.setDebugLoc(NewDIL.getValue());
308 else
309 LLVM_DEBUG(dbgs()
310 << "Failed to create new discriminator: "
311 << DIL->getFilename() << " Line: " << DIL->getLine());
312 }
David Green963401d2018-07-01 12:47:30 +0000313
314 // Copy all blocks
315 for (unsigned It = 1; It != Count; ++It) {
316 std::vector<BasicBlock *> NewBlocks;
317 // Maps Blocks[It] -> Blocks[It-1]
318 DenseMap<Value *, Value *> PrevItValueMap;
319
320 for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
321 ValueToValueMapTy VMap;
322 BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
323 Header->getParent()->getBasicBlockList().push_back(New);
324
David Green2557e432018-07-12 10:44:47 +0000325 if (ForeBlocks.count(*BB)) {
David Green963401d2018-07-01 12:47:30 +0000326 L->addBasicBlockToLoop(New, *LI);
327
328 if (*BB == ForeBlocksFirst[0])
329 ForeBlocksFirst.push_back(New);
330 if (*BB == ForeBlocksLast[0])
331 ForeBlocksLast.push_back(New);
David Green2557e432018-07-12 10:44:47 +0000332 } else if (SubLoopBlocks.count(*BB)) {
David Green963401d2018-07-01 12:47:30 +0000333 SubLoop->addBasicBlockToLoop(New, *LI);
334
335 if (*BB == SubLoopBlocksFirst[0])
336 SubLoopBlocksFirst.push_back(New);
337 if (*BB == SubLoopBlocksLast[0])
338 SubLoopBlocksLast.push_back(New);
David Green2557e432018-07-12 10:44:47 +0000339 } else if (AftBlocks.count(*BB)) {
David Green963401d2018-07-01 12:47:30 +0000340 L->addBasicBlockToLoop(New, *LI);
341
342 if (*BB == AftBlocksFirst[0])
343 AftBlocksFirst.push_back(New);
344 if (*BB == AftBlocksLast[0])
345 AftBlocksLast.push_back(New);
346 } else {
347 llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
348 }
349
350 // Update our running maps of newest clones
351 PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
352 LastValueMap[*BB] = New;
353 for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
354 VI != VE; ++VI) {
355 PrevItValueMap[VI->second] =
356 const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
357 LastValueMap[VI->first] = VI->second;
358 }
359
360 NewBlocks.push_back(New);
361
362 // Update DomTree:
363 if (*BB == ForeBlocksFirst[0])
364 DT->addNewBlock(New, ForeBlocksLast[It - 1]);
365 else if (*BB == SubLoopBlocksFirst[0])
366 DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
367 else if (*BB == AftBlocksFirst[0])
368 DT->addNewBlock(New, AftBlocksLast[It - 1]);
369 else {
370 // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
371 // structure.
372 auto BBDomNode = DT->getNode(*BB);
373 auto BBIDom = BBDomNode->getIDom();
374 BasicBlock *OriginalBBIDom = BBIDom->getBlock();
375 assert(OriginalBBIDom);
376 assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
377 DT->addNewBlock(
378 New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
379 }
380 }
381
382 // Remap all instructions in the most recent iteration
383 for (BasicBlock *NewBlock : NewBlocks) {
384 for (Instruction &I : *NewBlock) {
385 ::remapInstruction(&I, LastValueMap);
386 if (auto *II = dyn_cast<IntrinsicInst>(&I))
387 if (II->getIntrinsicID() == Intrinsic::assume)
388 AC->registerAssumption(II);
389 }
390 }
391
392 // Alter the ForeBlocks phi's, pointing them at the latest version of the
393 // value from the previous iteration's phis
394 for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
395 Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
396 assert(OldValue && "should have incoming edge from Aft[It]");
397 Value *NewValue = OldValue;
398 if (Value *PrevValue = PrevItValueMap[OldValue])
399 NewValue = PrevValue;
400
401 assert(Phi.getNumOperands() == 2);
402 Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
403 Phi.setIncomingValue(0, NewValue);
404 Phi.removeIncomingValue(1);
405 }
406 }
407
408 // Now that all the basic blocks for the unrolled iterations are in place,
409 // finish up connecting the blocks and phi nodes. At this point LastValueMap
410 // is the last unrolled iterations values.
411
412 // Update Phis in BB from OldBB to point to NewBB
413 auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
414 BasicBlock *NewBB) {
415 for (PHINode &Phi : BB->phis()) {
416 int I = Phi.getBasicBlockIndex(OldBB);
417 Phi.setIncomingBlock(I, NewBB);
418 }
419 };
420 // Update Phis in BB from OldBB to point to NewBB and use the latest value
421 // from LastValueMap
422 auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
423 BasicBlock *NewBB,
424 ValueToValueMapTy &LastValueMap) {
425 for (PHINode &Phi : BB->phis()) {
426 for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
427 if (Phi.getIncomingBlock(b) == OldBB) {
428 Value *OldValue = Phi.getIncomingValue(b);
429 if (Value *LastValue = LastValueMap[OldValue])
430 Phi.setIncomingValue(b, LastValue);
431 Phi.setIncomingBlock(b, NewBB);
432 break;
433 }
434 }
435 }
436 };
437 // Move all the phis from Src into Dest
438 auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
439 Instruction *insertPoint = Dest->getFirstNonPHI();
440 while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
441 Phi->moveBefore(insertPoint);
442 };
443
444 // Update the PHI values outside the loop to point to the last block
445 updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
446 LastValueMap);
447
448 // Update ForeBlocks successors and phi nodes
449 BranchInst *ForeTerm =
450 cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
451 BasicBlock *Dest = SubLoopBlocksFirst[0];
452 ForeTerm->setSuccessor(0, Dest);
453
454 if (CompletelyUnroll) {
455 while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
456 Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
457 Phi->getParent()->getInstList().erase(Phi);
458 }
459 } else {
460 // Update the PHI values to point to the last aft block
461 updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
462 AftBlocksLast.back(), LastValueMap);
463 }
464
465 for (unsigned It = 1; It != Count; It++) {
466 // Remap ForeBlock successors from previous iteration to this
467 BranchInst *ForeTerm =
468 cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
469 BasicBlock *Dest = ForeBlocksFirst[It];
470 ForeTerm->setSuccessor(0, Dest);
471 }
472
473 // Subloop successors and phis
474 BranchInst *SubTerm =
475 cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
476 SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
477 SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
478 updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
479 ForeBlocksLast.back());
480 updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
481 SubLoopBlocksLast.back());
482
483 for (unsigned It = 1; It != Count; It++) {
484 // Replace the conditional branch of the previous iteration subloop with an
485 // unconditional one to this one
486 BranchInst *SubTerm =
487 cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
488 BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
489 SubTerm->eraseFromParent();
490
491 updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
492 ForeBlocksLast.back());
493 updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
494 SubLoopBlocksLast.back());
495 movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
496 }
497
498 // Aft blocks successors and phis
499 BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
500 if (CompletelyUnroll) {
501 BranchInst::Create(LoopExit, Term);
502 Term->eraseFromParent();
503 } else {
504 Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
505 }
506 updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
507 SubLoopBlocksLast.back());
508
509 for (unsigned It = 1; It != Count; It++) {
510 // Replace the conditional branch of the previous iteration subloop with an
511 // unconditional one to this one
512 BranchInst *AftTerm =
513 cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
514 BranchInst::Create(AftBlocksFirst[It], AftTerm);
515 AftTerm->eraseFromParent();
516
517 updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
518 SubLoopBlocksLast.back());
519 movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
520 }
521
Florian Hahnf9cdb982019-08-29 17:47:58 +0000522 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
David Green963401d2018-07-01 12:47:30 +0000523 // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
524 // new ones required.
525 if (Count != 1) {
526 SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
527 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
528 SubLoopBlocksFirst[0]);
529 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
530 SubLoopBlocksLast[0], AftBlocksFirst[0]);
531
532 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
533 ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
534 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
535 SubLoopBlocksLast.back(), AftBlocksFirst[0]);
Florian Hahnf9cdb982019-08-29 17:47:58 +0000536 DTU.applyUpdatesPermissive(DTUpdates);
David Green963401d2018-07-01 12:47:30 +0000537 }
538
539 // Merge adjacent basic blocks, if possible.
540 SmallPtrSet<BasicBlock *, 16> MergeBlocks;
541 MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
542 MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
543 MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
544 while (!MergeBlocks.empty()) {
545 BasicBlock *BB = *MergeBlocks.begin();
546 BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
547 if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
548 BasicBlock *Dest = Term->getSuccessor(0);
Alina Sbirleabfceed42019-06-04 18:45:15 +0000549 BasicBlock *Fold = Dest->getUniquePredecessor();
550 if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
David Green963401d2018-07-01 12:47:30 +0000551 // Don't remove BB and add Fold as they are the same BB
552 assert(Fold == BB);
553 (void)Fold;
554 MergeBlocks.erase(Dest);
555 } else
556 MergeBlocks.erase(BB);
557 } else
558 MergeBlocks.erase(BB);
559 }
Florian Hahnf9cdb982019-08-29 17:47:58 +0000560 // Apply updates to the DomTree.
561 DT = &DTU.getDomTree();
David Green963401d2018-07-01 12:47:30 +0000562
563 // At this point, the code is well formed. We now do a quick sweep over the
564 // inserted code, doing constant propagation and dead code elimination as we
565 // go.
566 simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
567 simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
568
569 NumCompletelyUnrolledAndJammed += CompletelyUnroll;
570 ++NumUnrolledAndJammed;
571
572#ifndef NDEBUG
573 // We shouldn't have done anything to break loop simplify form or LCSSA.
574 Loop *OuterL = L->getParentLoop();
575 Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
576 assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
577 if (!CompletelyUnroll)
578 assert(L->isLoopSimplifyForm());
579 assert(SubLoop->isLoopSimplifyForm());
580 assert(DT->verify());
581#endif
582
583 // Update LoopInfo if the loop is completely removed.
584 if (CompletelyUnroll)
585 LI->erase(L);
586
587 return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
588 : LoopUnrollResult::PartiallyUnrolled;
589}
590
David Green2557e432018-07-12 10:44:47 +0000591static bool getLoadsAndStores(BasicBlockSet &Blocks,
David Green963401d2018-07-01 12:47:30 +0000592 SmallVector<Value *, 4> &MemInstr) {
593 // Scan the BBs and collect legal loads and stores.
594 // Returns false if non-simple loads/stores are found.
595 for (BasicBlock *BB : Blocks) {
596 for (Instruction &I : *BB) {
597 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
598 if (!Ld->isSimple())
599 return false;
600 MemInstr.push_back(&I);
601 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
602 if (!St->isSimple())
603 return false;
604 MemInstr.push_back(&I);
605 } else if (I.mayReadOrWriteMemory()) {
606 return false;
607 }
608 }
609 }
610 return true;
611}
612
613static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
614 SmallVector<Value *, 4> &Later,
615 unsigned LoopDepth, bool InnerLoop,
616 DependenceInfo &DI) {
617 // Use DA to check for dependencies between loads and stores that make unroll
618 // and jam invalid
619 for (Value *I : Earlier) {
620 for (Value *J : Later) {
621 Instruction *Src = cast<Instruction>(I);
622 Instruction *Dst = cast<Instruction>(J);
623 if (Src == Dst)
624 continue;
625 // Ignore Input dependencies.
626 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
627 continue;
628
629 // Track dependencies, and if we find them take a conservative approach
630 // by allowing only = or < (not >), altough some > would be safe
631 // (depending upon unroll width).
632 // For the inner loop, we need to disallow any (> <) dependencies
633 // FIXME: Allow > so long as distance is less than unroll width
634 if (auto D = DI.depends(Src, Dst, true)) {
635 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
636
David Greenbc2e1c32018-08-02 07:30:53 +0000637 if (D->isConfused()) {
638 LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
639 << " " << *Src << "\n"
640 << " " << *Dst << "\n");
David Green963401d2018-07-01 12:47:30 +0000641 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000642 }
David Green963401d2018-07-01 12:47:30 +0000643 if (!InnerLoop) {
David Greenbc2e1c32018-08-02 07:30:53 +0000644 if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
645 LLVM_DEBUG(dbgs() << " > dependency between:\n"
646 << " " << *Src << "\n"
647 << " " << *Dst << "\n");
David Green963401d2018-07-01 12:47:30 +0000648 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000649 }
David Green963401d2018-07-01 12:47:30 +0000650 } else {
651 assert(LoopDepth + 1 <= D->getLevels());
652 if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
David Greenbc2e1c32018-08-02 07:30:53 +0000653 D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) {
654 LLVM_DEBUG(dbgs() << " < > dependency between:\n"
655 << " " << *Src << "\n"
656 << " " << *Dst << "\n");
David Green963401d2018-07-01 12:47:30 +0000657 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000658 }
David Green963401d2018-07-01 12:47:30 +0000659 }
660 }
661 }
662 }
663 return true;
664}
665
David Green2557e432018-07-12 10:44:47 +0000666static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
667 BasicBlockSet &SubLoopBlocks,
668 BasicBlockSet &AftBlocks, DependenceInfo &DI) {
David Green963401d2018-07-01 12:47:30 +0000669 // Get all loads/store pairs for each blocks
670 SmallVector<Value *, 4> ForeMemInstr;
671 SmallVector<Value *, 4> SubLoopMemInstr;
672 SmallVector<Value *, 4> AftMemInstr;
673 if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
674 !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
675 !getLoadsAndStores(AftBlocks, AftMemInstr))
676 return false;
677
678 // Check for dependencies between any blocks that may change order
679 unsigned LoopDepth = L->getLoopDepth();
680 return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
681 DI) &&
682 checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
683 checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
684 DI) &&
685 checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
686 DI);
687}
688
689bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
690 DependenceInfo &DI) {
691 /* We currently handle outer loops like this:
692 |
693 ForeFirst <----\ }
694 Blocks | } ForeBlocks
695 ForeLast | }
696 | |
697 SubLoopFirst <\ | }
698 Blocks | | } SubLoopBlocks
699 SubLoopLast -/ | }
700 | |
701 AftFirst | }
702 Blocks | } AftBlocks
703 AftLast ------/ }
704 |
705
706 There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
707 and AftBlocks, providing that there is one edge from Fores to SubLoops,
708 one edge from SubLoops to Afts and a single outer loop exit (from Afts).
709 In practice we currently limit Aft blocks to a single block, and limit
710 things further in the profitablility checks of the unroll and jam pass.
711
712 Because of the way we rearrange basic blocks, we also require that
713 the Fore blocks on all unrolled iterations are safe to move before the
714 SubLoop blocks of all iterations. So we require that the phi node looping
715 operands of ForeHeader can be moved to at least the end of ForeEnd, so that
716 we can arrange cloned Fore Blocks before the subloop and match up Phi's
717 correctly.
718
719 i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
720 It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
721
722 There are then a number of checks along the lines of no calls, no
723 exceptions, inner loop IV is consistent, etc. Note that for loops requiring
724 runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
725 UnrollAndJamLoop if the trip count cannot be easily calculated.
726 */
727
728 if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
729 return false;
730 Loop *SubLoop = L->getSubLoops()[0];
731 if (!SubLoop->isLoopSimplifyForm())
732 return false;
733
734 BasicBlock *Header = L->getHeader();
735 BasicBlock *Latch = L->getLoopLatch();
736 BasicBlock *Exit = L->getExitingBlock();
737 BasicBlock *SubLoopHeader = SubLoop->getHeader();
738 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
739 BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
740
741 if (Latch != Exit)
742 return false;
743 if (SubLoopLatch != SubLoopExit)
744 return false;
745
David Greenbc2e1c32018-08-02 07:30:53 +0000746 if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
747 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
David Green963401d2018-07-01 12:47:30 +0000748 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000749 }
David Green963401d2018-07-01 12:47:30 +0000750
751 // Split blocks into Fore/SubLoop/Aft based on dominators
David Green2557e432018-07-12 10:44:47 +0000752 BasicBlockSet SubLoopBlocks;
753 BasicBlockSet ForeBlocks;
754 BasicBlockSet AftBlocks;
David Green963401d2018-07-01 12:47:30 +0000755 if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
David Greenbc2e1c32018-08-02 07:30:53 +0000756 AftBlocks, &DT)) {
757 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
David Green963401d2018-07-01 12:47:30 +0000758 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000759 }
David Green963401d2018-07-01 12:47:30 +0000760
761 // Aft blocks may need to move instructions to fore blocks, which becomes more
762 // difficult if there are multiple (potentially conditionally executed)
763 // blocks. For now we just exclude loops with multiple aft blocks.
David Greenbc2e1c32018-08-02 07:30:53 +0000764 if (AftBlocks.size() != 1) {
765 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
766 "multiple blocks after the loop\n");
David Green963401d2018-07-01 12:47:30 +0000767 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000768 }
David Green963401d2018-07-01 12:47:30 +0000769
David Greenbc2e1c32018-08-02 07:30:53 +0000770 // Check inner loop backedge count is consistent on all iterations of the
771 // outer loop
David Green6cb64782018-08-15 10:59:41 +0000772 if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
David Greenbc2e1c32018-08-02 07:30:53 +0000773 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
774 "not consistent on each iteration\n");
David Green963401d2018-07-01 12:47:30 +0000775 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000776 }
David Green963401d2018-07-01 12:47:30 +0000777
778 // Check the loop safety info for exceptions.
Max Kazantsev9c90ec22018-10-16 08:31:05 +0000779 SimpleLoopSafetyInfo LSI;
Max Kazantsev530b8d12018-08-15 05:55:43 +0000780 LSI.computeLoopSafetyInfo(L);
781 if (LSI.anyBlockMayThrow()) {
David Greenbc2e1c32018-08-02 07:30:53 +0000782 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
David Green963401d2018-07-01 12:47:30 +0000783 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000784 }
David Green963401d2018-07-01 12:47:30 +0000785
786 // We've ruled out the easy stuff and now need to check that there are no
787 // interdependencies which may prevent us from moving the:
788 // ForeBlocks before Subloop and AftBlocks.
789 // Subloop before AftBlocks.
790 // ForeBlock phi operands before the subloop
791
792 // Make sure we can move all instructions we need to before the subloop
David Greeneda3c9e2018-07-26 15:19:07 +0000793 if (!processHeaderPhiOperands(
794 Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
795 if (SubLoop->contains(I->getParent()))
796 return false;
797 if (AftBlocks.count(I->getParent())) {
798 // If we hit a phi node in afts we know we are done (probably
799 // LCSSA)
800 if (isa<PHINode>(I))
801 return false;
802 // Can't move instructions with side effects or memory
803 // reads/writes
804 if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
805 return false;
806 }
807 // Keep going
808 return true;
David Greenbc2e1c32018-08-02 07:30:53 +0000809 })) {
810 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
811 "instructions after subloop to before it\n");
David Greeneda3c9e2018-07-26 15:19:07 +0000812 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000813 }
David Green963401d2018-07-01 12:47:30 +0000814
815 // Check for memory dependencies which prohibit the unrolling we are doing.
816 // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
817 // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
David Greenbc2e1c32018-08-02 07:30:53 +0000818 if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
819 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
David Green963401d2018-07-01 12:47:30 +0000820 return false;
David Greenbc2e1c32018-08-02 07:30:53 +0000821 }
David Green963401d2018-07-01 12:47:30 +0000822
823 return true;
824}