blob: 1dcfcda31a8ec316e6f09ada717b677f9513eb16 [file] [log] [blame]
Adam Nemet938d3d62015-05-14 12:05:18 +00001//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements the Loop Distribution Pass. Its main focus is to
11// distribute loops that cannot be vectorized due to dependence cycles. It
12// tries to isolate the offending dependences into a new loop allowing
13// vectorization of the remaining parts.
14//
15// For dependence analysis, the pass uses the LoopVectorizer's
16// LoopAccessAnalysis. Because this analysis presumes no change in the order of
17// memory operations, special care is taken to preserve the lexical order of
18// these operations.
19//
20// Similarly to the Vectorizer, the pass also supports loop versioning to
21// run-time disambiguate potentially overlapping arrays.
22//
23//===----------------------------------------------------------------------===//
24
25#include "llvm/ADT/DepthFirstIterator.h"
26#include "llvm/ADT/EquivalenceClasses.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/Analysis/LoopAccessAnalysis.h"
30#include "llvm/Analysis/LoopInfo.h"
31#include "llvm/IR/Dominators.h"
32#include "llvm/Pass.h"
33#include "llvm/Support/CommandLine.h"
34#include "llvm/Support/Debug.h"
35#include "llvm/Transforms/Utils/BasicBlockUtils.h"
36#include "llvm/Transforms/Utils/Cloning.h"
Ashutosh Nemac5b7b552015-08-19 05:40:42 +000037#include "llvm/Transforms/Utils/LoopUtils.h"
Adam Nemet215746b2015-07-10 18:55:13 +000038#include "llvm/Transforms/Utils/LoopVersioning.h"
Adam Nemet938d3d62015-05-14 12:05:18 +000039#include <list>
40
41#define LDIST_NAME "loop-distribute"
42#define DEBUG_TYPE LDIST_NAME
43
44using namespace llvm;
45
// Off by default; when set, the DominatorTree and LoopInfo analyses are
// re-verified after the pass runs to catch update bugs early.
static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

// Off by default: a partition whose stores are all conditional would not be
// vectorizable, so such partitions are normally merged back (see
// mergeNonIfConvertible) rather than distributed out.
static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

// Cap on SCEV predicate complexity before we give up on versioning the loop
// (default threshold: 8 checks).
static cl::opt<unsigned> DistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution"));

// A much higher SCEV-check budget (128) applies when the user explicitly
// requested distribution via pragma, since they have opted into the cost.
static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
    cl::Hidden,
    cl::desc(
        "The maximum number of SCEV checks allowed for Loop "
        "Distribution for loop marked with #pragma loop distribute(enable)"));

// Note that the initial value for this depends on whether the pass is invoked
// directly or from the optimization pipeline.
static cl::opt<bool> EnableLoopDistribute(
    "enable-loop-distribute", cl::Hidden,
    cl::desc("Enable the new, experimental LoopDistribution Pass"));

// Pass-level statistic: counts loops actually distributed (incremented at the
// end of a successful processLoop).
STATISTIC(NumLoopsDistributed, "Number of loops distributed");
77
Adam Nemet2f85b732015-05-14 12:33:32 +000078namespace {
Adam Nemet938d3d62015-05-14 12:05:18 +000079/// \brief Maintains the set of instructions of the loop for a partition before
80/// cloning. After cloning, it hosts the new loop.
81class InstPartition {
82 typedef SmallPtrSet<Instruction *, 8> InstructionSet;
83
84public:
85 InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
86 : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
87 Set.insert(I);
88 }
89
90 /// \brief Returns whether this partition contains a dependence cycle.
91 bool hasDepCycle() const { return DepCycle; }
92
93 /// \brief Adds an instruction to this partition.
94 void add(Instruction *I) { Set.insert(I); }
95
96 /// \brief Collection accessors.
97 InstructionSet::iterator begin() { return Set.begin(); }
98 InstructionSet::iterator end() { return Set.end(); }
99 InstructionSet::const_iterator begin() const { return Set.begin(); }
100 InstructionSet::const_iterator end() const { return Set.end(); }
101 bool empty() const { return Set.empty(); }
102
103 /// \brief Moves this partition into \p Other. This partition becomes empty
104 /// after this.
105 void moveTo(InstPartition &Other) {
106 Other.Set.insert(Set.begin(), Set.end());
107 Set.clear();
108 Other.DepCycle |= DepCycle;
109 }
110
111 /// \brief Populates the partition with a transitive closure of all the
112 /// instructions that the seeded instructions dependent on.
113 void populateUsedSet() {
114 // FIXME: We currently don't use control-dependence but simply include all
115 // blocks (possibly empty at the end) and let simplifycfg mostly clean this
116 // up.
117 for (auto *B : OrigLoop->getBlocks())
118 Set.insert(B->getTerminator());
119
120 // Follow the use-def chains to form a transitive closure of all the
121 // instructions that the originally seeded instructions depend on.
122 SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
123 while (!Worklist.empty()) {
124 Instruction *I = Worklist.pop_back_val();
125 // Insert instructions from the loop that we depend on.
126 for (Value *V : I->operand_values()) {
127 auto *I = dyn_cast<Instruction>(V);
128 if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
129 Worklist.push_back(I);
130 }
131 }
132 }
133
134 /// \brief Clones the original loop.
135 ///
136 /// Updates LoopInfo and DominatorTree using the information that block \p
137 /// LoopDomBB dominates the loop.
138 Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
139 unsigned Index, LoopInfo *LI,
140 DominatorTree *DT) {
141 ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
142 VMap, Twine(".ldist") + Twine(Index),
143 LI, DT, ClonedLoopBlocks);
144 return ClonedLoop;
145 }
146
147 /// \brief The cloned loop. If this partition is mapped to the original loop,
148 /// this is null.
149 const Loop *getClonedLoop() const { return ClonedLoop; }
150
151 /// \brief Returns the loop where this partition ends up after distribution.
152 /// If this partition is mapped to the original loop then use the block from
153 /// the loop.
154 const Loop *getDistributedLoop() const {
155 return ClonedLoop ? ClonedLoop : OrigLoop;
156 }
157
158 /// \brief The VMap that is populated by cloning and then used in
159 /// remapinstruction to remap the cloned instructions.
160 ValueToValueMapTy &getVMap() { return VMap; }
161
162 /// \brief Remaps the cloned instructions using VMap.
Adam Nemet1a689182015-07-10 18:55:09 +0000163 void remapInstructions() {
164 remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
165 }
Adam Nemet938d3d62015-05-14 12:05:18 +0000166
167 /// \brief Based on the set of instructions selected for this partition,
168 /// removes the unnecessary ones.
169 void removeUnusedInsts() {
170 SmallVector<Instruction *, 8> Unused;
171
172 for (auto *Block : OrigLoop->getBlocks())
173 for (auto &Inst : *Block)
174 if (!Set.count(&Inst)) {
175 Instruction *NewInst = &Inst;
176 if (!VMap.empty())
177 NewInst = cast<Instruction>(VMap[NewInst]);
178
179 assert(!isa<BranchInst>(NewInst) &&
180 "Branches are marked used early on");
181 Unused.push_back(NewInst);
182 }
183
184 // Delete the instructions backwards, as it has a reduced likelihood of
185 // having to update as many def-use and use-def chains.
Pete Cooper7679afd2015-07-24 21:13:43 +0000186 for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) {
Adam Nemet938d3d62015-05-14 12:05:18 +0000187 if (!Inst->use_empty())
188 Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
189 Inst->eraseFromParent();
190 }
191 }
192
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000193 void print() const {
Adam Nemet938d3d62015-05-14 12:05:18 +0000194 if (DepCycle)
195 dbgs() << " (cycle)\n";
196 for (auto *I : Set)
197 // Prefix with the block name.
198 dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n";
199 }
200
201 void printBlocks() const {
202 for (auto *BB : getDistributedLoop()->getBlocks())
203 dbgs() << *BB;
204 }
205
206private:
207 /// \brief Instructions from OrigLoop selected for this partition.
208 InstructionSet Set;
209
210 /// \brief Whether this partition contains a dependence cycle.
211 bool DepCycle;
212
213 /// \brief The original loop.
214 Loop *OrigLoop;
215
216 /// \brief The cloned loop. If this partition is mapped to the original loop,
217 /// this is null.
218 Loop *ClonedLoop;
219
220 /// \brief The blocks of ClonedLoop including the preheader. If this
221 /// partition is mapped to the original loop, this is empty.
222 SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
223
224 /// \brief These gets populated once the set of instructions have been
225 /// finalized. If this partition is mapped to the original loop, these are not
226 /// set.
227 ValueToValueMapTy VMap;
228};
229
/// \brief Holds the set of Partitions.  It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
  typedef DenseMap<Instruction *, int> InstToPartitionIdT;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// \brief Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }

  /// \brief Adds \p Inst into the current partition if that is marked to
  /// contain cycles.  Otherwise start a new partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic.  Start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }

  /// \brief Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  // Initially we isolate memory instructions into as many partitions as
  // possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }

  /// \brief Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition.  We could later allow more distribution among these partition
  /// too.
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }

  /// \brief If a partition contains only conditional stores, we won't vectorize
  /// it.  Try to merge it with a previous cyclic partition.
  void mergeNonIfConvertible() {
    // The predicate returns true for partitions that should be merged with a
    // neighboring match: cyclic ones, and ones whose every store sits in a
    // block needing predication (i.e. all stores are conditional).
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool seenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          seenStore = true;
          // An unconditional store means the partition is fine as-is.
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return seenStore;
    });
  }

  /// \brief Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }

  /// \brief Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information with the context that
  /// the order of memory operation is preserved.
  ///
  /// Return if any partitions were merged.
  bool mergeToAvoidDuplicatedLoads() {
    typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
    typedef EquivalenceClasses<InstPartition *> ToBeMergedT;

    // Maps each load to the first partition it was seen in.
    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence between partitions
    // that contain the same load.  Also put partitions in between them in the
    // same equivalence class to avoid reordering of memory operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            DEBUG(dbgs() << "Merging partitions due to this load in multiple "
                         << "partitions: " << PartI << ", "
                         << LoadToPart->second << "\n" << *Inst << "\n");

            // Walk backwards from the current partition down to (and
            // including) the partition that first saw this load, unioning
            // every partition in between into PartI's class.
            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the member of an equivalence class into its class leader.  This
    // makes the members empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                   ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }

  /// \brief Sets up the mapping between instructions to partitions.  If the
  /// instruction is duplicated across multiple partitions, set the entry to -1.
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        // Already seen in an earlier partition => duplicated => -1.
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// \brief Populates the partition with everything that the seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }

  /// \brief This performs the main chunk of the work of cloning the loops for
  /// the partitions.
  void cloneLoops() {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(!PartitionContainer.empty() && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Create a loop for each partition except the last.  Clone the original
    // loop before PH along with adding a preheader for the cloned loop.  Then
    // update PH to point to the newly added preheader.
    // (The iteration starts at std::next(rbegin()), so the last partition in
    // program order keeps the original loop.)
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto I = std::next(PartitionContainer.rbegin()),
              E = PartitionContainer.rend();
         I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
      auto *Part = &*I;

      NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      // Redirect the clone's exit to the next loop's preheader.
      Part->getVMap()[ExitBlock] = TopPH;
      Part->remapInstructions();
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop.  Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }

  /// \brief Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }

  /// \brief For each memory pointer, it computes the partitionId the pointer is
  /// used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to I-th
  /// entry in LAI.getRuntimePointerCheck().  If the pointer is used in multiple
  /// partitions its entry is set to -1.
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != (int)ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "Partition " << Index++ << " (" << &P << "):\n";
      P.print();
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks() const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
      P.printBlocks();
    }
  }

private:
  // std::list is used so that InstPartition* stays stable across erase() in
  // mergeAdjacentPartitionsIf and across mergeToAvoidDuplicatedLoads.
  typedef std::list<InstPartition> PartitionContainerT;

  /// \brief List of partitions.
  PartitionContainerT PartitionContainer;

  /// \brief Mapping from Instruction to partition Id.  If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  // The loop being distributed and the analyses passed in at construction.
  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// \brief The control structure to merge adjacent partitions if both satisfy
  /// the \p Predicate.
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    // PrevMatch tracks the most recent matching partition; consecutive
    // matches are folded into it and erased from the list.
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }
};
542
543/// \brief For each memory instruction, this class maintains difference of the
544/// number of unsafe dependences that start out from this instruction minus
545/// those that end here.
546///
547/// By traversing the memory instructions in program order and accumulating this
548/// number, we know whether any unsafe dependence crosses over a program point.
549class MemoryInstructionDependences {
550 typedef MemoryDepChecker::Dependence Dependence;
551
552public:
553 struct Entry {
554 Instruction *Inst;
555 unsigned NumUnsafeDependencesStartOrEnd;
556
557 Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
558 };
559
560 typedef SmallVector<Entry, 8> AccessesType;
561
562 AccessesType::const_iterator begin() const { return Accesses.begin(); }
563 AccessesType::const_iterator end() const { return Accesses.end(); }
564
565 MemoryInstructionDependences(
566 const SmallVectorImpl<Instruction *> &Instructions,
Adam Nemeta2df7502015-11-03 21:39:52 +0000567 const SmallVectorImpl<Dependence> &Dependences) {
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000568 Accesses.append(Instructions.begin(), Instructions.end());
Adam Nemet938d3d62015-05-14 12:05:18 +0000569
570 DEBUG(dbgs() << "Backward dependences:\n");
Adam Nemeta2df7502015-11-03 21:39:52 +0000571 for (auto &Dep : Dependences)
Adam Nemet938d3d62015-05-14 12:05:18 +0000572 if (Dep.isPossiblyBackward()) {
573 // Note that the designations source and destination follow the program
574 // order, i.e. source is always first. (The direction is given by the
575 // DepType.)
576 ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
577 --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
578
579 DEBUG(Dep.print(dbgs(), 2, Instructions));
580 }
581 }
582
583private:
584 AccessesType Accesses;
585};
586
Adam Nemet61399ac2016-04-27 00:31:03 +0000587/// \brief The actual class performing the per-loop work.
588class LoopDistributeForLoop {
Adam Nemet938d3d62015-05-14 12:05:18 +0000589public:
Adam Nemet61399ac2016-04-27 00:31:03 +0000590 LoopDistributeForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
591 DominatorTree *DT, ScalarEvolution *SE)
Adam Nemetd2fa4142016-04-27 05:28:18 +0000592 : L(L), LI(LI), LAI(LAI), DT(DT), SE(SE) {
593 setForced();
594 }
Adam Nemetc75ad692015-07-30 03:29:16 +0000595
Adam Nemet938d3d62015-05-14 12:05:18 +0000596 /// \brief Try to distribute an inner-most loop.
Adam Nemet61399ac2016-04-27 00:31:03 +0000597 bool processLoop() {
Adam Nemet938d3d62015-05-14 12:05:18 +0000598 assert(L->empty() && "Only process inner loops.");
599
600 DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
601 << "\" checking " << *L << "\n");
602
603 BasicBlock *PH = L->getLoopPreheader();
Adam Nemet7f38e112016-04-28 23:08:27 +0000604 if (!PH)
Adam Nemetadeccf72016-04-28 23:08:30 +0000605 return fail("no preheader");
Adam Nemet7f38e112016-04-28 23:08:27 +0000606 if (!L->getExitBlock())
Adam Nemetadeccf72016-04-28 23:08:30 +0000607 return fail("multiple exit blocks");
Adam Nemet938d3d62015-05-14 12:05:18 +0000608 // LAA will check that we only have a single exiting block.
609
Adam Nemet938d3d62015-05-14 12:05:18 +0000610 // Currently, we only distribute to isolate the part of the loop with
611 // dependence cycles to enable partial vectorization.
Adam Nemet7f38e112016-04-28 23:08:27 +0000612 if (LAI.canVectorizeMemory())
Adam Nemetadeccf72016-04-28 23:08:30 +0000613 return fail("memory operations are safe for vectorization");
Adam Nemet7f38e112016-04-28 23:08:27 +0000614
Adam Nemeta2df7502015-11-03 21:39:52 +0000615 auto *Dependences = LAI.getDepChecker().getDependences();
Adam Nemet7f38e112016-04-28 23:08:27 +0000616 if (!Dependences || Dependences->empty())
Adam Nemetadeccf72016-04-28 23:08:30 +0000617 return fail("no unsafe dependences to isolate");
Adam Nemet938d3d62015-05-14 12:05:18 +0000618
619 InstPartitionContainer Partitions(L, LI, DT);
620
621 // First, go through each memory operation and assign them to consecutive
622 // partitions (the order of partitions follows program order). Put those
623 // with unsafe dependences into "cyclic" partition otherwise put each store
624 // in its own "non-cyclic" partition (we'll merge these later).
625 //
626 // Note that a memory operation (e.g. Load2 below) at a program point that
627 // has an unsafe dependence (Store3->Load1) spanning over it must be
628 // included in the same cyclic partition as the dependent operations. This
629 // is to preserve the original program order after distribution. E.g.:
630 //
631 // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive
632 // Load1 -. 1 0->1
633 // Load2 | /Unsafe/ 0 1
634 // Store3 -' -1 1->0
635 // Load4 0 0
636 //
637 // NumUnsafeDependencesActive > 0 indicates this situation and in this case
638 // we just keep assigning to the same cyclic partition until
639 // NumUnsafeDependencesActive reaches 0.
640 const MemoryDepChecker &DepChecker = LAI.getDepChecker();
641 MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
Adam Nemeta2df7502015-11-03 21:39:52 +0000642 *Dependences);
Adam Nemet938d3d62015-05-14 12:05:18 +0000643
644 int NumUnsafeDependencesActive = 0;
645 for (auto &InstDep : MID) {
646 Instruction *I = InstDep.Inst;
647 // We update NumUnsafeDependencesActive post-instruction, catch the
648 // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
649 if (NumUnsafeDependencesActive ||
650 InstDep.NumUnsafeDependencesStartOrEnd > 0)
651 Partitions.addToCyclicPartition(I);
652 else
653 Partitions.addToNewNonCyclicPartition(I);
654 NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
655 assert(NumUnsafeDependencesActive >= 0 &&
656 "Negative number of dependences active");
657 }
658
659 // Add partitions for values used outside. These partitions can be out of
660 // order from the original program order. This is OK because if the
661 // partition uses a load we will merge this partition with the original
662 // partition of the load that we set up in the previous loop (see
663 // mergeToAvoidDuplicatedLoads).
664 auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
665 for (auto *Inst : DefsUsedOutside)
666 Partitions.addToNewNonCyclicPartition(Inst);
667
668 DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
669 if (Partitions.getSize() < 2)
Adam Nemet7f38e112016-04-28 23:08:27 +0000670 return fail("cannot isolate unsafe dependencies");
Adam Nemet938d3d62015-05-14 12:05:18 +0000671
672 // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
673 // should be able to vectorize these together.
674 Partitions.mergeBeforePopulating();
675 DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
676 if (Partitions.getSize() < 2)
Adam Nemet7f38e112016-04-28 23:08:27 +0000677 return fail("cannot isolate unsafe dependencies");
Adam Nemet938d3d62015-05-14 12:05:18 +0000678
679 // Now, populate the partitions with non-memory operations.
680 Partitions.populateUsedSet();
681 DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
682
683 // In order to preserve original lexical order for loads, keep them in the
684 // partition that we set up in the MemoryInstructionDependences loop.
685 if (Partitions.mergeToAvoidDuplicatedLoads()) {
686 DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
687 << Partitions);
688 if (Partitions.getSize() < 2)
Adam Nemet7f38e112016-04-28 23:08:27 +0000689 return fail("cannot isolate unsafe dependencies");
Adam Nemet938d3d62015-05-14 12:05:18 +0000690 }
691
Silviu Baranga2910a4f2015-11-09 13:26:09 +0000692 // Don't distribute the loop if we need too many SCEV run-time checks.
Silviu Baranga9cd9a7e2015-12-09 16:06:28 +0000693 const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate();
Adam Nemetd2fa4142016-04-27 05:28:18 +0000694 if (Pred.getComplexity() > (IsForced.getValueOr(false)
695 ? PragmaDistributeSCEVCheckThreshold
Adam Nemet7f38e112016-04-28 23:08:27 +0000696 : DistributeSCEVCheckThreshold))
Adam Nemetadeccf72016-04-28 23:08:30 +0000697 return fail("too many SCEV run-time checks needed.\n");
Silviu Baranga2910a4f2015-11-09 13:26:09 +0000698
Adam Nemet938d3d62015-05-14 12:05:18 +0000699 DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
700 // We're done forming the partitions set up the reverse mapping from
701 // instructions to partitions.
702 Partitions.setupPartitionIdOnInstructions();
703
704 // To keep things simple have an empty preheader before we version or clone
705 // the loop. (Also split if this has no predecessor, i.e. entry, because we
706 // rely on PH having a predecessor.)
707 if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
708 SplitBlock(PH, PH->getTerminator(), DT, LI);
709
Silviu Baranga2910a4f2015-11-09 13:26:09 +0000710 // If we need run-time checks, version the loop now.
Adam Nemet772a1502015-06-19 19:32:41 +0000711 auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI);
Adam Nemetc75ad692015-07-30 03:29:16 +0000712 const auto *RtPtrChecking = LAI.getRuntimePointerChecking();
Adam Nemet15840392015-08-07 22:44:15 +0000713 const auto &AllChecks = RtPtrChecking->getChecks();
Adam Nemetc75ad692015-07-30 03:29:16 +0000714 auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
715 RtPtrChecking);
Silviu Baranga2910a4f2015-11-09 13:26:09 +0000716
717 if (!Pred.isAlwaysTrue() || !Checks.empty()) {
Adam Nemet772a1502015-06-19 19:32:41 +0000718 DEBUG(dbgs() << "\nPointers:\n");
Adam Nemet0a674402015-07-28 05:01:53 +0000719 DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
Silviu Baranga2910a4f2015-11-09 13:26:09 +0000720 LoopVersioning LVer(LAI, L, LI, DT, SE, false);
721 LVer.setAliasChecks(std::move(Checks));
Silviu Baranga9cd9a7e2015-12-09 16:06:28 +0000722 LVer.setSCEVChecks(LAI.PSE.getUnionPredicate());
Adam Nemete4813402015-08-20 17:22:29 +0000723 LVer.versionLoop(DefsUsedOutside);
Adam Nemet5eccf072016-03-17 20:32:32 +0000724 LVer.annotateLoopWithNoAlias();
Adam Nemet938d3d62015-05-14 12:05:18 +0000725 }
726
727 // Create identical copies of the original loop for each partition and hook
728 // them up sequentially.
Justin Bogner843fb202015-12-15 19:40:57 +0000729 Partitions.cloneLoops();
Adam Nemet938d3d62015-05-14 12:05:18 +0000730
731 // Now, we remove the instruction from each loop that don't belong to that
732 // partition.
733 Partitions.removeUnusedInsts();
734 DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
735 DEBUG(Partitions.printBlocks());
736
737 if (LDistVerify) {
738 LI->verify();
739 DT->verifyDomTree();
740 }
741
742 ++NumLoopsDistributed;
743 return true;
744 }
745
Adam Nemet7f38e112016-04-28 23:08:27 +0000746 /// \brief Provide diagnostics then \return with false.
747 bool fail(llvm::StringRef Message) {
Adam Nemetadeccf72016-04-28 23:08:30 +0000748 DEBUG(dbgs() << "Skipping; " << Message << "\n");
Adam Nemet7f38e112016-04-28 23:08:27 +0000749 return false;
750 }
751
Adam Nemetd2fa4142016-04-27 05:28:18 +0000752 /// \brief Return if distribution forced to be enabled/disabled for the loop.
753 ///
754 /// If the optional has a value, it indicates whether distribution was forced
755 /// to be enabled (true) or disabled (false). If the optional has no value
756 /// distribution was not forced either way.
757 const Optional<bool> &isForced() const { return IsForced; }
758
Adam Nemet61399ac2016-04-27 00:31:03 +0000759private:
760 /// \brief Filter out checks between pointers from the same partition.
761 ///
762 /// \p PtrToPartition contains the partition number for pointers. Partition
763 /// number -1 means that the pointer is used in multiple partitions. In this
764 /// case we can't safely omit the check.
765 SmallVector<RuntimePointerChecking::PointerCheck, 4>
766 includeOnlyCrossPartitionChecks(
767 const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
768 const SmallVectorImpl<int> &PtrToPartition,
769 const RuntimePointerChecking *RtPtrChecking) {
770 SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
771
772 std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
773 [&](const RuntimePointerChecking::PointerCheck &Check) {
774 for (unsigned PtrIdx1 : Check.first->Members)
775 for (unsigned PtrIdx2 : Check.second->Members)
776 // Only include this check if there is a pair of pointers
777 // that require checking and the pointers fall into
778 // separate partitions.
779 //
780 // (Note that we already know at this point that the two
781 // pointer groups need checking but it doesn't follow
782 // that each pair of pointers within the two groups need
783 // checking as well.
784 //
785 // In other words we don't want to include a check just
786 // because there is a pair of pointers between the two
787 // pointer groups that require checks and a different
788 // pair whose pointers fall into different partitions.)
789 if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
790 !RuntimePointerChecking::arePointersInSamePartition(
791 PtrToPartition, PtrIdx1, PtrIdx2))
792 return true;
793 return false;
794 });
795
796 return Checks;
797 }
798
Adam Nemetd2fa4142016-04-27 05:28:18 +0000799 /// \brief Check whether the loop metadata is forcing distribution to be
800 /// enabled/disabled.
801 void setForced() {
802 Optional<const MDOperand *> Value =
803 findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
804 if (!Value)
805 return;
806
807 const MDOperand *Op = *Value;
808 assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
809 IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
810 }
811
Adam Nemet938d3d62015-05-14 12:05:18 +0000812 // Analyses used.
Adam Nemet61399ac2016-04-27 00:31:03 +0000813 Loop *L;
Adam Nemet938d3d62015-05-14 12:05:18 +0000814 LoopInfo *LI;
Adam Nemet61399ac2016-04-27 00:31:03 +0000815 const LoopAccessInfo &LAI;
Adam Nemet938d3d62015-05-14 12:05:18 +0000816 DominatorTree *DT;
Silviu Baranga2910a4f2015-11-09 13:26:09 +0000817 ScalarEvolution *SE;
Adam Nemetd2fa4142016-04-27 05:28:18 +0000818
819 /// \brief Indicates whether distribution is forced to be enabled/disabled for
820 /// the loop.
821 ///
822 /// If the optional has a value, it indicates whether distribution was forced
823 /// to be enabled (true) or disabled (false). If the optional has no value
824 /// distribution was not forced either way.
825 Optional<bool> IsForced;
Adam Nemet938d3d62015-05-14 12:05:18 +0000826};
Adam Nemet61399ac2016-04-27 00:31:03 +0000827
828/// \brief The pass class.
829class LoopDistribute : public FunctionPass {
830public:
Adam Nemetd2fa4142016-04-27 05:28:18 +0000831 /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be
832 /// performed by default. Pass -enable-loop-distribute={0,1} overrides this
833 /// default. We use this to keep LoopDistribution off by default when invoked
834 /// from the optimization pipeline but on when invoked explicitly from opt.
835 LoopDistribute(bool ProcessAllLoopsByDefault = true)
836 : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) {
837 // The default is set by the caller.
838 if (EnableLoopDistribute.getNumOccurrences() > 0)
839 ProcessAllLoops = EnableLoopDistribute;
Adam Nemet61399ac2016-04-27 00:31:03 +0000840 initializeLoopDistributePass(*PassRegistry::getPassRegistry());
841 }
842
843 bool runOnFunction(Function &F) override {
844 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
845 auto *LAA = &getAnalysis<LoopAccessAnalysis>();
846 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
847 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
848
849 // Build up a worklist of inner-loops to vectorize. This is necessary as the
850 // act of distributing a loop creates new loops and can invalidate iterators
851 // across the loops.
852 SmallVector<Loop *, 8> Worklist;
853
854 for (Loop *TopLevelLoop : *LI)
855 for (Loop *L : depth_first(TopLevelLoop))
856 // We only handle inner-most loops.
857 if (L->empty())
858 Worklist.push_back(L);
859
860 // Now walk the identified inner loops.
861 bool Changed = false;
862 for (Loop *L : Worklist) {
863 const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
864 LoopDistributeForLoop LDL(L, LI, LAI, DT, SE);
Adam Nemetd2fa4142016-04-27 05:28:18 +0000865
866 // If distribution was forced for the specific loop to be
867 // enabled/disabled, follow that. Otherwise use the global flag.
868 if (LDL.isForced().getValueOr(ProcessAllLoops))
869 Changed |= LDL.processLoop();
Adam Nemet61399ac2016-04-27 00:31:03 +0000870 }
871
872 // Process each loop nest in the function.
873 return Changed;
874 }
875
876 void getAnalysisUsage(AnalysisUsage &AU) const override {
877 AU.addRequired<ScalarEvolutionWrapperPass>();
878 AU.addRequired<LoopInfoWrapperPass>();
879 AU.addPreserved<LoopInfoWrapperPass>();
880 AU.addRequired<LoopAccessAnalysis>();
881 AU.addRequired<DominatorTreeWrapperPass>();
882 AU.addPreserved<DominatorTreeWrapperPass>();
883 }
884
885 static char ID;
Adam Nemetd2fa4142016-04-27 05:28:18 +0000886
887private:
888 /// \brief Whether distribution should be on in this function. The per-loop
889 /// pragma can override this.
890 bool ProcessAllLoops;
Adam Nemet61399ac2016-04-27 00:31:03 +0000891};
Adam Nemet938d3d62015-05-14 12:05:18 +0000892} // anonymous namespace
893
894char LoopDistribute::ID;
895static const char ldist_name[] = "Loop Distribition";
896
897INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
898INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
899INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
900INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
Silviu Baranga2910a4f2015-11-09 13:26:09 +0000901INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
Adam Nemet938d3d62015-05-14 12:05:18 +0000902INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)
903
namespace llvm {
/// \brief Factory for the loop distribution pass (legacy pass manager).
///
/// \p ProcessAllLoopsByDefault controls whether distribution is attempted on
/// every loop by default; the -enable-loop-distribute flag and the per-loop
/// "llvm.loop.distribute.enable" metadata can override it.
FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) {
  return new LoopDistribute(ProcessAllLoopsByDefault);
}
}