blob: 8bafb2db3db485c35c8793d13c8f8fc88d46b0d8 [file] [log] [blame]
Adam Nemet938d3d62015-05-14 12:05:18 +00001//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements the Loop Distribution Pass. Its main focus is to
11// distribute loops that cannot be vectorized due to dependence cycles. It
12// tries to isolate the offending dependences into a new loop allowing
13// vectorization of the remaining parts.
14//
15// For dependence analysis, the pass uses the LoopVectorizer's
16// LoopAccessAnalysis. Because this analysis presumes no change in the order of
17// memory operations, special care is taken to preserve the lexical order of
18// these operations.
19//
20// Similarly to the Vectorizer, the pass also supports loop versioning to
21// run-time disambiguate potentially overlapping arrays.
22//
23//===----------------------------------------------------------------------===//
24
25#include "llvm/ADT/DepthFirstIterator.h"
26#include "llvm/ADT/EquivalenceClasses.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/Analysis/LoopAccessAnalysis.h"
30#include "llvm/Analysis/LoopInfo.h"
31#include "llvm/IR/Dominators.h"
32#include "llvm/Pass.h"
33#include "llvm/Support/CommandLine.h"
34#include "llvm/Support/Debug.h"
35#include "llvm/Transforms/Utils/BasicBlockUtils.h"
36#include "llvm/Transforms/Utils/Cloning.h"
Ashutosh Nemac5b7b552015-08-19 05:40:42 +000037#include "llvm/Transforms/Utils/LoopUtils.h"
Adam Nemet215746b2015-07-10 18:55:13 +000038#include "llvm/Transforms/Utils/LoopVersioning.h"
Adam Nemet938d3d62015-05-14 12:05:18 +000039#include <list>
40
41#define LDIST_NAME "loop-distribute"
42#define DEBUG_TYPE LDIST_NAME
43
44using namespace llvm;
45
46static cl::opt<bool>
47 LDistVerify("loop-distribute-verify", cl::Hidden,
48 cl::desc("Turn on DominatorTree and LoopInfo verification "
49 "after Loop Distribution"),
50 cl::init(false));
51
52static cl::opt<bool> DistributeNonIfConvertible(
53 "loop-distribute-non-if-convertible", cl::Hidden,
54 cl::desc("Whether to distribute into a loop that may not be "
55 "if-convertible by the loop vectorizer"),
56 cl::init(false));
57
Silviu Baranga2910a4f2015-11-09 13:26:09 +000058static cl::opt<unsigned> DistributeSCEVCheckThreshold(
59 "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
60 cl::desc("The maximum number of SCEV checks allowed for Loop "
61 "Distribution"));
62
Adam Nemetd2fa4142016-04-27 05:28:18 +000063static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
64 "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
65 cl::Hidden,
66 cl::desc(
67 "The maximum number of SCEV checks allowed for Loop "
68 "Distribution for loop marked with #pragma loop distribute(enable)"));
69
70// Note that the initial value for this depends on whether the pass is invoked
71// directly or from the optimization pipeline.
72static cl::opt<bool> EnableLoopDistribute(
73 "enable-loop-distribute", cl::Hidden,
74 cl::desc("Enable the new, experimental LoopDistribution Pass"));
75
Adam Nemet938d3d62015-05-14 12:05:18 +000076STATISTIC(NumLoopsDistributed, "Number of loops distributed");
77
Adam Nemet2f85b732015-05-14 12:33:32 +000078namespace {
Adam Nemet938d3d62015-05-14 12:05:18 +000079/// \brief Maintains the set of instructions of the loop for a partition before
80/// cloning. After cloning, it hosts the new loop.
81class InstPartition {
82 typedef SmallPtrSet<Instruction *, 8> InstructionSet;
83
84public:
85 InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
86 : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
87 Set.insert(I);
88 }
89
90 /// \brief Returns whether this partition contains a dependence cycle.
91 bool hasDepCycle() const { return DepCycle; }
92
93 /// \brief Adds an instruction to this partition.
94 void add(Instruction *I) { Set.insert(I); }
95
96 /// \brief Collection accessors.
97 InstructionSet::iterator begin() { return Set.begin(); }
98 InstructionSet::iterator end() { return Set.end(); }
99 InstructionSet::const_iterator begin() const { return Set.begin(); }
100 InstructionSet::const_iterator end() const { return Set.end(); }
101 bool empty() const { return Set.empty(); }
102
103 /// \brief Moves this partition into \p Other. This partition becomes empty
104 /// after this.
105 void moveTo(InstPartition &Other) {
106 Other.Set.insert(Set.begin(), Set.end());
107 Set.clear();
108 Other.DepCycle |= DepCycle;
109 }
110
111 /// \brief Populates the partition with a transitive closure of all the
112 /// instructions that the seeded instructions dependent on.
113 void populateUsedSet() {
114 // FIXME: We currently don't use control-dependence but simply include all
115 // blocks (possibly empty at the end) and let simplifycfg mostly clean this
116 // up.
117 for (auto *B : OrigLoop->getBlocks())
118 Set.insert(B->getTerminator());
119
120 // Follow the use-def chains to form a transitive closure of all the
121 // instructions that the originally seeded instructions depend on.
122 SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
123 while (!Worklist.empty()) {
124 Instruction *I = Worklist.pop_back_val();
125 // Insert instructions from the loop that we depend on.
126 for (Value *V : I->operand_values()) {
127 auto *I = dyn_cast<Instruction>(V);
128 if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
129 Worklist.push_back(I);
130 }
131 }
132 }
133
134 /// \brief Clones the original loop.
135 ///
136 /// Updates LoopInfo and DominatorTree using the information that block \p
137 /// LoopDomBB dominates the loop.
138 Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
139 unsigned Index, LoopInfo *LI,
140 DominatorTree *DT) {
141 ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
142 VMap, Twine(".ldist") + Twine(Index),
143 LI, DT, ClonedLoopBlocks);
144 return ClonedLoop;
145 }
146
147 /// \brief The cloned loop. If this partition is mapped to the original loop,
148 /// this is null.
149 const Loop *getClonedLoop() const { return ClonedLoop; }
150
151 /// \brief Returns the loop where this partition ends up after distribution.
152 /// If this partition is mapped to the original loop then use the block from
153 /// the loop.
154 const Loop *getDistributedLoop() const {
155 return ClonedLoop ? ClonedLoop : OrigLoop;
156 }
157
158 /// \brief The VMap that is populated by cloning and then used in
159 /// remapinstruction to remap the cloned instructions.
160 ValueToValueMapTy &getVMap() { return VMap; }
161
162 /// \brief Remaps the cloned instructions using VMap.
Adam Nemet1a689182015-07-10 18:55:09 +0000163 void remapInstructions() {
164 remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
165 }
Adam Nemet938d3d62015-05-14 12:05:18 +0000166
167 /// \brief Based on the set of instructions selected for this partition,
168 /// removes the unnecessary ones.
169 void removeUnusedInsts() {
170 SmallVector<Instruction *, 8> Unused;
171
172 for (auto *Block : OrigLoop->getBlocks())
173 for (auto &Inst : *Block)
174 if (!Set.count(&Inst)) {
175 Instruction *NewInst = &Inst;
176 if (!VMap.empty())
177 NewInst = cast<Instruction>(VMap[NewInst]);
178
179 assert(!isa<BranchInst>(NewInst) &&
180 "Branches are marked used early on");
181 Unused.push_back(NewInst);
182 }
183
184 // Delete the instructions backwards, as it has a reduced likelihood of
185 // having to update as many def-use and use-def chains.
Pete Cooper7679afd2015-07-24 21:13:43 +0000186 for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) {
Adam Nemet938d3d62015-05-14 12:05:18 +0000187 if (!Inst->use_empty())
188 Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
189 Inst->eraseFromParent();
190 }
191 }
192
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000193 void print() const {
Adam Nemet938d3d62015-05-14 12:05:18 +0000194 if (DepCycle)
195 dbgs() << " (cycle)\n";
196 for (auto *I : Set)
197 // Prefix with the block name.
198 dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n";
199 }
200
201 void printBlocks() const {
202 for (auto *BB : getDistributedLoop()->getBlocks())
203 dbgs() << *BB;
204 }
205
206private:
207 /// \brief Instructions from OrigLoop selected for this partition.
208 InstructionSet Set;
209
210 /// \brief Whether this partition contains a dependence cycle.
211 bool DepCycle;
212
213 /// \brief The original loop.
214 Loop *OrigLoop;
215
216 /// \brief The cloned loop. If this partition is mapped to the original loop,
217 /// this is null.
218 Loop *ClonedLoop;
219
220 /// \brief The blocks of ClonedLoop including the preheader. If this
221 /// partition is mapped to the original loop, this is empty.
222 SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
223
224 /// \brief These gets populated once the set of instructions have been
225 /// finalized. If this partition is mapped to the original loop, these are not
226 /// set.
227 ValueToValueMapTy VMap;
228};
229
230/// \brief Holds the set of Partitions. It populates them, merges them and then
231/// clones the loops.
232class InstPartitionContainer {
233 typedef DenseMap<Instruction *, int> InstToPartitionIdT;
234
235public:
236 InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
237 : L(L), LI(LI), DT(DT) {}
238
239 /// \brief Returns the number of partitions.
240 unsigned getSize() const { return PartitionContainer.size(); }
241
242 /// \brief Adds \p Inst into the current partition if that is marked to
243 /// contain cycles. Otherwise start a new partition for it.
244 void addToCyclicPartition(Instruction *Inst) {
245 // If the current partition is non-cyclic. Start a new one.
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000246 if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
247 PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
Adam Nemet938d3d62015-05-14 12:05:18 +0000248 else
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000249 PartitionContainer.back().add(Inst);
Adam Nemet938d3d62015-05-14 12:05:18 +0000250 }
251
252 /// \brief Adds \p Inst into a partition that is not marked to contain
253 /// dependence cycles.
254 ///
255 // Initially we isolate memory instructions into as many partitions as
256 // possible, then later we may merge them back together.
257 void addToNewNonCyclicPartition(Instruction *Inst) {
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000258 PartitionContainer.emplace_back(Inst, L);
Adam Nemet938d3d62015-05-14 12:05:18 +0000259 }
260
261 /// \brief Merges adjacent non-cyclic partitions.
262 ///
263 /// The idea is that we currently only want to isolate the non-vectorizable
264 /// partition. We could later allow more distribution among these partition
265 /// too.
266 void mergeAdjacentNonCyclic() {
267 mergeAdjacentPartitionsIf(
268 [](const InstPartition *P) { return !P->hasDepCycle(); });
269 }
270
271 /// \brief If a partition contains only conditional stores, we won't vectorize
272 /// it. Try to merge it with a previous cyclic partition.
273 void mergeNonIfConvertible() {
274 mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
275 if (Partition->hasDepCycle())
276 return true;
277
278 // Now, check if all stores are conditional in this partition.
279 bool seenStore = false;
280
281 for (auto *Inst : *Partition)
282 if (isa<StoreInst>(Inst)) {
283 seenStore = true;
284 if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
285 return false;
286 }
287 return seenStore;
288 });
289 }
290
291 /// \brief Merges the partitions according to various heuristics.
292 void mergeBeforePopulating() {
293 mergeAdjacentNonCyclic();
294 if (!DistributeNonIfConvertible)
295 mergeNonIfConvertible();
296 }
297
298 /// \brief Merges partitions in order to ensure that no loads are duplicated.
299 ///
300 /// We can't duplicate loads because that could potentially reorder them.
301 /// LoopAccessAnalysis provides dependency information with the context that
302 /// the order of memory operation is preserved.
303 ///
304 /// Return if any partitions were merged.
305 bool mergeToAvoidDuplicatedLoads() {
306 typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
307 typedef EquivalenceClasses<InstPartition *> ToBeMergedT;
308
309 LoadToPartitionT LoadToPartition;
310 ToBeMergedT ToBeMerged;
311
312 // Step through the partitions and create equivalence between partitions
313 // that contain the same load. Also put partitions in between them in the
314 // same equivalence class to avoid reordering of memory operations.
315 for (PartitionContainerT::iterator I = PartitionContainer.begin(),
316 E = PartitionContainer.end();
317 I != E; ++I) {
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000318 auto *PartI = &*I;
Adam Nemet938d3d62015-05-14 12:05:18 +0000319
320 // If a load occurs in two partitions PartI and PartJ, merge all
321 // partitions (PartI, PartJ] into PartI.
322 for (Instruction *Inst : *PartI)
323 if (isa<LoadInst>(Inst)) {
324 bool NewElt;
325 LoadToPartitionT::iterator LoadToPart;
326
327 std::tie(LoadToPart, NewElt) =
328 LoadToPartition.insert(std::make_pair(Inst, PartI));
329 if (!NewElt) {
330 DEBUG(dbgs() << "Merging partitions due to this load in multiple "
331 << "partitions: " << PartI << ", "
332 << LoadToPart->second << "\n" << *Inst << "\n");
333
334 auto PartJ = I;
335 do {
336 --PartJ;
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000337 ToBeMerged.unionSets(PartI, &*PartJ);
338 } while (&*PartJ != LoadToPart->second);
Adam Nemet938d3d62015-05-14 12:05:18 +0000339 }
340 }
341 }
342 if (ToBeMerged.empty())
343 return false;
344
345 // Merge the member of an equivalence class into its class leader. This
346 // makes the members empty.
347 for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
348 I != E; ++I) {
349 if (!I->isLeader())
350 continue;
351
352 auto PartI = I->getData();
353 for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
354 ToBeMerged.member_end())) {
355 PartJ->moveTo(*PartI);
356 }
357 }
358
359 // Remove the empty partitions.
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000360 PartitionContainer.remove_if(
361 [](const InstPartition &P) { return P.empty(); });
Adam Nemet938d3d62015-05-14 12:05:18 +0000362
363 return true;
364 }
365
366 /// \brief Sets up the mapping between instructions to partitions. If the
367 /// instruction is duplicated across multiple partitions, set the entry to -1.
368 void setupPartitionIdOnInstructions() {
369 int PartitionID = 0;
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000370 for (const auto &Partition : PartitionContainer) {
371 for (Instruction *Inst : Partition) {
Adam Nemet938d3d62015-05-14 12:05:18 +0000372 bool NewElt;
373 InstToPartitionIdT::iterator Iter;
374
375 std::tie(Iter, NewElt) =
376 InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
377 if (!NewElt)
378 Iter->second = -1;
379 }
380 ++PartitionID;
381 }
382 }
383
384 /// \brief Populates the partition with everything that the seeding
385 /// instructions require.
386 void populateUsedSet() {
387 for (auto &P : PartitionContainer)
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000388 P.populateUsedSet();
Adam Nemet938d3d62015-05-14 12:05:18 +0000389 }
390
391 /// \brief This performs the main chunk of the work of cloning the loops for
392 /// the partitions.
Justin Bogner843fb202015-12-15 19:40:57 +0000393 void cloneLoops() {
Adam Nemet938d3d62015-05-14 12:05:18 +0000394 BasicBlock *OrigPH = L->getLoopPreheader();
395 // At this point the predecessor of the preheader is either the memcheck
396 // block or the top part of the original preheader.
397 BasicBlock *Pred = OrigPH->getSinglePredecessor();
398 assert(Pred && "Preheader does not have a single predecessor");
399 BasicBlock *ExitBlock = L->getExitBlock();
400 assert(ExitBlock && "No single exit block");
401 Loop *NewLoop;
402
403 assert(!PartitionContainer.empty() && "at least two partitions expected");
404 // We're cloning the preheader along with the loop so we already made sure
405 // it was empty.
406 assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
407 "preheader not empty");
408
409 // Create a loop for each partition except the last. Clone the original
410 // loop before PH along with adding a preheader for the cloned loop. Then
411 // update PH to point to the newly added preheader.
412 BasicBlock *TopPH = OrigPH;
413 unsigned Index = getSize() - 1;
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000414 for (auto I = std::next(PartitionContainer.rbegin()),
415 E = PartitionContainer.rend();
Adam Nemet938d3d62015-05-14 12:05:18 +0000416 I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000417 auto *Part = &*I;
Adam Nemet938d3d62015-05-14 12:05:18 +0000418
419 NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);
420
421 Part->getVMap()[ExitBlock] = TopPH;
422 Part->remapInstructions();
423 }
424 Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);
425
426 // Now go in forward order and update the immediate dominator for the
427 // preheaders with the exiting block of the previous loop. Dominance
428 // within the loop is updated in cloneLoopWithPreheader.
429 for (auto Curr = PartitionContainer.cbegin(),
430 Next = std::next(PartitionContainer.cbegin()),
431 E = PartitionContainer.cend();
432 Next != E; ++Curr, ++Next)
433 DT->changeImmediateDominator(
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000434 Next->getDistributedLoop()->getLoopPreheader(),
435 Curr->getDistributedLoop()->getExitingBlock());
Adam Nemet938d3d62015-05-14 12:05:18 +0000436 }
437
438 /// \brief Removes the dead instructions from the cloned loops.
439 void removeUnusedInsts() {
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000440 for (auto &Partition : PartitionContainer)
441 Partition.removeUnusedInsts();
Adam Nemet938d3d62015-05-14 12:05:18 +0000442 }
443
444 /// \brief For each memory pointer, it computes the partitionId the pointer is
445 /// used in.
446 ///
447 /// This returns an array of int where the I-th entry corresponds to I-th
448 /// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple
449 /// partitions its entry is set to -1.
450 SmallVector<int, 8>
451 computePartitionSetForPointers(const LoopAccessInfo &LAI) {
Adam Nemet7cdebac2015-07-14 22:32:44 +0000452 const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();
Adam Nemet938d3d62015-05-14 12:05:18 +0000453
454 unsigned N = RtPtrCheck->Pointers.size();
455 SmallVector<int, 8> PtrToPartitions(N);
456 for (unsigned I = 0; I < N; ++I) {
Adam Nemet9f7dedc2015-07-14 22:32:50 +0000457 Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
Adam Nemet938d3d62015-05-14 12:05:18 +0000458 auto Instructions =
Adam Nemet9f7dedc2015-07-14 22:32:50 +0000459 LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);
Adam Nemet938d3d62015-05-14 12:05:18 +0000460
461 int &Partition = PtrToPartitions[I];
462 // First set it to uninitialized.
463 Partition = -2;
464 for (Instruction *Inst : Instructions) {
465 // Note that this could be -1 if Inst is duplicated across multiple
466 // partitions.
467 int ThisPartition = this->InstToPartitionId[Inst];
468 if (Partition == -2)
469 Partition = ThisPartition;
470 // -1 means belonging to multiple partitions.
471 else if (Partition == -1)
472 break;
473 else if (Partition != (int)ThisPartition)
474 Partition = -1;
475 }
476 assert(Partition != -2 && "Pointer not belonging to any partition");
477 }
478
479 return PtrToPartitions;
480 }
481
482 void print(raw_ostream &OS) const {
483 unsigned Index = 0;
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000484 for (const auto &P : PartitionContainer) {
485 OS << "Partition " << Index++ << " (" << &P << "):\n";
486 P.print();
Adam Nemet938d3d62015-05-14 12:05:18 +0000487 }
488 }
489
490 void dump() const { print(dbgs()); }
491
492#ifndef NDEBUG
493 friend raw_ostream &operator<<(raw_ostream &OS,
494 const InstPartitionContainer &Partitions) {
495 Partitions.print(OS);
496 return OS;
497 }
498#endif
499
500 void printBlocks() const {
501 unsigned Index = 0;
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000502 for (const auto &P : PartitionContainer) {
503 dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
504 P.printBlocks();
Adam Nemet938d3d62015-05-14 12:05:18 +0000505 }
506 }
507
508private:
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000509 typedef std::list<InstPartition> PartitionContainerT;
Adam Nemet938d3d62015-05-14 12:05:18 +0000510
511 /// \brief List of partitions.
512 PartitionContainerT PartitionContainer;
513
514 /// \brief Mapping from Instruction to partition Id. If the instruction
515 /// belongs to multiple partitions the entry contains -1.
516 InstToPartitionIdT InstToPartitionId;
517
518 Loop *L;
519 LoopInfo *LI;
520 DominatorTree *DT;
521
522 /// \brief The control structure to merge adjacent partitions if both satisfy
523 /// the \p Predicate.
524 template <class UnaryPredicate>
525 void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
526 InstPartition *PrevMatch = nullptr;
527 for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000528 auto DoesMatch = Predicate(&*I);
Adam Nemet938d3d62015-05-14 12:05:18 +0000529 if (PrevMatch == nullptr && DoesMatch) {
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000530 PrevMatch = &*I;
Adam Nemet938d3d62015-05-14 12:05:18 +0000531 ++I;
532 } else if (PrevMatch != nullptr && DoesMatch) {
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000533 I->moveTo(*PrevMatch);
Adam Nemet938d3d62015-05-14 12:05:18 +0000534 I = PartitionContainer.erase(I);
535 } else {
536 PrevMatch = nullptr;
537 ++I;
538 }
539 }
540 }
541};
542
543/// \brief For each memory instruction, this class maintains difference of the
544/// number of unsafe dependences that start out from this instruction minus
545/// those that end here.
546///
547/// By traversing the memory instructions in program order and accumulating this
548/// number, we know whether any unsafe dependence crosses over a program point.
549class MemoryInstructionDependences {
550 typedef MemoryDepChecker::Dependence Dependence;
551
552public:
553 struct Entry {
554 Instruction *Inst;
555 unsigned NumUnsafeDependencesStartOrEnd;
556
557 Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
558 };
559
560 typedef SmallVector<Entry, 8> AccessesType;
561
562 AccessesType::const_iterator begin() const { return Accesses.begin(); }
563 AccessesType::const_iterator end() const { return Accesses.end(); }
564
565 MemoryInstructionDependences(
566 const SmallVectorImpl<Instruction *> &Instructions,
Adam Nemeta2df7502015-11-03 21:39:52 +0000567 const SmallVectorImpl<Dependence> &Dependences) {
Benjamin Kramere6987bf2015-05-21 18:32:07 +0000568 Accesses.append(Instructions.begin(), Instructions.end());
Adam Nemet938d3d62015-05-14 12:05:18 +0000569
570 DEBUG(dbgs() << "Backward dependences:\n");
Adam Nemeta2df7502015-11-03 21:39:52 +0000571 for (auto &Dep : Dependences)
Adam Nemet938d3d62015-05-14 12:05:18 +0000572 if (Dep.isPossiblyBackward()) {
573 // Note that the designations source and destination follow the program
574 // order, i.e. source is always first. (The direction is given by the
575 // DepType.)
576 ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
577 --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
578
579 DEBUG(Dep.print(dbgs(), 2, Instructions));
580 }
581 }
582
583private:
584 AccessesType Accesses;
585};
586
Adam Nemet61399ac2016-04-27 00:31:03 +0000587/// \brief The actual class performing the per-loop work.
588class LoopDistributeForLoop {
Adam Nemet938d3d62015-05-14 12:05:18 +0000589public:
Adam Nemet61399ac2016-04-27 00:31:03 +0000590 LoopDistributeForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
591 DominatorTree *DT, ScalarEvolution *SE)
Adam Nemetd2fa4142016-04-27 05:28:18 +0000592 : L(L), LI(LI), LAI(LAI), DT(DT), SE(SE) {
593 setForced();
594 }
Adam Nemetc75ad692015-07-30 03:29:16 +0000595
Adam Nemet938d3d62015-05-14 12:05:18 +0000596 /// \brief Try to distribute an inner-most loop.
Adam Nemet61399ac2016-04-27 00:31:03 +0000597 bool processLoop() {
Adam Nemet938d3d62015-05-14 12:05:18 +0000598 assert(L->empty() && "Only process inner loops.");
599
600 DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
601 << "\" checking " << *L << "\n");
602
603 BasicBlock *PH = L->getLoopPreheader();
604 if (!PH) {
605 DEBUG(dbgs() << "Skipping; no preheader");
606 return false;
607 }
608 if (!L->getExitBlock()) {
609 DEBUG(dbgs() << "Skipping; multiple exit blocks");
610 return false;
611 }
612 // LAA will check that we only have a single exiting block.
613
Adam Nemet938d3d62015-05-14 12:05:18 +0000614 // Currently, we only distribute to isolate the part of the loop with
615 // dependence cycles to enable partial vectorization.
616 if (LAI.canVectorizeMemory()) {
617 DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization");
618 return false;
619 }
Adam Nemeta2df7502015-11-03 21:39:52 +0000620 auto *Dependences = LAI.getDepChecker().getDependences();
621 if (!Dependences || Dependences->empty()) {
Adam Nemet938d3d62015-05-14 12:05:18 +0000622 DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate");
623 return false;
624 }
625
626 InstPartitionContainer Partitions(L, LI, DT);
627
628 // First, go through each memory operation and assign them to consecutive
629 // partitions (the order of partitions follows program order). Put those
630 // with unsafe dependences into "cyclic" partition otherwise put each store
631 // in its own "non-cyclic" partition (we'll merge these later).
632 //
633 // Note that a memory operation (e.g. Load2 below) at a program point that
634 // has an unsafe dependence (Store3->Load1) spanning over it must be
635 // included in the same cyclic partition as the dependent operations. This
636 // is to preserve the original program order after distribution. E.g.:
637 //
638 // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive
639 // Load1 -. 1 0->1
640 // Load2 | /Unsafe/ 0 1
641 // Store3 -' -1 1->0
642 // Load4 0 0
643 //
644 // NumUnsafeDependencesActive > 0 indicates this situation and in this case
645 // we just keep assigning to the same cyclic partition until
646 // NumUnsafeDependencesActive reaches 0.
647 const MemoryDepChecker &DepChecker = LAI.getDepChecker();
648 MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
Adam Nemeta2df7502015-11-03 21:39:52 +0000649 *Dependences);
Adam Nemet938d3d62015-05-14 12:05:18 +0000650
651 int NumUnsafeDependencesActive = 0;
652 for (auto &InstDep : MID) {
653 Instruction *I = InstDep.Inst;
654 // We update NumUnsafeDependencesActive post-instruction, catch the
655 // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
656 if (NumUnsafeDependencesActive ||
657 InstDep.NumUnsafeDependencesStartOrEnd > 0)
658 Partitions.addToCyclicPartition(I);
659 else
660 Partitions.addToNewNonCyclicPartition(I);
661 NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
662 assert(NumUnsafeDependencesActive >= 0 &&
663 "Negative number of dependences active");
664 }
665
666 // Add partitions for values used outside. These partitions can be out of
667 // order from the original program order. This is OK because if the
668 // partition uses a load we will merge this partition with the original
669 // partition of the load that we set up in the previous loop (see
670 // mergeToAvoidDuplicatedLoads).
671 auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
672 for (auto *Inst : DefsUsedOutside)
673 Partitions.addToNewNonCyclicPartition(Inst);
674
675 DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
676 if (Partitions.getSize() < 2)
677 return false;
678
679 // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
680 // should be able to vectorize these together.
681 Partitions.mergeBeforePopulating();
682 DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
683 if (Partitions.getSize() < 2)
684 return false;
685
686 // Now, populate the partitions with non-memory operations.
687 Partitions.populateUsedSet();
688 DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
689
690 // In order to preserve original lexical order for loads, keep them in the
691 // partition that we set up in the MemoryInstructionDependences loop.
692 if (Partitions.mergeToAvoidDuplicatedLoads()) {
693 DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
694 << Partitions);
695 if (Partitions.getSize() < 2)
696 return false;
697 }
698
Silviu Baranga2910a4f2015-11-09 13:26:09 +0000699 // Don't distribute the loop if we need too many SCEV run-time checks.
Silviu Baranga9cd9a7e2015-12-09 16:06:28 +0000700 const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate();
Adam Nemetd2fa4142016-04-27 05:28:18 +0000701 if (Pred.getComplexity() > (IsForced.getValueOr(false)
702 ? PragmaDistributeSCEVCheckThreshold
703 : DistributeSCEVCheckThreshold)) {
Silviu Baranga2910a4f2015-11-09 13:26:09 +0000704 DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
705 return false;
706 }
707
Adam Nemet938d3d62015-05-14 12:05:18 +0000708 DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
709 // We're done forming the partitions set up the reverse mapping from
710 // instructions to partitions.
711 Partitions.setupPartitionIdOnInstructions();
712
713 // To keep things simple have an empty preheader before we version or clone
714 // the loop. (Also split if this has no predecessor, i.e. entry, because we
715 // rely on PH having a predecessor.)
716 if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
717 SplitBlock(PH, PH->getTerminator(), DT, LI);
718
Silviu Baranga2910a4f2015-11-09 13:26:09 +0000719 // If we need run-time checks, version the loop now.
Adam Nemet772a1502015-06-19 19:32:41 +0000720 auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI);
Adam Nemetc75ad692015-07-30 03:29:16 +0000721 const auto *RtPtrChecking = LAI.getRuntimePointerChecking();
Adam Nemet15840392015-08-07 22:44:15 +0000722 const auto &AllChecks = RtPtrChecking->getChecks();
Adam Nemetc75ad692015-07-30 03:29:16 +0000723 auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
724 RtPtrChecking);
Silviu Baranga2910a4f2015-11-09 13:26:09 +0000725
726 if (!Pred.isAlwaysTrue() || !Checks.empty()) {
Adam Nemet772a1502015-06-19 19:32:41 +0000727 DEBUG(dbgs() << "\nPointers:\n");
Adam Nemet0a674402015-07-28 05:01:53 +0000728 DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
Silviu Baranga2910a4f2015-11-09 13:26:09 +0000729 LoopVersioning LVer(LAI, L, LI, DT, SE, false);
730 LVer.setAliasChecks(std::move(Checks));
Silviu Baranga9cd9a7e2015-12-09 16:06:28 +0000731 LVer.setSCEVChecks(LAI.PSE.getUnionPredicate());
Adam Nemete4813402015-08-20 17:22:29 +0000732 LVer.versionLoop(DefsUsedOutside);
Adam Nemet5eccf072016-03-17 20:32:32 +0000733 LVer.annotateLoopWithNoAlias();
Adam Nemet938d3d62015-05-14 12:05:18 +0000734 }
735
736 // Create identical copies of the original loop for each partition and hook
737 // them up sequentially.
Justin Bogner843fb202015-12-15 19:40:57 +0000738 Partitions.cloneLoops();
Adam Nemet938d3d62015-05-14 12:05:18 +0000739
740 // Now, we remove the instruction from each loop that don't belong to that
741 // partition.
742 Partitions.removeUnusedInsts();
743 DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
744 DEBUG(Partitions.printBlocks());
745
746 if (LDistVerify) {
747 LI->verify();
748 DT->verifyDomTree();
749 }
750
751 ++NumLoopsDistributed;
752 return true;
753 }
754
Adam Nemetd2fa4142016-04-27 05:28:18 +0000755 /// \brief Return if distribution forced to be enabled/disabled for the loop.
756 ///
757 /// If the optional has a value, it indicates whether distribution was forced
758 /// to be enabled (true) or disabled (false). If the optional has no value
759 /// distribution was not forced either way.
760 const Optional<bool> &isForced() const { return IsForced; }
761
Adam Nemet61399ac2016-04-27 00:31:03 +0000762private:
763 /// \brief Filter out checks between pointers from the same partition.
764 ///
765 /// \p PtrToPartition contains the partition number for pointers. Partition
766 /// number -1 means that the pointer is used in multiple partitions. In this
767 /// case we can't safely omit the check.
768 SmallVector<RuntimePointerChecking::PointerCheck, 4>
769 includeOnlyCrossPartitionChecks(
770 const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
771 const SmallVectorImpl<int> &PtrToPartition,
772 const RuntimePointerChecking *RtPtrChecking) {
773 SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
774
775 std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
776 [&](const RuntimePointerChecking::PointerCheck &Check) {
777 for (unsigned PtrIdx1 : Check.first->Members)
778 for (unsigned PtrIdx2 : Check.second->Members)
779 // Only include this check if there is a pair of pointers
780 // that require checking and the pointers fall into
781 // separate partitions.
782 //
783 // (Note that we already know at this point that the two
784 // pointer groups need checking but it doesn't follow
785 // that each pair of pointers within the two groups need
786 // checking as well.
787 //
788 // In other words we don't want to include a check just
789 // because there is a pair of pointers between the two
790 // pointer groups that require checks and a different
791 // pair whose pointers fall into different partitions.)
792 if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
793 !RuntimePointerChecking::arePointersInSamePartition(
794 PtrToPartition, PtrIdx1, PtrIdx2))
795 return true;
796 return false;
797 });
798
799 return Checks;
800 }
801
Adam Nemetd2fa4142016-04-27 05:28:18 +0000802 /// \brief Check whether the loop metadata is forcing distribution to be
803 /// enabled/disabled.
804 void setForced() {
805 Optional<const MDOperand *> Value =
806 findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
807 if (!Value)
808 return;
809
810 const MDOperand *Op = *Value;
811 assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
812 IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
813 }
814
Adam Nemet938d3d62015-05-14 12:05:18 +0000815 // Analyses used.
Adam Nemet61399ac2016-04-27 00:31:03 +0000816 Loop *L;
Adam Nemet938d3d62015-05-14 12:05:18 +0000817 LoopInfo *LI;
Adam Nemet61399ac2016-04-27 00:31:03 +0000818 const LoopAccessInfo &LAI;
Adam Nemet938d3d62015-05-14 12:05:18 +0000819 DominatorTree *DT;
Silviu Baranga2910a4f2015-11-09 13:26:09 +0000820 ScalarEvolution *SE;
Adam Nemetd2fa4142016-04-27 05:28:18 +0000821
822 /// \brief Indicates whether distribution is forced to be enabled/disabled for
823 /// the loop.
824 ///
825 /// If the optional has a value, it indicates whether distribution was forced
826 /// to be enabled (true) or disabled (false). If the optional has no value
827 /// distribution was not forced either way.
828 Optional<bool> IsForced;
Adam Nemet938d3d62015-05-14 12:05:18 +0000829};
Adam Nemet61399ac2016-04-27 00:31:03 +0000830
831/// \brief The pass class.
832class LoopDistribute : public FunctionPass {
833public:
Adam Nemetd2fa4142016-04-27 05:28:18 +0000834 /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be
835 /// performed by default. Pass -enable-loop-distribute={0,1} overrides this
836 /// default. We use this to keep LoopDistribution off by default when invoked
837 /// from the optimization pipeline but on when invoked explicitly from opt.
838 LoopDistribute(bool ProcessAllLoopsByDefault = true)
839 : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) {
840 // The default is set by the caller.
841 if (EnableLoopDistribute.getNumOccurrences() > 0)
842 ProcessAllLoops = EnableLoopDistribute;
Adam Nemet61399ac2016-04-27 00:31:03 +0000843 initializeLoopDistributePass(*PassRegistry::getPassRegistry());
844 }
845
846 bool runOnFunction(Function &F) override {
847 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
848 auto *LAA = &getAnalysis<LoopAccessAnalysis>();
849 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
850 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
851
852 // Build up a worklist of inner-loops to vectorize. This is necessary as the
853 // act of distributing a loop creates new loops and can invalidate iterators
854 // across the loops.
855 SmallVector<Loop *, 8> Worklist;
856
857 for (Loop *TopLevelLoop : *LI)
858 for (Loop *L : depth_first(TopLevelLoop))
859 // We only handle inner-most loops.
860 if (L->empty())
861 Worklist.push_back(L);
862
863 // Now walk the identified inner loops.
864 bool Changed = false;
865 for (Loop *L : Worklist) {
866 const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
867 LoopDistributeForLoop LDL(L, LI, LAI, DT, SE);
Adam Nemetd2fa4142016-04-27 05:28:18 +0000868
869 // If distribution was forced for the specific loop to be
870 // enabled/disabled, follow that. Otherwise use the global flag.
871 if (LDL.isForced().getValueOr(ProcessAllLoops))
872 Changed |= LDL.processLoop();
Adam Nemet61399ac2016-04-27 00:31:03 +0000873 }
874
875 // Process each loop nest in the function.
876 return Changed;
877 }
878
879 void getAnalysisUsage(AnalysisUsage &AU) const override {
880 AU.addRequired<ScalarEvolutionWrapperPass>();
881 AU.addRequired<LoopInfoWrapperPass>();
882 AU.addPreserved<LoopInfoWrapperPass>();
883 AU.addRequired<LoopAccessAnalysis>();
884 AU.addRequired<DominatorTreeWrapperPass>();
885 AU.addPreserved<DominatorTreeWrapperPass>();
886 }
887
888 static char ID;
Adam Nemetd2fa4142016-04-27 05:28:18 +0000889
890private:
891 /// \brief Whether distribution should be on in this function. The per-loop
892 /// pragma can override this.
893 bool ProcessAllLoops;
Adam Nemet61399ac2016-04-27 00:31:03 +0000894};
Adam Nemet938d3d62015-05-14 12:05:18 +0000895} // anonymous namespace
896
897char LoopDistribute::ID;
898static const char ldist_name[] = "Loop Distribition";
899
900INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
901INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
902INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
903INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
Silviu Baranga2910a4f2015-11-09 13:26:09 +0000904INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
Adam Nemet938d3d62015-05-14 12:05:18 +0000905INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)
906
907namespace llvm {
Adam Nemetd2fa4142016-04-27 05:28:18 +0000908FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) {
909 return new LoopDistribute(ProcessAllLoopsByDefault);
910}
Adam Nemet938d3d62015-05-14 12:05:18 +0000911}