//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the Loop Distribution Pass. Its main focus is to
// distribute loops that cannot be vectorized due to dependence cycles. It
// tries to isolate the offending dependences into a new loop, allowing
// vectorization of the remaining parts.
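//
// For example (an illustrative sketch; the array names are made up), in
//
//   for (i = 0; i < n; i++) {
//     A[i + 1] = A[i] + B[i];
//     C[i] = D[i] * E[i];
//   }
//
// the first statement carries a dependence cycle while the second one is
// independent of it. Distribution conceptually yields
//
//   for (i = 0; i < n; i++)
//     A[i + 1] = A[i] + B[i];
//   for (i = 0; i < n; i++)
//     C[i] = D[i] * E[i];
//
// where the second loop can now be vectorized.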
//
// For dependence analysis, the pass uses the LoopVectorizer's
// LoopAccessAnalysis. Because this analysis presumes no change in the order of
// memory operations, special care is taken to preserve the lexical order of
// these operations.
//
// Similarly to the Vectorizer, the pass also supports loop versioning to
// disambiguate potentially overlapping arrays at run time.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <list>

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

using namespace llvm;

static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

static cl::opt<unsigned> DistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution"));

static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
    cl::Hidden,
    cl::desc(
        "The maximum number of SCEV checks allowed for Loop "
        "Distribution for loops marked with #pragma loop distribute(enable)"));

// Note that the initial value for this depends on whether the pass is invoked
// directly or from the optimization pipeline.
static cl::opt<bool> EnableLoopDistribute(
    "enable-loop-distribute", cl::Hidden,
    cl::desc("Enable the new, experimental LoopDistribution Pass"));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");

namespace {
/// \brief Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
class InstPartition {
  typedef SmallPtrSet<Instruction *, 8> InstructionSet;

public:
  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
      : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
    Set.insert(I);
  }

  /// \brief Returns whether this partition contains a dependence cycle.
  bool hasDepCycle() const { return DepCycle; }

  /// \brief Adds an instruction to this partition.
  void add(Instruction *I) { Set.insert(I); }

  /// \brief Collection accessors.
  InstructionSet::iterator begin() { return Set.begin(); }
  InstructionSet::iterator end() { return Set.end(); }
  InstructionSet::const_iterator begin() const { return Set.begin(); }
  InstructionSet::const_iterator end() const { return Set.end(); }
  bool empty() const { return Set.empty(); }

  /// \brief Moves this partition into \p Other. This partition becomes empty
  /// after this.
  void moveTo(InstPartition &Other) {
    Other.Set.insert(Set.begin(), Set.end());
    Set.clear();
    Other.DepCycle |= DepCycle;
  }

  /// \brief Populates the partition with the transitive closure of all the
  /// instructions that the seeded instructions depend on.
  void populateUsedSet() {
    // FIXME: We currently don't use control-dependence but simply include all
    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
    // up.
    for (auto *B : OrigLoop->getBlocks())
      Set.insert(B->getTerminator());

    // Follow the use-def chains to form a transitive closure of all the
    // instructions that the originally seeded instructions depend on.
    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      // Insert instructions from the loop that we depend on.
      for (Value *V : I->operand_values()) {
        auto *I = dyn_cast<Instruction>(V);
        if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
          Worklist.push_back(I);
      }
    }
  }

  /// \brief Clones the original loop.
  ///
  /// Updates LoopInfo and DominatorTree using the information that block \p
  /// LoopDomBB dominates the loop.
  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
                               unsigned Index, LoopInfo *LI,
                               DominatorTree *DT) {
    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
                                          VMap, Twine(".ldist") + Twine(Index),
                                          LI, DT, ClonedLoopBlocks);
    return ClonedLoop;
  }

  /// \brief The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  const Loop *getClonedLoop() const { return ClonedLoop; }

  /// \brief Returns the loop where this partition ends up after distribution.
  /// If this partition is mapped to the original loop, that loop is returned.
  const Loop *getDistributedLoop() const {
    return ClonedLoop ? ClonedLoop : OrigLoop;
  }

  /// \brief The VMap that is populated by cloning and then used by
  /// remapInstructions to remap the cloned instructions.
  ValueToValueMapTy &getVMap() { return VMap; }

  /// \brief Remaps the cloned instructions using VMap.
  void remapInstructions() {
    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
  }

  /// \brief Based on the set of instructions selected for this partition,
  /// removes the unnecessary ones.
  void removeUnusedInsts() {
    SmallVector<Instruction *, 8> Unused;

    for (auto *Block : OrigLoop->getBlocks())
      for (auto &Inst : *Block)
        if (!Set.count(&Inst)) {
          Instruction *NewInst = &Inst;
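          // If this partition was cloned, switch to the corresponding
          // instruction in the cloned loop.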
          if (!VMap.empty())
            NewInst = cast<Instruction>(VMap[NewInst]);

          assert(!isa<BranchInst>(NewInst) &&
                 "Branches are marked used early on");
          Unused.push_back(NewInst);
        }

    // Delete the instructions backwards; this reduces the number of def-use
    // and use-def chains that need updating.
    for (auto *Inst : reverse(Unused)) {
      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }
  }

  void print() const {
    if (DepCycle)
      dbgs() << "  (cycle)\n";
    for (auto *I : Set)
      // Prefix with the block name.
      dbgs() << "  " << I->getParent()->getName() << ":" << *I << "\n";
  }

  void printBlocks() const {
    for (auto *BB : getDistributedLoop()->getBlocks())
      dbgs() << *BB;
  }

private:
  /// \brief Instructions from OrigLoop selected for this partition.
  InstructionSet Set;

  /// \brief Whether this partition contains a dependence cycle.
  bool DepCycle;

  /// \brief The original loop.
  Loop *OrigLoop;

  /// \brief The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  Loop *ClonedLoop;

  /// \brief The blocks of ClonedLoop including the preheader. If this
  /// partition is mapped to the original loop, this is empty.
  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

  /// \brief This gets populated once the set of instructions has been
  /// finalized. If this partition is mapped to the original loop, it is not
  /// set.
  ValueToValueMapTy VMap;
};

/// \brief Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
  typedef DenseMap<Instruction *, int> InstToPartitionIdT;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// \brief Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }

  /// \brief Adds \p Inst to the current partition if that partition is marked
  /// to contain cycles. Otherwise, starts a new cyclic partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic, start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }

  /// \brief Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  // Initially we isolate memory instructions into as many partitions as
  // possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }

  /// \brief Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition. We could later allow more distribution among these partitions
  /// too.
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }

  /// \brief If a partition contains only conditional stores, we won't vectorize
  /// it. Try to merge it with a previous cyclic partition.
  void mergeNonIfConvertible() {
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool seenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          seenStore = true;
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return seenStore;
    });
  }

  /// \brief Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }

  /// \brief Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information with the assumption
  /// that the order of memory operations is preserved.
  ///
  /// Returns true if any partitions were merged.
  bool mergeToAvoidDuplicatedLoads() {
    typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
    typedef EquivalenceClasses<InstPartition *> ToBeMergedT;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence classes between
    // partitions that contain the same load. Also put partitions in between
    // them in the same equivalence class to avoid reordering of memory
    // operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            DEBUG(dbgs() << "Merging partitions due to this load in multiple "
                         << "partitions: " << PartI << ", "
                         << LoadToPart->second << "\n" << *Inst << "\n");

            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the members of an equivalence class into its class leader. This
    // makes the members empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                   ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }

  /// \brief Sets up the mapping from instructions to partitions. If an
  /// instruction is duplicated across multiple partitions, its entry is set
  /// to -1.
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// \brief Populates the partitions with everything that the seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }

  /// \brief This performs the main chunk of the work of cloning the loops for
  /// the partitions.
  void cloneLoops() {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(!PartitionContainer.empty() && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Create a loop for each partition except the last. Clone the original
    // loop before PH along with adding a preheader for the cloned loop. Then
    // update PH to point to the newly added preheader.
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto I = std::next(PartitionContainer.rbegin()),
              E = PartitionContainer.rend();
         I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
      auto *Part = &*I;

      NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      Part->getVMap()[ExitBlock] = TopPH;
      Part->remapInstructions();
    }
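    // The predecessor that used to branch to the original preheader now has
    // to branch to the preheader of the first (top-most) cloned loop.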
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop. Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }

  /// \brief Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }

  /// \brief For each memory pointer, computes the id of the partition the
  /// pointer is used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to the I-th
  /// entry in LAI.getRuntimePointerChecking(). If the pointer is used in
  /// multiple partitions its entry is set to -1.
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
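    // Entry encoding: -2 = not yet assigned, -1 = used in multiple partitions,
    // otherwise the id of the unique partition using the pointer.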
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != (int)ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "Partition " << Index++ << " (" << &P << "):\n";
      P.print();
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks() const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
      P.printBlocks();
    }
  }

private:
  typedef std::list<InstPartition> PartitionContainerT;

  /// \brief List of partitions.
  PartitionContainerT PartitionContainer;

  /// \brief Mapping from Instruction to partition Id. If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// \brief The control structure to merge adjacent partitions if both satisfy
  /// the \p Predicate.
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
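    // PrevMatch points to the first partition of the current run of matching
    // partitions; each subsequent match is merged into it.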
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }
};

/// \brief For each memory instruction, this class maintains the difference
/// between the number of unsafe dependences that start out from this
/// instruction and the number that end here.
///
/// By traversing the memory instructions in program order and accumulating
/// this number, we know whether any unsafe dependence crosses over a program
/// point.
class MemoryInstructionDependences {
  typedef MemoryDepChecker::Dependence Dependence;

public:
  struct Entry {
    Instruction *Inst;
    unsigned NumUnsafeDependencesStartOrEnd;

    Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
  };

  typedef SmallVector<Entry, 8> AccessesType;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &Dependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    DEBUG(dbgs() << "Backward dependences:\n");
    for (auto &Dep : Dependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the program
        // order, i.e. source is always first. (The direction is given by the
        // DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// \brief The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
  LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
                        ScalarEvolution *SE)
      : L(L), F(F), LI(LI), LAI(nullptr), DT(DT), SE(SE) {
    setForced();
  }

  /// \brief Try to distribute an inner-most loop.
  bool processLoop(LoopAccessLegacyAnalysis *LAA) {
    assert(L->empty() && "Only process inner loops.");

    DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
                 << "\" checking " << *L << "\n");

    BasicBlock *PH = L->getLoopPreheader();
    if (!PH)
      return fail("no preheader");
    if (!L->getExitBlock())
      return fail("multiple exit blocks");

    // LAA will check that we only have a single exiting block.
    LAI = &LAA->getInfo(L);

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI->canVectorizeMemory())
      return fail("memory operations are safe for vectorization");

    auto *Dependences = LAI->getDepChecker().getDependences();
    if (!Dependences || Dependences->empty())
      return fail("no unsafe dependences to isolate");

    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through the memory operations and assign them to consecutive
    // partitions (the order of partitions follows program order). Put those
    // with unsafe dependences into a "cyclic" partition; otherwise put each
    // store in its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations. This
    // is to preserve the original program order after distribution. E.g.:
    //
    //           NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1 -.                1                         0->1
    //  Load2  | /Unsafe/       0                          1
    //  Store3 -'              -1                         1->0
    //  Load4                   0                          0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this case
    // we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI->getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *Dependences);

    int NumUnsafeDependencesActive = 0;
    for (auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction; catch the
      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }

    // Add partitions for values used outside. These partitions can be out of
    // order from the original program order. This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("cannot isolate unsafe dependencies");

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("cannot isolate unsafe dependencies");

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
                   << Partitions);
      if (Partitions.getSize() < 2)
        return fail("cannot isolate unsafe dependencies");
    }

    // Don't distribute the loop if we need too many SCEV run-time checks.
    const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
    if (Pred.getComplexity() > (IsForced.getValueOr(false)
                                    ? PragmaDistributeSCEVCheckThreshold
                                    : DistributeSCEVCheckThreshold))
      return fail("too many SCEV run-time checks needed");

    DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
    // We're done forming the partitions; set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // To keep things simple have an empty preheader before we version or clone
    // the loop. (Also split if this has no predecessor, i.e. entry, because we
    // rely on PH having a predecessor.)
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);

    // If we need run-time checks, version the loop now.
    auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
    const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
    const auto &AllChecks = RtPtrChecking->getChecks();
    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                  RtPtrChecking);

    if (!Pred.isAlwaysTrue() || !Checks.empty()) {
      DEBUG(dbgs() << "\nPointers:\n");
      DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
      LoopVersioning LVer(*LAI, L, LI, DT, SE, false);
      LVer.setAliasChecks(std::move(Checks));
      LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
      LVer.versionLoop(DefsUsedOutside);
      LVer.annotateLoopWithNoAlias();
    }

    // Create identical copies of the original loop for each partition and hook
    // them up sequentially.
    Partitions.cloneLoops();

    // Now, remove from each loop the instructions that don't belong to its
    // partition.
    Partitions.removeUnusedInsts();
    DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
    DEBUG(Partitions.printBlocks());

    if (LDistVerify) {
      LI->verify();
      DT->verifyDomTree();
    }

    ++NumLoopsDistributed;
    // Report the success.
    emitOptimizationRemark(F->getContext(), LDIST_NAME, *F, L->getStartLoc(),
                           "distributed loop");
    return true;
  }

  /// \brief Provides diagnostics and then returns false.
  bool fail(llvm::StringRef Message) {
    LLVMContext &Ctx = F->getContext();
    bool Forced = isForced().getValueOr(false);

    DEBUG(dbgs() << "Skipping; " << Message << "\n");

    // With Rpass-missed, report that distribution failed.
    emitOptimizationRemarkMissed(
        Ctx, LDIST_NAME, *F, L->getStartLoc(),
        "loop not distributed: use -Rpass-analysis=loop-distribute for more "
        "info");

    // With Rpass-analysis, report why. This is on by default if distribution
    // was requested explicitly.
    emitOptimizationRemarkAnalysis(
        Ctx, Forced ? DiagnosticInfoOptimizationRemarkAnalysis::AlwaysPrint
                    : LDIST_NAME,
        *F, L->getStartLoc(), Twine("loop not distributed: ") + Message);

    // Also issue a warning if distribution was requested explicitly but it
    // failed.
    if (Forced)
      Ctx.diagnose(DiagnosticInfoOptimizationFailure(
          *F, L->getStartLoc(), "loop not distributed: failed "
                                "explicitly specified loop distribution"));

    return false;
  }

  /// \brief Returns whether distribution was forced to be enabled/disabled
  /// for the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was
  /// forced to be enabled (true) or disabled (false). If the optional has no
  /// value, distribution was not forced either way.
  const Optional<bool> &isForced() const { return IsForced; }

private:
  /// \brief Filter out checks between pointers from the same partition.
  ///
  /// \p PtrToPartition contains the partition number for pointers. Partition
  /// number -1 means that the pointer is used in multiple partitions. In this
  /// case we can't safely omit the check.
  SmallVector<RuntimePointerChecking::PointerCheck, 4>
  includeOnlyCrossPartitionChecks(
      const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
      const SmallVectorImpl<int> &PtrToPartition,
      const RuntimePointerChecking *RtPtrChecking) {
    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;

    std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
                 [&](const RuntimePointerChecking::PointerCheck &Check) {
                   for (unsigned PtrIdx1 : Check.first->Members)
                     for (unsigned PtrIdx2 : Check.second->Members)
                       // Only include this check if there is a pair of pointers
                       // that require checking and the pointers fall into
                       // separate partitions.
                       //
                       // (Note that we already know at this point that the two
                       // pointer groups need checking but it doesn't follow
                       // that each pair of pointers within the two groups needs
                       // checking as well.
                       //
                       // In other words, we don't want to include a check just
                       // because there is a pair of pointers between the two
                       // pointer groups that require checks and a different
                       // pair whose pointers fall into different partitions.)
                       if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
                           !RuntimePointerChecking::arePointersInSamePartition(
                               PtrToPartition, PtrIdx1, PtrIdx2))
                         return true;
                   return false;
                 });

    return Checks;
  }

  /// \brief Check whether the loop metadata is forcing distribution to be
  /// enabled/disabled.
  void setForced() {
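    // The metadata operand is expected to be a boolean constant, typically
    // emitted as !{!"llvm.loop.distribute.enable", i1 0/1} by the frontend
    // for #pragma clang loop distribute(enable/disable).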
    Optional<const MDOperand *> Value =
        findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
    if (!Value)
      return;

    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }

  Loop *L;
  Function *F;

  // Analyses used.
  LoopInfo *LI;
  const LoopAccessInfo *LAI;
  DominatorTree *DT;
  ScalarEvolution *SE;

  /// \brief Indicates whether distribution is forced to be enabled/disabled
  /// for the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was
  /// forced to be enabled (true) or disabled (false). If the optional has no
  /// value, distribution was not forced either way.
  Optional<bool> IsForced;
};

/// \brief The pass class.
class LoopDistribute : public FunctionPass {
public:
  /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be
  /// performed by default. Pass -enable-loop-distribute={0,1} overrides this
  /// default. We use this to keep LoopDistribution off by default when invoked
  /// from the optimization pipeline but on when invoked explicitly from opt.
  LoopDistribute(bool ProcessAllLoopsByDefault = true)
      : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) {
    // The default is set by the caller.
    if (EnableLoopDistribute.getNumOccurrences() > 0)
      ProcessAllLoops = EnableLoopDistribute;
    initializeLoopDistributePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();

    // Build up a worklist of inner-most loops to distribute. This is necessary
    // as the act of distributing a loop creates new loops and can invalidate
    // iterators across the loops.
    SmallVector<Loop *, 8> Worklist;

    for (Loop *TopLevelLoop : *LI)
      for (Loop *L : depth_first(TopLevelLoop))
        // We only handle inner-most loops.
        if (L->empty())
          Worklist.push_back(L);

    // Now walk the identified inner loops.
    bool Changed = false;
    for (Loop *L : Worklist) {
      LoopDistributeForLoop LDL(L, &F, LI, DT, SE);

      // If distribution was forced for the specific loop to be
      // enabled/disabled, follow that. Otherwise use the global flag.
      if (LDL.isForced().getValueOr(ProcessAllLoops))
        Changed |= LDL.processLoop(LAA);
    }

    // Process each loop nest in the function.
    return Changed;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
  }

  static char ID;

private:
  /// \brief Whether distribution should be on in this function. The per-loop
  /// pragma can override this.
  bool ProcessAllLoops;
};
} // anonymous namespace

char LoopDistribute::ID;
static const char ldist_name[] = "Loop Distribution";

INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)

namespace llvm {
FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) {
  return new LoopDistribute(ProcessAllLoopsByDefault);
}
}