blob: ad3a85a6f760fca5cc6e8dae88f10fe73018a512 [file] [log] [blame]
Florian Hahn09e516c2018-11-14 13:11:49 +00001//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9/// This file implements SLP analysis based on VPlan. The analysis is based on
10/// the ideas described in
11///
12/// Look-ahead SLP: auto-vectorization in the presence of commutative
13/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
14/// Luís F. W. Góes
15///
16//===----------------------------------------------------------------------===//
17
18#include "VPlan.h"
19#include "llvm/ADT/DepthFirstIterator.h"
20#include "llvm/ADT/PostOrderIterator.h"
21#include "llvm/ADT/SmallVector.h"
22#include "llvm/ADT/Twine.h"
23#include "llvm/Analysis/LoopInfo.h"
24#include "llvm/Analysis/VectorUtils.h"
25#include "llvm/IR/BasicBlock.h"
26#include "llvm/IR/CFG.h"
27#include "llvm/IR/Dominators.h"
28#include "llvm/IR/InstrTypes.h"
29#include "llvm/IR/Instruction.h"
30#include "llvm/IR/Instructions.h"
31#include "llvm/IR/Type.h"
32#include "llvm/IR/Value.h"
33#include "llvm/Support/Casting.h"
34#include "llvm/Support/Debug.h"
35#include "llvm/Support/ErrorHandling.h"
36#include "llvm/Support/GraphWriter.h"
37#include "llvm/Support/raw_ostream.h"
38#include "llvm/Transforms/Utils/BasicBlockUtils.h"
39#include <cassert>
40#include <iterator>
41#include <string>
42#include <vector>
43
44using namespace llvm;
45
46#define DEBUG_TYPE "vplan-slp"
47
48// Number of levels to look ahead when re-ordering multi node operands.
49static unsigned LookaheadMaxDepth = 5;
50
51VPInstruction *VPlanSlp::markFailed() {
52 // FIXME: Currently this is used to signal we hit instructions we cannot
53 // trivially SLP'ize.
54 CompletelySLP = false;
55 return nullptr;
56}
57
58void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
59 if (all_of(Operands, [](VPValue *V) {
60 return cast<VPInstruction>(V)->getUnderlyingInstr();
61 })) {
62 unsigned BundleSize = 0;
63 for (VPValue *V : Operands) {
64 Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
65 assert(!T->isVectorTy() && "Only scalar types supported for now");
66 BundleSize += T->getScalarSizeInBits();
67 }
68 WidestBundleBits = std::max(WidestBundleBits, BundleSize);
69 }
70
71 auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
72 assert(Res.second &&
73 "Already created a combined instruction for the operand bundle");
74 (void)Res;
75}
76
77bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
78 // Currently we only support VPInstructions.
79 if (!all_of(Operands, [](VPValue *Op) {
80 return Op && isa<VPInstruction>(Op) &&
81 cast<VPInstruction>(Op)->getUnderlyingInstr();
82 })) {
83 LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
84 return false;
85 }
86
87 // Check if opcodes and type width agree for all instructions in the bundle.
88 // FIXME: Differing widths/opcodes can be handled by inserting additional
89 // instructions.
90 // FIXME: Deal with non-primitive types.
91 const Instruction *OriginalInstr =
92 cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
93 unsigned Opcode = OriginalInstr->getOpcode();
94 unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
95 if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
96 const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
97 return I->getOpcode() == Opcode &&
98 I->getType()->getPrimitiveSizeInBits() == Width;
99 })) {
100 LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
101 return false;
102 }
103
104 // For now, all operands must be defined in the same BB.
105 if (any_of(Operands, [this](VPValue *Op) {
106 return cast<VPInstruction>(Op)->getParent() != &this->BB;
107 })) {
108 LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
109 return false;
110 }
111
112 if (any_of(Operands,
113 [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
114 LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
115 return false;
116 }
117
118 // For loads, check that there are no instructions writing to memory in
119 // between them.
120 // TODO: we only have to forbid instructions writing to memory that could
121 // interfere with any of the loads in the bundle
122 if (Opcode == Instruction::Load) {
123 unsigned LoadsSeen = 0;
124 VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
125 for (auto &I : *Parent) {
126 auto *VPI = cast<VPInstruction>(&I);
127 if (VPI->getOpcode() == Instruction::Load &&
128 std::find(Operands.begin(), Operands.end(), VPI) != Operands.end())
129 LoadsSeen++;
130
131 if (LoadsSeen == Operands.size())
132 break;
133 if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
134 LLVM_DEBUG(
135 dbgs() << "VPSLP: instruction modifying memory between loads\n");
136 return false;
137 }
138 }
139
140 if (!all_of(Operands, [](VPValue *Op) {
141 return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
142 ->isSimple();
143 })) {
144 LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
145 return false;
146 }
147 }
148
149 if (Opcode == Instruction::Store)
150 if (!all_of(Operands, [](VPValue *Op) {
151 return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
152 ->isSimple();
153 })) {
154 LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
155 return false;
156 }
157
158 return true;
159}
160
161static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
162 unsigned OperandIndex) {
163 SmallVector<VPValue *, 4> Operands;
164 for (VPValue *V : Values) {
165 auto *U = cast<VPUser>(V);
166 Operands.push_back(U->getOperand(OperandIndex));
167 }
168 return Operands;
169}
170
171static bool areCommutative(ArrayRef<VPValue *> Values) {
172 return Instruction::isCommutative(
173 cast<VPInstruction>(Values[0])->getOpcode());
174}
175
176static SmallVector<SmallVector<VPValue *, 4>, 4>
177getOperands(ArrayRef<VPValue *> Values) {
178 SmallVector<SmallVector<VPValue *, 4>, 4> Result;
179 auto *VPI = cast<VPInstruction>(Values[0]);
180
181 switch (VPI->getOpcode()) {
182 case Instruction::Load:
183 llvm_unreachable("Loads terminate a tree, no need to get operands");
184 case Instruction::Store:
185 Result.push_back(getOperands(Values, 0));
186 break;
187 default:
188 for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
189 Result.push_back(getOperands(Values, I));
190 break;
191 }
192
193 return Result;
194}
195
196/// Returns the opcode of Values or ~0 if they do not all agree.
197static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
198 unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
199 if (any_of(Values, [Opcode](VPValue *V) {
200 return cast<VPInstruction>(V)->getOpcode() != Opcode;
201 }))
202 return None;
203 return {Opcode};
204}
205
206/// Returns true if A and B access sequential memory if they are loads or
207/// stores or if they have identical opcodes otherwise.
208static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
209 VPInterleavedAccessInfo &IAI) {
210 if (A->getOpcode() != B->getOpcode())
211 return false;
212
213 if (A->getOpcode() != Instruction::Load &&
214 A->getOpcode() != Instruction::Store)
215 return true;
216 auto *GA = IAI.getInterleaveGroup(A);
217 auto *GB = IAI.getInterleaveGroup(B);
218
219 return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
220}
221
222/// Implements getLAScore from Listing 7 in the paper.
223/// Traverses and compares operands of V1 and V2 to MaxLevel.
224static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
225 VPInterleavedAccessInfo &IAI) {
226 if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2))
227 return 0;
228
229 if (MaxLevel == 0)
230 return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1),
231 cast<VPInstruction>(V2), IAI);
232
233 unsigned Score = 0;
234 for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I)
235 for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J)
236 Score += getLAScore(cast<VPUser>(V1)->getOperand(I),
237 cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI);
238 return Score;
239}
240
241std::pair<VPlanSlp::OpMode, VPValue *>
242VPlanSlp::getBest(OpMode Mode, VPValue *Last,
Florian Hahn6df11862018-11-14 15:58:40 +0000243 SmallPtrSetImpl<VPValue *> &Candidates,
Florian Hahn09e516c2018-11-14 13:11:49 +0000244 VPInterleavedAccessInfo &IAI) {
Florian Hahn6df11862018-11-14 15:58:40 +0000245 assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
246 "Currently we only handle load and commutative opcodes");
Florian Hahn09e516c2018-11-14 13:11:49 +0000247 LLVM_DEBUG(dbgs() << " getBest\n");
Florian Hahn09e516c2018-11-14 13:11:49 +0000248
Florian Hahn6df11862018-11-14 15:58:40 +0000249 SmallVector<VPValue *, 4> BestCandidates;
Florian Hahn09e516c2018-11-14 13:11:49 +0000250 LLVM_DEBUG(dbgs() << " Candidates for "
251 << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
252 for (auto *Candidate : Candidates) {
253 auto *LastI = cast<VPInstruction>(Last);
254 auto *CandidateI = cast<VPInstruction>(Candidate);
255 if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
256 LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
257 << " ");
258 BestCandidates.push_back(Candidate);
259 }
260 }
261 LLVM_DEBUG(dbgs() << "\n");
262
263 if (BestCandidates.empty())
264 return {OpMode::Failed, nullptr};
265
266 if (BestCandidates.size() == 1)
267 return {Mode, BestCandidates[0]};
268
Florian Hahn6df11862018-11-14 15:58:40 +0000269 VPValue *Best = nullptr;
270 unsigned BestScore = 0;
271 for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
272 unsigned PrevScore = ~0u;
273 bool AllSame = true;
Florian Hahn09e516c2018-11-14 13:11:49 +0000274
Florian Hahn6df11862018-11-14 15:58:40 +0000275 // FIXME: Avoid visiting the same operands multiple times.
276 for (auto *Candidate : BestCandidates) {
277 unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
278 if (PrevScore == ~0u)
Florian Hahn09e516c2018-11-14 13:11:49 +0000279 PrevScore = Score;
Florian Hahn6df11862018-11-14 15:58:40 +0000280 if (PrevScore != Score)
281 AllSame = false;
282 PrevScore = Score;
Florian Hahn09e516c2018-11-14 13:11:49 +0000283
Florian Hahn6df11862018-11-14 15:58:40 +0000284 if (Score > BestScore) {
285 BestScore = Score;
286 Best = Candidate;
Florian Hahn09e516c2018-11-14 13:11:49 +0000287 }
Florian Hahn09e516c2018-11-14 13:11:49 +0000288 }
Florian Hahn6df11862018-11-14 15:58:40 +0000289 if (!AllSame)
290 break;
Florian Hahn09e516c2018-11-14 13:11:49 +0000291 }
292 LLVM_DEBUG(dbgs() << "Found best "
293 << *cast<VPInstruction>(Best)->getUnderlyingInstr()
294 << "\n");
Florian Hahn6df11862018-11-14 15:58:40 +0000295 Candidates.erase(Best);
Florian Hahn09e516c2018-11-14 13:11:49 +0000296
297 return {Mode, Best};
298}
299
300SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
301 SmallVector<MultiNodeOpTy, 4> FinalOrder;
302 SmallVector<OpMode, 4> Mode;
303 FinalOrder.reserve(MultiNodeOps.size());
304 Mode.reserve(MultiNodeOps.size());
305
306 LLVM_DEBUG(dbgs() << "Reordering multinode\n");
307
308 for (auto &Operands : MultiNodeOps) {
309 FinalOrder.push_back({Operands.first, {Operands.second[0]}});
310 if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
311 Instruction::Load)
312 Mode.push_back(OpMode::Load);
313 else
314 Mode.push_back(OpMode::Opcode);
315 }
316
317 for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
318 LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
Florian Hahn6df11862018-11-14 15:58:40 +0000319 SmallPtrSet<VPValue *, 4> Candidates;
Florian Hahn09e516c2018-11-14 13:11:49 +0000320 LLVM_DEBUG(dbgs() << " Candidates ");
321 for (auto Ops : MultiNodeOps) {
322 LLVM_DEBUG(
323 dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
324 << " ");
Florian Hahn6df11862018-11-14 15:58:40 +0000325 Candidates.insert(Ops.second[Lane]);
Florian Hahn09e516c2018-11-14 13:11:49 +0000326 }
327 LLVM_DEBUG(dbgs() << "\n");
328
329 for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
330 LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
331 if (Mode[Op] == OpMode::Failed)
332 continue;
333
334 VPValue *Last = FinalOrder[Op].second[Lane - 1];
335 std::pair<OpMode, VPValue *> Res =
336 getBest(Mode[Op], Last, Candidates, IAI);
337 if (Res.second)
338 FinalOrder[Op].second.push_back(Res.second);
339 else
340 // TODO: handle this case
341 FinalOrder[Op].second.push_back(markFailed());
342 }
343 }
344
345 return FinalOrder;
346}
347
348void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
Florian Hahn02cb67d2018-11-14 13:33:44 +0000349 dbgs() << " Ops: ";
Florian Hahn09e516c2018-11-14 13:11:49 +0000350 for (auto Op : Values)
351 if (auto *Instr = cast_or_null<VPInstruction>(Op)->getUnderlyingInstr())
Florian Hahn02cb67d2018-11-14 13:33:44 +0000352 dbgs() << *Instr << " | ";
Florian Hahn09e516c2018-11-14 13:11:49 +0000353 else
Florian Hahn02cb67d2018-11-14 13:33:44 +0000354 dbgs() << " nullptr | ";
355 dbgs() << "\n";
Florian Hahn09e516c2018-11-14 13:11:49 +0000356}
357
358VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
359 assert(!Values.empty() && "Need some operands!");
360
361 // If we already visited this instruction bundle, re-use the existing node
362 auto I = BundleToCombined.find(to_vector<4>(Values));
363 if (I != BundleToCombined.end()) {
Florian Hahn2eca3722018-11-14 13:21:26 +0000364#ifndef NDEBUG
Florian Hahn09e516c2018-11-14 13:11:49 +0000365 // Check that the resulting graph is a tree. If we re-use a node, this means
366 // its values have multiple users. We only allow this, if all users of each
367 // value are the same instruction.
368 for (auto *V : Values) {
369 auto UI = V->user_begin();
370 auto *FirstUser = *UI++;
Florian Hahn2eca3722018-11-14 13:21:26 +0000371 while (UI != V->user_end()) {
Florian Hahn09e516c2018-11-14 13:11:49 +0000372 assert(*UI == FirstUser && "Currently we only support SLP trees.");
373 UI++;
374 }
375 }
376#endif
377 return I->second;
378 }
379
380 // Dump inputs
381 LLVM_DEBUG({
382 dbgs() << "buildGraph: ";
383 dumpBundle(Values);
384 });
385
386 if (!areVectorizable(Values))
387 return markFailed();
388
389 assert(getOpcode(Values) && "Opcodes for all values must match");
390 unsigned ValuesOpcode = getOpcode(Values).getValue();
391
392 SmallVector<VPValue *, 4> CombinedOperands;
393 if (areCommutative(Values)) {
394 bool MultiNodeRoot = !MultiNodeActive;
395 MultiNodeActive = true;
396 for (auto &Operands : getOperands(Values)) {
397 LLVM_DEBUG({
398 dbgs() << " Visiting Commutative";
399 dumpBundle(Operands);
400 });
401
402 auto OperandsOpcode = getOpcode(Operands);
403 if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
404 LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
405 CombinedOperands.push_back(buildGraph(Operands));
406 } else {
407 LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
408 // Create dummy VPInstruction, which will we replace later by the
409 // re-ordered operand.
410 VPInstruction *Op = new VPInstruction(0, {});
411 CombinedOperands.push_back(Op);
412 MultiNodeOps.emplace_back(Op, Operands);
413 }
414 }
415
416 if (MultiNodeRoot) {
417 LLVM_DEBUG(dbgs() << "Reorder \n");
418 MultiNodeActive = false;
419
420 auto FinalOrder = reorderMultiNodeOps();
421
422 MultiNodeOps.clear();
423 for (auto &Ops : FinalOrder) {
424 VPInstruction *NewOp = buildGraph(Ops.second);
425 Ops.first->replaceAllUsesWith(NewOp);
426 for (unsigned i = 0; i < CombinedOperands.size(); i++)
427 if (CombinedOperands[i] == Ops.first)
428 CombinedOperands[i] = NewOp;
429 delete Ops.first;
430 Ops.first = NewOp;
431 }
432 LLVM_DEBUG(dbgs() << "Found final order\n");
433 }
434 } else {
435 LLVM_DEBUG(dbgs() << " NonCommuntative\n");
436 if (ValuesOpcode == Instruction::Load)
437 for (VPValue *V : Values)
438 CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
439 else
440 for (auto &Operands : getOperands(Values))
441 CombinedOperands.push_back(buildGraph(Operands));
442 }
443
444 unsigned Opcode;
445 switch (ValuesOpcode) {
446 case Instruction::Load:
447 Opcode = VPInstruction::SLPLoad;
448 break;
449 case Instruction::Store:
450 Opcode = VPInstruction::SLPStore;
451 break;
452 default:
453 Opcode = ValuesOpcode;
454 break;
455 }
456
457 if (!CompletelySLP)
458 return markFailed();
459
460 assert(CombinedOperands.size() > 0 && "Need more some operands");
461 auto *VPI = new VPInstruction(Opcode, CombinedOperands);
462 VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
463
464 LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs());
465 cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n");
466 addCombined(Values, VPI);
467 return VPI;
468}