|  | //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===// | 
|  | // | 
|  | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|  | // See https://llvm.org/LICENSE.txt for license information. | 
|  | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  | /// This file implements SLP analysis based on VPlan. The analysis is based on | 
|  | /// the ideas described in | 
|  | /// | 
|  | ///   Look-ahead SLP: auto-vectorization in the presence of commutative | 
|  | ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, | 
|  | ///   Luís F. W. Góes | 
|  | /// | 
|  | //===----------------------------------------------------------------------===// | 
|  |  | 
|  | #include "VPlan.h" | 
|  | #include "llvm/ADT/DepthFirstIterator.h" | 
|  | #include "llvm/ADT/PostOrderIterator.h" | 
|  | #include "llvm/ADT/SmallVector.h" | 
|  | #include "llvm/ADT/Twine.h" | 
|  | #include "llvm/Analysis/LoopInfo.h" | 
|  | #include "llvm/Analysis/VectorUtils.h" | 
|  | #include "llvm/IR/BasicBlock.h" | 
|  | #include "llvm/IR/CFG.h" | 
|  | #include "llvm/IR/Dominators.h" | 
|  | #include "llvm/IR/InstrTypes.h" | 
|  | #include "llvm/IR/Instruction.h" | 
|  | #include "llvm/IR/Instructions.h" | 
|  | #include "llvm/IR/Type.h" | 
|  | #include "llvm/IR/Value.h" | 
|  | #include "llvm/Support/Casting.h" | 
|  | #include "llvm/Support/Debug.h" | 
|  | #include "llvm/Support/ErrorHandling.h" | 
|  | #include "llvm/Support/GraphWriter.h" | 
|  | #include "llvm/Support/raw_ostream.h" | 
|  | #include "llvm/Transforms/Utils/BasicBlockUtils.h" | 
|  | #include <cassert> | 
|  | #include <iterator> | 
|  | #include <string> | 
|  | #include <vector> | 
|  |  | 
|  | using namespace llvm; | 
|  |  | 
|  | #define DEBUG_TYPE "vplan-slp" | 
|  |  | 
|  | // Number of levels to look ahead when re-ordering multi node operands. | 
|  | static unsigned LookaheadMaxDepth = 5; | 
|  |  | 
|  | VPInstruction *VPlanSlp::markFailed() { | 
|  | // FIXME: Currently this is used to signal we hit instructions we cannot | 
|  | //        trivially SLP'ize. | 
|  | CompletelySLP = false; | 
|  | return nullptr; | 
|  | } | 
|  |  | 
|  | void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) { | 
|  | if (all_of(Operands, [](VPValue *V) { | 
|  | return cast<VPInstruction>(V)->getUnderlyingInstr(); | 
|  | })) { | 
|  | unsigned BundleSize = 0; | 
|  | for (VPValue *V : Operands) { | 
|  | Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType(); | 
|  | assert(!T->isVectorTy() && "Only scalar types supported for now"); | 
|  | BundleSize += T->getScalarSizeInBits(); | 
|  | } | 
|  | WidestBundleBits = std::max(WidestBundleBits, BundleSize); | 
|  | } | 
|  |  | 
|  | auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New); | 
|  | assert(Res.second && | 
|  | "Already created a combined instruction for the operand bundle"); | 
|  | (void)Res; | 
|  | } | 
|  |  | 
|  | bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const { | 
|  | // Currently we only support VPInstructions. | 
|  | if (!all_of(Operands, [](VPValue *Op) { | 
|  | return Op && isa<VPInstruction>(Op) && | 
|  | cast<VPInstruction>(Op)->getUnderlyingInstr(); | 
|  | })) { | 
|  | LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n"); | 
|  | return false; | 
|  | } | 
|  |  | 
|  | // Check if opcodes and type width agree for all instructions in the bundle. | 
|  | // FIXME: Differing widths/opcodes can be handled by inserting additional | 
|  | //        instructions. | 
|  | // FIXME: Deal with non-primitive types. | 
|  | const Instruction *OriginalInstr = | 
|  | cast<VPInstruction>(Operands[0])->getUnderlyingInstr(); | 
|  | unsigned Opcode = OriginalInstr->getOpcode(); | 
|  | unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits(); | 
|  | if (!all_of(Operands, [Opcode, Width](VPValue *Op) { | 
|  | const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr(); | 
|  | return I->getOpcode() == Opcode && | 
|  | I->getType()->getPrimitiveSizeInBits() == Width; | 
|  | })) { | 
|  | LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n"); | 
|  | return false; | 
|  | } | 
|  |  | 
|  | // For now, all operands must be defined in the same BB. | 
|  | if (any_of(Operands, [this](VPValue *Op) { | 
|  | return cast<VPInstruction>(Op)->getParent() != &this->BB; | 
|  | })) { | 
|  | LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n"); | 
|  | return false; | 
|  | } | 
|  |  | 
|  | if (any_of(Operands, | 
|  | [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) { | 
|  | LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n"); | 
|  | return false; | 
|  | } | 
|  |  | 
|  | // For loads, check that there are no instructions writing to memory in | 
|  | // between them. | 
|  | // TODO: we only have to forbid instructions writing to memory that could | 
|  | //       interfere with any of the loads in the bundle | 
|  | if (Opcode == Instruction::Load) { | 
|  | unsigned LoadsSeen = 0; | 
|  | VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent(); | 
|  | for (auto &I : *Parent) { | 
|  | auto *VPI = cast<VPInstruction>(&I); | 
|  | if (VPI->getOpcode() == Instruction::Load && | 
|  | llvm::is_contained(Operands, VPI)) | 
|  | LoadsSeen++; | 
|  |  | 
|  | if (LoadsSeen == Operands.size()) | 
|  | break; | 
|  | if (LoadsSeen > 0 && VPI->mayWriteToMemory()) { | 
|  | LLVM_DEBUG( | 
|  | dbgs() << "VPSLP: instruction modifying memory between loads\n"); | 
|  | return false; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (!all_of(Operands, [](VPValue *Op) { | 
|  | return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr()) | 
|  | ->isSimple(); | 
|  | })) { | 
|  | LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n"); | 
|  | return false; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (Opcode == Instruction::Store) | 
|  | if (!all_of(Operands, [](VPValue *Op) { | 
|  | return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr()) | 
|  | ->isSimple(); | 
|  | })) { | 
|  | LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n"); | 
|  | return false; | 
|  | } | 
|  |  | 
|  | return true; | 
|  | } | 
|  |  | 
|  | static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values, | 
|  | unsigned OperandIndex) { | 
|  | SmallVector<VPValue *, 4> Operands; | 
|  | for (VPValue *V : Values) { | 
|  | auto *U = cast<VPUser>(V); | 
|  | Operands.push_back(U->getOperand(OperandIndex)); | 
|  | } | 
|  | return Operands; | 
|  | } | 
|  |  | 
|  | static bool areCommutative(ArrayRef<VPValue *> Values) { | 
|  | return Instruction::isCommutative( | 
|  | cast<VPInstruction>(Values[0])->getOpcode()); | 
|  | } | 
|  |  | 
|  | static SmallVector<SmallVector<VPValue *, 4>, 4> | 
|  | getOperands(ArrayRef<VPValue *> Values) { | 
|  | SmallVector<SmallVector<VPValue *, 4>, 4> Result; | 
|  | auto *VPI = cast<VPInstruction>(Values[0]); | 
|  |  | 
|  | switch (VPI->getOpcode()) { | 
|  | case Instruction::Load: | 
|  | llvm_unreachable("Loads terminate a tree, no need to get operands"); | 
|  | case Instruction::Store: | 
|  | Result.push_back(getOperands(Values, 0)); | 
|  | break; | 
|  | default: | 
|  | for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I) | 
|  | Result.push_back(getOperands(Values, I)); | 
|  | break; | 
|  | } | 
|  |  | 
|  | return Result; | 
|  | } | 
|  |  | 
|  | /// Returns the opcode of Values or ~0 if they do not all agree. | 
|  | static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) { | 
|  | unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode(); | 
|  | if (any_of(Values, [Opcode](VPValue *V) { | 
|  | return cast<VPInstruction>(V)->getOpcode() != Opcode; | 
|  | })) | 
|  | return None; | 
|  | return {Opcode}; | 
|  | } | 
|  |  | 
|  | /// Returns true if A and B access sequential memory if they are loads or | 
|  | /// stores or if they have identical opcodes otherwise. | 
|  | static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B, | 
|  | VPInterleavedAccessInfo &IAI) { | 
|  | if (A->getOpcode() != B->getOpcode()) | 
|  | return false; | 
|  |  | 
|  | if (A->getOpcode() != Instruction::Load && | 
|  | A->getOpcode() != Instruction::Store) | 
|  | return true; | 
|  | auto *GA = IAI.getInterleaveGroup(A); | 
|  | auto *GB = IAI.getInterleaveGroup(B); | 
|  |  | 
|  | return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B); | 
|  | } | 
|  |  | 
|  | /// Implements getLAScore from Listing 7 in the paper. | 
|  | /// Traverses and compares operands of V1 and V2 to MaxLevel. | 
|  | static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel, | 
|  | VPInterleavedAccessInfo &IAI) { | 
|  | if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2)) | 
|  | return 0; | 
|  |  | 
|  | if (MaxLevel == 0) | 
|  | return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1), | 
|  | cast<VPInstruction>(V2), IAI); | 
|  |  | 
|  | unsigned Score = 0; | 
|  | for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I) | 
|  | for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J) | 
|  | Score += getLAScore(cast<VPUser>(V1)->getOperand(I), | 
|  | cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI); | 
|  | return Score; | 
|  | } | 
|  |  | 
|  | std::pair<VPlanSlp::OpMode, VPValue *> | 
|  | VPlanSlp::getBest(OpMode Mode, VPValue *Last, | 
|  | SmallPtrSetImpl<VPValue *> &Candidates, | 
|  | VPInterleavedAccessInfo &IAI) { | 
|  | assert((Mode == OpMode::Load || Mode == OpMode::Opcode) && | 
|  | "Currently we only handle load and commutative opcodes"); | 
|  | LLVM_DEBUG(dbgs() << "      getBest\n"); | 
|  |  | 
|  | SmallVector<VPValue *, 4> BestCandidates; | 
|  | LLVM_DEBUG(dbgs() << "        Candidates  for " | 
|  | << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " "); | 
|  | for (auto *Candidate : Candidates) { | 
|  | auto *LastI = cast<VPInstruction>(Last); | 
|  | auto *CandidateI = cast<VPInstruction>(Candidate); | 
|  | if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) { | 
|  | LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr() | 
|  | << " "); | 
|  | BestCandidates.push_back(Candidate); | 
|  | } | 
|  | } | 
|  | LLVM_DEBUG(dbgs() << "\n"); | 
|  |  | 
|  | if (BestCandidates.empty()) | 
|  | return {OpMode::Failed, nullptr}; | 
|  |  | 
|  | if (BestCandidates.size() == 1) | 
|  | return {Mode, BestCandidates[0]}; | 
|  |  | 
|  | VPValue *Best = nullptr; | 
|  | unsigned BestScore = 0; | 
|  | for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) { | 
|  | unsigned PrevScore = ~0u; | 
|  | bool AllSame = true; | 
|  |  | 
|  | // FIXME: Avoid visiting the same operands multiple times. | 
|  | for (auto *Candidate : BestCandidates) { | 
|  | unsigned Score = getLAScore(Last, Candidate, Depth, IAI); | 
|  | if (PrevScore == ~0u) | 
|  | PrevScore = Score; | 
|  | if (PrevScore != Score) | 
|  | AllSame = false; | 
|  | PrevScore = Score; | 
|  |  | 
|  | if (Score > BestScore) { | 
|  | BestScore = Score; | 
|  | Best = Candidate; | 
|  | } | 
|  | } | 
|  | if (!AllSame) | 
|  | break; | 
|  | } | 
|  | LLVM_DEBUG(dbgs() << "Found best " | 
|  | << *cast<VPInstruction>(Best)->getUnderlyingInstr() | 
|  | << "\n"); | 
|  | Candidates.erase(Best); | 
|  |  | 
|  | return {Mode, Best}; | 
|  | } | 
|  |  | 
|  | SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() { | 
|  | SmallVector<MultiNodeOpTy, 4> FinalOrder; | 
|  | SmallVector<OpMode, 4> Mode; | 
|  | FinalOrder.reserve(MultiNodeOps.size()); | 
|  | Mode.reserve(MultiNodeOps.size()); | 
|  |  | 
|  | LLVM_DEBUG(dbgs() << "Reordering multinode\n"); | 
|  |  | 
|  | for (auto &Operands : MultiNodeOps) { | 
|  | FinalOrder.push_back({Operands.first, {Operands.second[0]}}); | 
|  | if (cast<VPInstruction>(Operands.second[0])->getOpcode() == | 
|  | Instruction::Load) | 
|  | Mode.push_back(OpMode::Load); | 
|  | else | 
|  | Mode.push_back(OpMode::Opcode); | 
|  | } | 
|  |  | 
|  | for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) { | 
|  | LLVM_DEBUG(dbgs() << "  Finding best value for lane " << Lane << "\n"); | 
|  | SmallPtrSet<VPValue *, 4> Candidates; | 
|  | LLVM_DEBUG(dbgs() << "  Candidates  "); | 
|  | for (auto Ops : MultiNodeOps) { | 
|  | LLVM_DEBUG( | 
|  | dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr() | 
|  | << " "); | 
|  | Candidates.insert(Ops.second[Lane]); | 
|  | } | 
|  | LLVM_DEBUG(dbgs() << "\n"); | 
|  |  | 
|  | for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) { | 
|  | LLVM_DEBUG(dbgs() << "  Checking " << Op << "\n"); | 
|  | if (Mode[Op] == OpMode::Failed) | 
|  | continue; | 
|  |  | 
|  | VPValue *Last = FinalOrder[Op].second[Lane - 1]; | 
|  | std::pair<OpMode, VPValue *> Res = | 
|  | getBest(Mode[Op], Last, Candidates, IAI); | 
|  | if (Res.second) | 
|  | FinalOrder[Op].second.push_back(Res.second); | 
|  | else | 
|  | // TODO: handle this case | 
|  | FinalOrder[Op].second.push_back(markFailed()); | 
|  | } | 
|  | } | 
|  |  | 
|  | return FinalOrder; | 
|  | } | 
|  |  | 
|  | void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) { | 
|  | dbgs() << " Ops: "; | 
|  | for (auto Op : Values) { | 
|  | if (auto *VPInstr = cast_or_null<VPInstruction>(Op)) | 
|  | if (auto *Instr = VPInstr->getUnderlyingInstr()) { | 
|  | dbgs() << *Instr << " | "; | 
|  | continue; | 
|  | } | 
|  | dbgs() << " nullptr | "; | 
|  | } | 
|  | dbgs() << "\n"; | 
|  | } | 
|  |  | 
|  | VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) { | 
|  | assert(!Values.empty() && "Need some operands!"); | 
|  |  | 
|  | // If we already visited this instruction bundle, re-use the existing node | 
|  | auto I = BundleToCombined.find(to_vector<4>(Values)); | 
|  | if (I != BundleToCombined.end()) { | 
|  | #ifndef NDEBUG | 
|  | // Check that the resulting graph is a tree. If we re-use a node, this means | 
|  | // its values have multiple users. We only allow this, if all users of each | 
|  | // value are the same instruction. | 
|  | for (auto *V : Values) { | 
|  | auto UI = V->user_begin(); | 
|  | auto *FirstUser = *UI++; | 
|  | while (UI != V->user_end()) { | 
|  | assert(*UI == FirstUser && "Currently we only support SLP trees."); | 
|  | UI++; | 
|  | } | 
|  | } | 
|  | #endif | 
|  | return I->second; | 
|  | } | 
|  |  | 
|  | // Dump inputs | 
|  | LLVM_DEBUG({ | 
|  | dbgs() << "buildGraph: "; | 
|  | dumpBundle(Values); | 
|  | }); | 
|  |  | 
|  | if (!areVectorizable(Values)) | 
|  | return markFailed(); | 
|  |  | 
|  | assert(getOpcode(Values) && "Opcodes for all values must match"); | 
|  | unsigned ValuesOpcode = getOpcode(Values).getValue(); | 
|  |  | 
|  | SmallVector<VPValue *, 4> CombinedOperands; | 
|  | if (areCommutative(Values)) { | 
|  | bool MultiNodeRoot = !MultiNodeActive; | 
|  | MultiNodeActive = true; | 
|  | for (auto &Operands : getOperands(Values)) { | 
|  | LLVM_DEBUG({ | 
|  | dbgs() << "  Visiting Commutative"; | 
|  | dumpBundle(Operands); | 
|  | }); | 
|  |  | 
|  | auto OperandsOpcode = getOpcode(Operands); | 
|  | if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) { | 
|  | LLVM_DEBUG(dbgs() << "    Same opcode, continue building\n"); | 
|  | CombinedOperands.push_back(buildGraph(Operands)); | 
|  | } else { | 
|  | LLVM_DEBUG(dbgs() << "    Adding multinode Ops\n"); | 
|  | // Create dummy VPInstruction, which will we replace later by the | 
|  | // re-ordered operand. | 
|  | VPInstruction *Op = new VPInstruction(0, {}); | 
|  | CombinedOperands.push_back(Op); | 
|  | MultiNodeOps.emplace_back(Op, Operands); | 
|  | } | 
|  | } | 
|  |  | 
|  | if (MultiNodeRoot) { | 
|  | LLVM_DEBUG(dbgs() << "Reorder \n"); | 
|  | MultiNodeActive = false; | 
|  |  | 
|  | auto FinalOrder = reorderMultiNodeOps(); | 
|  |  | 
|  | MultiNodeOps.clear(); | 
|  | for (auto &Ops : FinalOrder) { | 
|  | VPInstruction *NewOp = buildGraph(Ops.second); | 
|  | Ops.first->replaceAllUsesWith(NewOp); | 
|  | for (unsigned i = 0; i < CombinedOperands.size(); i++) | 
|  | if (CombinedOperands[i] == Ops.first) | 
|  | CombinedOperands[i] = NewOp; | 
|  | delete Ops.first; | 
|  | Ops.first = NewOp; | 
|  | } | 
|  | LLVM_DEBUG(dbgs() << "Found final order\n"); | 
|  | } | 
|  | } else { | 
|  | LLVM_DEBUG(dbgs() << "  NonCommuntative\n"); | 
|  | if (ValuesOpcode == Instruction::Load) | 
|  | for (VPValue *V : Values) | 
|  | CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0)); | 
|  | else | 
|  | for (auto &Operands : getOperands(Values)) | 
|  | CombinedOperands.push_back(buildGraph(Operands)); | 
|  | } | 
|  |  | 
|  | unsigned Opcode; | 
|  | switch (ValuesOpcode) { | 
|  | case Instruction::Load: | 
|  | Opcode = VPInstruction::SLPLoad; | 
|  | break; | 
|  | case Instruction::Store: | 
|  | Opcode = VPInstruction::SLPStore; | 
|  | break; | 
|  | default: | 
|  | Opcode = ValuesOpcode; | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (!CompletelySLP) | 
|  | return markFailed(); | 
|  |  | 
|  | assert(CombinedOperands.size() > 0 && "Need more some operands"); | 
|  | auto *VPI = new VPInstruction(Opcode, CombinedOperands); | 
|  | VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr()); | 
|  |  | 
|  | LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs()); | 
|  | cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n"); | 
|  | addCombined(Values, VPI); | 
|  | return VPI; | 
|  | } |