Add support for bottom-up SLP vectorization infrastructure.

This commit adds the infrastructure for performing bottom-up SLP vectorization (and other optimizations) on parallel computations.
The infrastructure has three potential users:

  1. The loop vectorizer needs to be able to vectorize AOS data structures such as (sum += A[i] + A[i+1]).

  2. The BB-vectorizer needs this infrastructure for bottom-up SLP vectorization, because bottom-up vectorization is faster to compute.

  3. A loop-roller needs to be able to analyze consecutive chains and roll them into a loop, in order to reduce code size. A loop roller does not need to create vector instructions, and this infrastructure separates the chain analysis from the vectorization.

This patch also includes a simple (100 LOC) bottom up SLP vectorizer that uses the infrastructure, and can vectorize this code:

void SAXPY(int *x, int *y, int a, int i) {
  x[i]   = a * x[i]   + y[i];
  x[i+1] = a * x[i+1] + y[i+1];
  x[i+2] = a * x[i+2] + y[i+2];
  x[i+3] = a * x[i+3] + y[i+3];
}

llvm-svn: 179117
diff --git a/llvm/lib/Transforms/Vectorize/VecUtils.cpp b/llvm/lib/Transforms/Vectorize/VecUtils.cpp
new file mode 100644
index 0000000..7e9f12d
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VecUtils.cpp
@@ -0,0 +1,439 @@
+//===- VecUtils.h --- Vectorization Utilities -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "VecUtils"
+
+#include "VecUtils.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <map>
+
+using namespace llvm;
+
+namespace llvm {
+
+BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl,
+                             TargetTransformInfo *Tti, AliasAnalysis *Aa) :
+                             BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa) {
+  numberInstructions();
+}
+
+void BoUpSLP::numberInstructions() {
+  int Loc = 0;
+  InstrIdx.clear();
+  InstrVec.clear();
+  // Number the instructions in the block.
+  for (BasicBlock::iterator it=BB->begin(), e=BB->end(); it != e; ++it) {
+    InstrIdx[it] = Loc++;
+    InstrVec.push_back(it);
+    assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
+  }
+}
+
+Value *BoUpSLP::getPointerOperand(Value *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
+  if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
+  return 0;
+}
+
+unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
+  if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
+  if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
+  return -1;
+}
+
+bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
+  Value *PtrA = getPointerOperand(A);
+  Value *PtrB = getPointerOperand(B);
+  unsigned ASA = getAddressSpaceOperand(A);
+  unsigned ASB = getAddressSpaceOperand(B);
+
+  // Check that the address spaces match and that the pointers are valid.
+  if (!PtrA || !PtrB || (ASA != ASB)) return false;
+
+  // Check that A and B are of the same type.
+  if (PtrA->getType() != PtrB->getType()) return false;
+
+  // Calculate the distance.
+  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
+  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
+  const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
+  const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);
+
+  // Non constant distance.
+  if (!ConstOffSCEV) return false;
+
+  unsigned Offset = ConstOffSCEV->getValue()->getSExtValue();
+  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
+  // The Instructions are connsecutive if the size of the first load/store is
+  // the same as the offset.
+  unsigned Sz = (DL ? DL->getTypeStoreSize(Ty) : Ty->getScalarSizeInBits()/8);
+  return ((-Offset) == Sz);
+}
+
+bool BoUpSLP::vectorizeStores(StoreList &Stores, int costThreshold) {
+  ValueSet Heads, Tails;
+  SmallDenseMap<Value*, Value*> ConsecutiveChain;
+  bool Changed = false;
+
+  // Do a quadratic search on all of the given stores and find
+  // all of the pairs of loads that follow each other.
+  for (unsigned i = 0, e = Stores.size(); i < e; ++i)
+    for (unsigned j = 0; j < e; ++j) {
+      if (i == j) continue;
+      if (isConsecutiveAccess(Stores[i], Stores[j])) {
+        Tails.insert(Stores[j]);
+        Heads.insert(Stores[i]);
+        ConsecutiveChain[Stores[i]] = Stores[j];
+      }
+    }
+
+  // For stores that start but don't end a link in the chain:
+  for (ValueSet::iterator it = Heads.begin(), e = Heads.end();it != e; ++it) {
+    if (Tails.count(*it)) continue;
+
+    // We found a store instr that starts a chain. Now follow the chain and try
+    // to vectorize it.
+    ValueList Operands;
+    Value *I = *it;
+    int MinCost = 0, MinVF = 0;
+    while (Tails.count(I) || Heads.count(I)) {
+      Operands.push_back(I);
+      unsigned VF = Operands.size();
+      if (isPowerOf2_32(VF) && VF > 1) {
+        int cost = getTreeRollCost(Operands, 0);
+        DEBUG(dbgs() << "Found cost=" << cost << " for VF=" << VF << "\n");
+        if (cost < MinCost) { MinCost = cost; MinVF = VF; }
+      }
+      // Move to the next value in the chain.
+      I = ConsecutiveChain[I];
+    }
+
+    if (MinCost <= costThreshold && MinVF > 1) {
+      DEBUG(dbgs() << "Decided to vectorize cost=" << MinCost << "\n");
+      vectorizeTree(Operands, MinVF);
+      Stores.clear();
+      // The current numbering is invalid because we added and removed instrs.
+      numberInstructions();
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+int BoUpSLP::getScalarizationCost(Type *Ty) {
+  int Cost = 0;
+  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
+    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+  return Cost;
+}
+
+AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
+  if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI);
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) return AA->getLocation(LI);
+  return AliasAnalysis::Location();
+}
+
+Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
+  assert(Src->getParent() == Dst->getParent() && "Not the same BB");
+  BasicBlock::iterator I = Src, E = Dst;
+  /// Scan all of the instruction from SRC to DST and check if
+  /// the source may alias.
+  for (++I; I != E; ++I) {
+    // Ignore store instructions that are marked as 'ignore'.
+    if (MemBarrierIgnoreList.count(I)) continue;
+    if (Src->mayWriteToMemory()) /* Write */ {
+      if (!I->mayReadOrWriteMemory()) continue;
+    } else /* Read */ {
+      if (!I->mayWriteToMemory()) continue;
+    }
+    AliasAnalysis::Location A = getLocation(&*I);
+    AliasAnalysis::Location B = getLocation(Src);
+
+    if (!A.Ptr || !B.Ptr || AA->alias(A, B))
+      return I;
+  }
+  return 0;
+}
+
+int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) {
+  if (Depth == 6) return max_cost;
+  Type *ScalarTy = VL[0]->getType();
+
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+
+  /// Don't mess with vectors.
+  if (ScalarTy->isVectorTy()) return max_cost;
+
+  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+  // Check if all of the operands are constants.
+  bool AllConst = true;
+  bool AllSameScalar = true;
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    AllConst &= isa<Constant>(VL[i]);
+    AllSameScalar &= (VL[0] == VL[i]);
+    // Must have a single use.
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // Need to scalarize instructions with multiple users or from other BBs.
+    if (I && ((I->getNumUses() > 1) || (I->getParent() != BB)))
+      return getScalarizationCost(VecTy);
+  }
+
+  // Is this a simple vector constant.
+  if (AllConst) return 0;
+
+  // If all of the operands are identical we can broadcast them.
+  if (AllSameScalar)
+    return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+
+  // Scalarize unknown structures.
+  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+  if (!VL0) return getScalarizationCost(VecTy);
+  assert(VL0->getParent() == BB && "Wrong BB");
+
+  unsigned Opcode = VL0->getOpcode();
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If not all of the instructions are identical then we have to scalarize.
+    if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy);
+  }
+
+  // Check if it is safe to sink the loads or the stores.
+  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
+    int MaxIdx = InstrIdx[VL0];
+    for (unsigned i = 1, e = VL.size(); i < e; ++i )
+      MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
+
+    Instruction *Last = InstrVec[MaxIdx];
+    for (unsigned i = 0, e = VL.size(); i < e; ++i ) {
+      if (VL[i] == Last) continue;
+      Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
+      if (Barrier) {
+        DEBUG(dbgs() << "LR: Can't sink " << *VL[i] << "\n down to " <<
+              *Last << "\n because of " << *Barrier << "\n");
+        return max_cost;
+      }
+    }
+  }
+
+  switch (Opcode) {
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    ValueList Operands;
+    int Cost = 0;
+    // Calculate the cost of all of the operands.
+    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      // Prepare the operand vector.
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+      Cost += getTreeRollCost(Operands, Depth+1);
+      Operands.clear();
+    }
+
+    // Calculate the cost of this instruction.
+    int ScalarCost = VecTy->getNumElements() *
+      TTI->getArithmeticInstrCost(Opcode, ScalarTy);
+    int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
+    Cost += (VecCost - ScalarCost);
+    return Cost;
+  }
+  case Instruction::Load: {
+    // If we are scalarize the loads, add the cost of forming the vector.
+    for (unsigned i = 0, e = VL.size()-1; i < e; ++i)
+      if (!isConsecutiveAccess(VL[i], VL[i+1]))
+        return getScalarizationCost(VecTy);
+
+    // Cost of wide load - cost of scalar loads.
+    int ScalarLdCost = VecTy->getNumElements() *
+      TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+    return VecLdCost - ScalarLdCost;
+  }
+  case Instruction::Store: {
+    // We know that we can merge the stores. Calculate the cost.
+    int ScalarStCost = VecTy->getNumElements() *
+      TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1,0);
+    int StoreCost = VecStCost - ScalarStCost;
+
+    ValueList Operands;
+    for (unsigned j = 0; j < VL.size(); ++j) {
+      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+      MemBarrierIgnoreList.insert(VL[j]);
+    }
+
+    int TotalCost =  StoreCost + getTreeRollCost(Operands, Depth + 1);
+    MemBarrierIgnoreList.clear();
+    return TotalCost;
+  }
+  default:
+    // Unable to vectorize unknown instructions.
+    return getScalarizationCost(VecTy);
+  }
+}
+
+Instruction *BoUpSLP::GetLastInstr(ValueList &VL, unsigned VF) {
+  int MaxIdx = InstrIdx[BB->getFirstNonPHI()];
+  for (unsigned i = 0; i < VF; ++i )
+    MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
+  return InstrVec[MaxIdx + 1];
+}
+
+Value *BoUpSLP::Scalarize(ValueList &VL, VectorType *Ty) {
+  IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements()));
+  Value *Vec = UndefValue::get(Ty);
+  for (unsigned i=0; i < Ty->getNumElements(); ++i)
+    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
+  return Vec;
+}
+
+Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) {
+  Type *ScalarTy = VL[0]->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+  VectorType *VecTy = VectorType::get(ScalarTy, VF);
+
+  // Check if all of the operands are constants or identical.
+  bool AllConst = true;
+  bool AllSameScalar = true;
+  for (unsigned i = 0, e = VF; i < e; ++i) {
+    AllConst &= !!dyn_cast<Constant>(VL[i]);
+    AllSameScalar &= (VL[0] == VL[i]);
+    // Must have a single use.
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (I && (I->getNumUses() > 1 || I->getParent() != BB))
+      return Scalarize(VL, VecTy);
+  }
+
+  // Is this a simple vector constant.
+  if (AllConst || AllSameScalar) return Scalarize(VL, VecTy);
+
+  // Scalarize unknown structures.
+  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+  if (!VL0) return Scalarize(VL, VecTy);
+
+  unsigned Opcode = VL0->getOpcode();
+  for (unsigned i = 0, e = VF; i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If not all of the instructions are identical then we have to scalarize.
+    if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy);
+  }
+
+  switch (Opcode) {
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    ValueList LHSVL, RHSVL;
+    for (int i = 0; i < VF; ++i) {
+      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
+      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
+    }
+
+    Value *RHS = vectorizeTree(RHSVL, VF);
+    Value *LHS = vectorizeTree(LHSVL, VF);
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    BinaryOperator *BinOp = dyn_cast<BinaryOperator>(VL0);
+    return Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS);
+  }
+  case Instruction::Load: {
+    LoadInst *LI = dyn_cast<LoadInst>(VL0);
+    unsigned Alignment = LI->getAlignment();
+
+    // Check if all of the loads are consecutive.
+    for (unsigned i = 1, e = VF; i < e; ++i)
+      if (!isConsecutiveAccess(VL[i-1], VL[i]))
+        return Scalarize(VL, VecTy);
+
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
+                                          VecTy->getPointerTo());
+    LI = Builder.CreateLoad(VecPtr);
+    LI->setAlignment(Alignment);
+    return LI;
+  }
+  case Instruction::Store: {
+    StoreInst *SI = dyn_cast<StoreInst>(VL0);
+    unsigned Alignment = SI->getAlignment();
+
+    ValueList ValueOp;
+    for (int i = 0; i < VF; ++i)
+      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
+
+    Value *VecValue = vectorizeTree(ValueOp, VF);
+
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
+                                          VecTy->getPointerTo());
+    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
+
+    for (int i = 0; i < VF; ++i)
+      cast<Instruction>(VL[i])->eraseFromParent();
+    return 0;
+  }
+  default:
+    return Scalarize(VL, VecTy);
+  }
+}
+
+} // end of namespace