SLPVectorizer: add initial support for reduction variable vectorization.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179470 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 209d287..2f55a00 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -26,6 +26,7 @@
#include "llvm/Analysis/Verifier.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
@@ -38,7 +39,7 @@
using namespace llvm;
static cl::opt<int>
-SLPCostThreshold("slp-threshold", cl::init(1), cl::Hidden,
+SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
cl::desc("Only vectorize trees if the gain is above this "
"number. (gain = -cost of vectorization)"));
namespace {
@@ -63,7 +64,7 @@
/// object. We sort the stores to their base objects to reduce the cost of the
/// quadratic search on the stores. TODO: We can further reduce this cost
/// if we flush the chain creation every time we run into a memory barrier.
- bool CollectStores(BasicBlock *BB, BoUpSLP &R) {
+ bool collectStores(BasicBlock *BB, BoUpSLP &R) {
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
StoreInst *SI = dyn_cast<StoreInst>(it);
if (!SI)
@@ -84,7 +85,79 @@
return true;
}
- bool RollStoreChains(BoUpSLP &R) {
+ bool tryToVectorizePair(BinaryOperator *A, BinaryOperator *B, BoUpSLP &R) {
+ if (!A || !B) return false;
+ BoUpSLP::ValueList VL;
+ VL.push_back(A);
+ VL.push_back(B);
+ int Cost = R.getTreeCost(VL);
+ DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << ".\n");
+ if (Cost >= -SLPCostThreshold) return false;
+ DEBUG(dbgs()<<"SLP: Vectorizing pair.\n");
+ R.vectorizeArith(VL);
+ return true;
+ }
+
+ bool tryToVectorizeCandidate(BinaryOperator *V, BoUpSLP &R) {
+ if (!V) return false;
+ BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
+ BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
+ // Try to vectorize V.
+ if (tryToVectorizePair(A, B, R)) return true;
+
+ // Try to skip B.
+ if (B && B->hasOneUse()) {
+ BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (tryToVectorizePair(A, B0, R)) {
+ B->moveBefore(V);
+ return true;
+ }
+ if (tryToVectorizePair(A, B1, R)) {
+ B->moveBefore(V);
+ return true;
+ }
+ }
+
+ // Try to slip A.
+ if (A && A->hasOneUse()) {
+ BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (tryToVectorizePair(A0, B, R)) {
+ A->moveBefore(V);
+ return true;
+ }
+ if (tryToVectorizePair(A1, B, R)) {
+ A->moveBefore(V);
+ return true;
+ }
+ }
+ return 0;
+ }
+
+ bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R) {
+ bool Changed = false;
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ if (isa<DbgInfoIntrinsic>(it)) continue;
+ PHINode *P = dyn_cast<PHINode>(it);
+ if (!P) return Changed;
+ // Check that the PHI is a reduction PHI.
+ if (P->getNumIncomingValues() != 2) return Changed;
+ Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) :
+ (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : 0));
+ // Check if this is a Binary Operator.
+ BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
+ if (!BI) continue;
+
+ Value *Inst = BI->getOperand(0);
+ if (Inst == P) Inst = BI->getOperand(1);
+ Changed |= tryToVectorizeCandidate(dyn_cast<BinaryOperator>(Inst), R);
+ }
+
+ return Changed;
+ }
+
+ bool rollStoreChains(BoUpSLP &R) {
bool Changed = false;
// Attempt to sort and vectorize each of the store-groups.
for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
@@ -116,13 +189,15 @@
// he store instructions.
BoUpSLP R(&BB, SE, DL, TTI, AA);
- if (!CollectStores(&BB, R))
- return false;
+ bool Changed = vectorizeReductions(&BB, R);
- bool Changed = RollStoreChains(R);
- if (Changed) {
+ if (!collectStores(&BB, R))
+ return Changed;
+
+ if (rollStoreChains(R)) {
DEBUG(dbgs()<<"SLP: vectorized in \""<<BB.getParent()->getName()<<"\"\n");
DEBUG(verifyFunction(*BB.getParent()));
+ Changed |= true;
}
return Changed;
diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp
index d8be0ae..4d075c5 100644
--- a/lib/Transforms/Vectorize/VecUtils.cpp
+++ b/lib/Transforms/Vectorize/VecUtils.cpp
@@ -208,6 +208,16 @@
return 0;
}
+void BoUpSLP::vectorizeArith(ValueList &Operands) {
+ Value *Vec = vectorizeTree(Operands, Operands.size());
+ BasicBlock::iterator Loc = cast<Instruction>(Vec);
+ IRBuilder<> Builder(++Loc);
+ for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+ Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i));
+ Operands[i]->replaceAllUsesWith(S);
+ }
+}
+
int BoUpSLP::getTreeCost(ValueList &VL) {
// Get rid of the list of stores that were removed, and from the
// lists of instructions with multiple users.
diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h
index 808fb10..f865236 100644
--- a/lib/Transforms/Vectorize/VecUtils.h
+++ b/lib/Transforms/Vectorize/VecUtils.h
@@ -66,6 +66,9 @@
/// \returns true if the basic block was modified.
bool vectorizeStores(StoreList &Stores, int costThreshold);
+ /// \brief Vectorize a group of scalars into a vector tree.
+ void vectorizeArith(ValueList &Operands);
+
private:
/// \returns This method contains the recursive part of getTreeCost.
int getTreeCost_rec(ValueList &VL, unsigned Depth);
diff --git a/test/Transforms/SLPVectorizer/X86/reduction.ll b/test/Transforms/SLPVectorizer/X86/reduction.ll
new file mode 100644
index 0000000..ced9f15
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/reduction.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+; int foo(double *A, int n, int m) {
+; double sum = 0, v1 = 2, v0 = 3;
+; for (int i=0; i < n; ++i)
+; sum += 7*A[i*2] + 7*A[i*2+1];
+; return sum;
+; }
+
+;CHECK: reduce
+;CHECK: load <2 x double>
+;CHECK: fmul <2 x double>
+;CHECK: ret
+define i32 @reduce(double* nocapture %A, i32 %n, i32 %m) #0 {
+entry:
+ %cmp13 = icmp sgt i32 %n, 0
+ br i1 %cmp13, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.015 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %sum.014 = phi double [ %add6, %for.body ], [ 0.000000e+00, %entry ]
+ %mul = shl nsw i32 %i.015, 1
+ %arrayidx = getelementptr inbounds double* %A, i32 %mul
+ %0 = load double* %arrayidx, align 4, !tbaa !0
+ %mul1 = fmul double %0, 7.000000e+00
+ %add12 = or i32 %mul, 1
+ %arrayidx3 = getelementptr inbounds double* %A, i32 %add12
+ %1 = load double* %arrayidx3, align 4, !tbaa !0
+ %mul4 = fmul double %1, 7.000000e+00
+ %add5 = fadd double %mul1, %mul4
+ %add6 = fadd double %sum.014, %add5
+ %inc = add nsw i32 %i.015, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge: ; preds = %for.body
+ %phitmp = fptosi double %add6 to i32
+ br label %for.end
+
+for.end: ; preds = %for.cond.for.end_crit_edge, %entry
+ %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ ret i32 %sum.0.lcssa
+}
+
+attributes #0 = { nounwind readonly ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"double", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}