SLPVectorizer: add initial support for reduction variable vectorization.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179470 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 209d287..2f55a00 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -26,6 +26,7 @@
 #include "llvm/Analysis/Verifier.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
@@ -38,7 +39,7 @@
 using namespace llvm;
 
 static cl::opt<int>
-SLPCostThreshold("slp-threshold", cl::init(1), cl::Hidden,
+SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                  cl::desc("Only vectorize trees if the gain is above this "
                           "number. (gain = -cost of vectorization)"));
 namespace {
@@ -63,7 +64,7 @@
   /// object. We sort the stores to their base objects to reduce the cost of the
   /// quadratic search on the stores. TODO: We can further reduce this cost
   /// if we flush the chain creation every time we run into a memory barrier.
-  bool CollectStores(BasicBlock *BB, BoUpSLP &R) {
+  bool collectStores(BasicBlock *BB, BoUpSLP &R) {
     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
       StoreInst *SI = dyn_cast<StoreInst>(it);
       if (!SI)
@@ -84,7 +85,79 @@
     return true;
   }
 
-  bool RollStoreChains(BoUpSLP &R) {
+  bool tryToVectorizePair(BinaryOperator *A, BinaryOperator *B,  BoUpSLP &R) {
+    if (!A || !B) return false;
+    BoUpSLP::ValueList VL;
+    VL.push_back(A);
+    VL.push_back(B);
+    int Cost = R.getTreeCost(VL);
+    DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << ".\n");
+    if (Cost >= -SLPCostThreshold) return false;
+    DEBUG(dbgs()<<"SLP: Vectorizing pair.\n");
+    R.vectorizeArith(VL);
+    return true;
+  }
+
+  bool tryToVectorizeCandidate(BinaryOperator *V,  BoUpSLP &R) {
+    if (!V) return false;
+    BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
+    BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
+    // Try to vectorize V.
+    if (tryToVectorizePair(A, B, R)) return true;
+
+    // Try to skip B.
+    if (B && B->hasOneUse()) {
+      BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+      BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+      if (tryToVectorizePair(A, B0, R)) {
+        B->moveBefore(V);
+        return true;
+      }
+      if (tryToVectorizePair(A, B1, R)) {
+        B->moveBefore(V);
+        return true;
+      }
+    }
+
+    // Try to slip A.
+    if (A && A->hasOneUse()) {
+      BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+      BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+      if (tryToVectorizePair(A0, B, R)) {
+        A->moveBefore(V);
+        return true;
+      }
+      if (tryToVectorizePair(A1, B, R)) {
+        A->moveBefore(V);
+        return true;
+      }
+    }
+    return 0;
+  }
+
+  bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R) {
+    bool Changed = false;
+    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+      if (isa<DbgInfoIntrinsic>(it)) continue;
+      PHINode *P = dyn_cast<PHINode>(it);
+      if (!P) return Changed;
+      // Check that the PHI is a reduction PHI.
+      if (P->getNumIncomingValues() != 2) return Changed;
+      Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) :
+                   (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : 0));
+      // Check if this is a Binary Operator.
+      BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
+      if (!BI) continue;
+
+      Value *Inst = BI->getOperand(0);
+      if (Inst == P) Inst = BI->getOperand(1);
+      Changed |= tryToVectorizeCandidate(dyn_cast<BinaryOperator>(Inst), R);
+    }
+
+    return Changed;
+  }
+
+  bool rollStoreChains(BoUpSLP &R) {
     bool Changed = false;
     // Attempt to sort and vectorize each of the store-groups.
     for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
@@ -116,13 +189,15 @@
     // he store instructions.
     BoUpSLP R(&BB, SE, DL, TTI, AA);
 
-    if (!CollectStores(&BB, R))
-      return false;
+    bool Changed = vectorizeReductions(&BB, R);
 
-    bool Changed = RollStoreChains(R);
-    if (Changed) {
+    if (!collectStores(&BB, R))
+      return Changed;
+
+    if (rollStoreChains(R)) {
       DEBUG(dbgs()<<"SLP: vectorized in \""<<BB.getParent()->getName()<<"\"\n");
       DEBUG(verifyFunction(*BB.getParent()));
+      Changed |= true;
     }
 
     return Changed;
diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp
index d8be0ae..4d075c5 100644
--- a/lib/Transforms/Vectorize/VecUtils.cpp
+++ b/lib/Transforms/Vectorize/VecUtils.cpp
@@ -208,6 +208,16 @@
   return 0;
 }
 
+void BoUpSLP::vectorizeArith(ValueList &Operands) {
+  Value *Vec = vectorizeTree(Operands, Operands.size());
+  BasicBlock::iterator Loc = cast<Instruction>(Vec);
+  IRBuilder<> Builder(++Loc);
+  for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+    Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i));
+    Operands[i]->replaceAllUsesWith(S);
+  }
+}
+
 int BoUpSLP::getTreeCost(ValueList &VL) {
   // Get rid of the list of stores that were removed, and from the
   // lists of instructions with multiple users.
diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h
index 808fb10..f865236 100644
--- a/lib/Transforms/Vectorize/VecUtils.h
+++ b/lib/Transforms/Vectorize/VecUtils.h
@@ -66,6 +66,9 @@
   /// \returns true if the basic block was modified.
   bool vectorizeStores(StoreList &Stores, int costThreshold);
 
+  /// \brief Vectorize a group of scalars into a vector tree.
+  void vectorizeArith(ValueList &Operands);
+
 private:
   /// \returns This method contains the recursive part of getTreeCost.
   int getTreeCost_rec(ValueList &VL, unsigned Depth);
diff --git a/test/Transforms/SLPVectorizer/X86/reduction.ll b/test/Transforms/SLPVectorizer/X86/reduction.ll
new file mode 100644
index 0000000..ced9f15
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/reduction.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+; int foo(double *A, int n, int m) {
+;   double sum = 0, v1 = 2, v0 = 3;
+;   for (int i=0; i < n; ++i)
+;     sum += 7*A[i*2] + 7*A[i*2+1];
+;   return sum;
+; }
+
+;CHECK: reduce
+;CHECK: load <2 x double>
+;CHECK: fmul <2 x double>
+;CHECK: ret
+define i32 @reduce(double* nocapture %A, i32 %n, i32 %m) #0 {
+entry:
+  %cmp13 = icmp sgt i32 %n, 0
+  br i1 %cmp13, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.015 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %sum.014 = phi double [ %add6, %for.body ], [ 0.000000e+00, %entry ]
+  %mul = shl nsw i32 %i.015, 1
+  %arrayidx = getelementptr inbounds double* %A, i32 %mul
+  %0 = load double* %arrayidx, align 4, !tbaa !0
+  %mul1 = fmul double %0, 7.000000e+00
+  %add12 = or i32 %mul, 1
+  %arrayidx3 = getelementptr inbounds double* %A, i32 %add12
+  %1 = load double* %arrayidx3, align 4, !tbaa !0
+  %mul4 = fmul double %1, 7.000000e+00
+  %add5 = fadd double %mul1, %mul4
+  %add6 = fadd double %sum.014, %add5
+  %inc = add nsw i32 %i.015, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  %phitmp = fptosi double %add6 to i32
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
+  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  ret i32 %sum.0.lcssa
+}
+
+attributes #0 = { nounwind readonly ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"double", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}