[LV] Avoid vectorizing first order recurrence when phi uses are outside loop

In the vectorization of first order recurrence, we vectorize such
that the last element in the vector will be the one extracted to pass into the
scalar remainder loop. However, this is not true when there is a phi (other
than the primary induction variable) is used outside the loop.
In such a case, we need the value from the second last iteration (i.e.
the phi value), not the last iteration (which would be the phi update).
I've added a test case for this. Also see PR32396.

A follow up patch would generate the correct code gen for such cases,
and turn this vectorization on.

Differential Revision: https://reviews.llvm.org/D31910

Reviewers: mssimpso
llvm-svn: 299985
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index b5e6418..444bc16 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -553,13 +553,23 @@
   if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous))
     return false;
 
-  // Ensure every user of the phi node is dominated by the previous value. The
-  // dominance requirement ensures the loop vectorizer will not need to
-  // vectorize the initial value prior to the first iteration of the loop.
   for (User *U : Phi->users())
-    if (auto *I = dyn_cast<Instruction>(U))
+    if (auto *I = dyn_cast<Instruction>(U)) {
+      // Ensure every user of the phi node is dominated by the previous value.
+      // The dominance requirement ensures the loop vectorizer will not need to
+      // vectorize the initial value prior to the first iteration of the loop.
       if (!DT->dominates(Previous, I))
         return false;
+      // When the phi node has users outside the loop, the current logic for
+      // fixFirstOrderRecurrences may generate incorrect code. Specifically, we
+      // extract the last element from the vectorized phi, which would be the
+      // update to the phi before exiting the loop. However, what we want is the
+      // previous phi value before the update (i.e. the second last update
+      // before end of the vectorized loop).
+      // See added test cases in first-order-recurrence.ll
+      if (!TheLoop->contains(I))
+        return false;
+    }
 
   return true;
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 814baca..fff1e29 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4069,7 +4069,12 @@
   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
 
   // Extract the last vector element in the middle block. This will be the
-  // initial value for the recurrence when jumping to the scalar loop.
+  // initial value for the recurrence when jumping to the scalar loop. 
+  // FIXME: Note that the last vector element need not always be the correct one:
+  // consider a loop  where we have phi uses outside the loop - we need the
+  // second last iteration value and not the last one). For now, we avoid
+  // considering such cases as firstOrderRecurrences (see
+  // isFirstOrderRecurrence).
   auto *Extract = Incoming;
   if (VF > 1) {
     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index c7ced75..e090ddf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -235,10 +235,13 @@
 }
 
 ; CHECK-LABEL: @add_phifail2(
-; CHECK: load <16 x i8>, <16 x i8>*
-; CHECK: add nuw nsw <16 x i32>
-; CHECK: store <16 x i8>
+; CHECK-NOT: load <16 x i8>, <16 x i8>*
+; CHECK-NOT: add nuw nsw <16 x i32>
+; CHECK-NOT: store <16 x i8>
 ; Function Attrs: nounwind
+; FIXME: Currently, if we vectorize this loop, we will generate incorrect code
+; if %len evenly divides VF. Vectorized loop code gen returns a_phi = p[len -1],
+; whereas it should be the previous value a_phi = p[len -2]
 define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 7803e74..9155820 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -349,3 +349,25 @@
 for.end:
   ret void
 }
+
+; FIXME: we can vectorize this first order recurrence, by generating two
+; extracts - one for the phi `val.phi` and other for the phi update `addx`.
+; val.phi at end of loop is 94 + x.
+; CHECK-LABEL: extract_second_last_iteration
+; CHECK-NOT: vector.body
+define i32 @extract_second_last_iteration(i32* %cval, i32 %x)  {
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %val.phi = phi i32 [ 0, %entry ], [ %addx, %for.body ]
+  %inc = add i32 %inc.phi, 1
+  %bc = zext i32 %inc.phi to i64
+  %addx = add i32 %inc.phi, %x
+  %cmp = icmp eq i32 %inc.phi, 95
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %val.phi
+}