[LV] FoldTail w/o Primary Induction

Introduce a new VPWidenCanonicalIVRecipe to generate a canonical vector
induction for use in fold-tail-with-masking, if a primary induction is absent.

The canonical scalar IV having start = 0 and step = VF*UF, created during code
-gen to control the vector loop, is widened into a canonical vector IV having
start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF} and
step = <VF*UF, VF*UF, ..., VF*UF>.

Differential Revision: https://reviews.llvm.org/D77635
diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
index d54956e..c87b5df 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -168,9 +168,17 @@
   ret void
 }
 
-; N is unknown, we need a tail. Can't vectorize because loop has no primary
-; induction.
+; Loop has no primary induction as its integer IV has step -1 starting at
+; unknown N, but can still be vectorized.
 ;CHECK-LABEL: @example3(
+; CHECK:       vector.ph:
+; CHECK:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> {{.*}}, <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0,
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VPIV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK:    {{.*}} = icmp ule <4 x i64> [[VPIV]], [[BROADCAST_SPLAT2]]
 ;CHECK-NOT: <4 x i32>
 ;CHECK: ret void
 define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) optsize {
@@ -237,12 +245,12 @@
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[TMP7:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[TMP6:%.*]]
-; CHECK:         br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !7
+; CHECK:         br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !11
 ; CHECK:         ret void
 ;
   br label %1
@@ -353,12 +361,12 @@
 ; CHECK:       pred.store.continue22:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[TMP34:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[TMP33:%.*]]
-; CHECK:         br i1 undef, label [[TMP34]], label [[TMP33]], !llvm.loop !9
+; CHECK:         br i1 undef, label [[TMP34]], label [[TMP33]], !llvm.loop !13
 ; CHECK:         ret void
 ;
   br label %1
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
index 2667bfe..b51cb56 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
@@ -1,12 +1,11 @@
-; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -force-vector-width=4 -S | FileCheck %s
 
-; Check that when we can't predicate this loop that it is still vectorised (with
-; an epilogue).
-; TODO: the reason this can't be predicated is because a primary induction
-; variable can't be found (not yet) for this counting down loop. But with that
-; fixed, this should be able to be predicated.
+; Check that a counting-down loop which has no primary induction variable
+; is vectorized with preferred predication.
 
 ; CHECK-LABEL: vector.body:
+; CHECK-LABEL: middle.block:
+; CHECK-NEXT:    br i1 true,
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"