[LV] For some IVs, use vector phis instead of widening in the loop body
Previously, whenever we needed a vector IV, we would create it on the fly,
by splatting the scalar IV and adding a step vector. Instead, we can create a
real vector IV. This tends to save a couple of instructions per iteration.
This only changes the behavior for the most basic case - integer primary
IVs with a constant step.
Differential Revision: http://reviews.llvm.org/D20315
llvm-svn: 271410
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
index 65b3919..12a1601 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
@@ -43,7 +43,7 @@
; CHECK-LABEL: @s173
; CHECK: load <4 x float>, <4 x float>*
-; CHECK: add i64 %index, 16000
+; CHECK: add nsw i64 %.lhs, 16000
; CHECK: ret i32 0
}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index 2f1de54..23e363e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -95,7 +95,7 @@
%struct.In = type { float, float }
;AVX512-LABEL: @foo2
-;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
+;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.gather.v16f32
;AVX512: llvm.masked.store.v16f32
;AVX512: ret void
@@ -170,10 +170,10 @@
;}
;AVX512-LABEL: @foo3
-;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
+;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.gather.v16f32
;AVX512: fadd <16 x float>
-;AVX512: getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %induction, i32 1
+;AVX512: getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.scatter.v16f32
;AVX512: ret void
diff --git a/llvm/test/Transforms/LoopVectorize/cast-induction.ll b/llvm/test/Transforms/LoopVectorize/cast-induction.ll
index fae8997..54f68b7 100644
--- a/llvm/test/Transforms/LoopVectorize/cast-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/cast-induction.ll
@@ -8,7 +8,7 @@
@a = common global [2048 x i32] zeroinitializer, align 16
;CHECK-LABEL: @example12(
-;CHECK: trunc i64
+;CHECK: %vec.ind1 = phi <4 x i32>
;CHECK: store <4 x i32>
;CHECK: ret void
define void @example12() nounwind uwtable ssp {
diff --git a/llvm/test/Transforms/LoopVectorize/gcc-examples.ll b/llvm/test/Transforms/LoopVectorize/gcc-examples.ll
index 1880901..95b0d16 100644
--- a/llvm/test/Transforms/LoopVectorize/gcc-examples.ll
+++ b/llvm/test/Transforms/LoopVectorize/gcc-examples.ll
@@ -368,7 +368,7 @@
}
;CHECK-LABEL: @example12(
-;CHECK: trunc i64
+;CHECK: %vec.ind1 = phi <4 x i32>
;CHECK: store <4 x i32>
;CHECK: ret void
define void @example12() nounwind uwtable ssp {
diff --git a/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll b/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll
index ab2fd5e..fb12e17 100644
--- a/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll
+++ b/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll
@@ -12,10 +12,11 @@
; CHECK-LABEL: @foo
; CHECK: vector.body
-; CHECK: %0 = getelementptr inbounds double*, double** %in, i64 %index
-; CHECK: %1 = bitcast double** %0 to <4 x i64>*
-; CHECK: %wide.load = load <4 x i64>, <4 x i64>* %1, align 8
-; CHECK: %2 = icmp eq <4 x i64> %wide.load, zeroinitializer
+; CHECK: %0 = phi
+; CHECK: %2 = getelementptr inbounds double*, double** %in, i64 %0
+; CHECK: %3 = bitcast double** %2 to <4 x i64>*
+; CHECK: %wide.load = load <4 x i64>, <4 x i64>* %3, align 8
+; CHECK: %4 = icmp eq <4 x i64> %wide.load, zeroinitializer
; CHECK: br i1
define void @foo(double** noalias nocapture readonly %in, double** noalias nocapture readnone %out, i8* noalias nocapture %res) #0 {
@@ -37,4 +38,4 @@
for.end:
ret void
-}
\ No newline at end of file
+}
diff --git a/llvm/test/Transforms/LoopVectorize/global_alias.ll b/llvm/test/Transforms/LoopVectorize/global_alias.ll
index 84fa48c..0da841b 100644
--- a/llvm/test/Transforms/LoopVectorize/global_alias.ll
+++ b/llvm/test/Transforms/LoopVectorize/global_alias.ll
@@ -12,7 +12,7 @@
@PA = external global i32*
-;; === First, the tests that should always vectorize, wither statically or by adding run-time checks ===
+;; === First, the tests that should always vectorize, whether statically or by adding run-time checks ===
; /// Different objects, positive induction, constant distance
@@ -387,7 +387,7 @@
; return Foo.A[a];
; }
; CHECK-LABEL: define i32 @noAlias08(
-; CHECK: sub <4 x i32>
+; CHECK: sub nuw nsw <4 x i32>
; CHECK: ret
define i32 @noAlias08(i32 %a) #0 {
@@ -439,7 +439,7 @@
; return Foo.A[a];
; }
; CHECK-LABEL: define i32 @noAlias09(
-; CHECK: sub <4 x i32>
+; CHECK: sub nuw nsw <4 x i32>
; CHECK: ret
define i32 @noAlias09(i32 %a) #0 {
@@ -721,7 +721,7 @@
; return Foo.A[a];
; }
; CHECK-LABEL: define i32 @noAlias14(
-; CHECK: sub <4 x i32>
+; CHECK: sub nuw nsw <4 x i32>
; CHECK: ret
define i32 @noAlias14(i32 %a) #0 {
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 8e3cf36..c2d4d96 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -1,4 +1,6 @@
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=UNROLL
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -27,8 +29,6 @@
ret void
}
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
-
; Make sure we remove unneeded vectorization of induction variables.
; In order for instcombine to cleanup the vectorized induction variables that we
; create in the loop vectorizer we need to perform some form of redundancy
@@ -241,3 +241,64 @@
exit:
ret void
}
+
+; Check that we generate vectorized IVs in the pre-header
+; instead of widening the scalar IV inside the loop, when
+; we know how to do that.
+; IND-LABEL: veciv
+; IND: vector.body:
+; IND: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %step.add, %vector.body ]
+; IND: %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; IND: %index.next = add i32 %index, 2
+; IND: %[[CMP:.*]] = icmp eq i32 %index.next
+; IND: br i1 %[[CMP]]
+; UNROLL-LABEL: veciv
+; UNROLL: vector.body:
+; UNROLL: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %step.add1, %vector.body ]
+; UNROLL: %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; UNROLL: %step.add1 = add <2 x i32> %vec.ind, <i32 4, i32 4>
+; UNROLL: %index.next = add i32 %index, 4
+; UNROLL: %[[CMP:.*]] = icmp eq i32 %index.next
+; UNROLL: br i1 %[[CMP]]
+define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i32 %indvars.iv
+ store i32 %indvars.iv, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+ %exitcond = icmp eq i32 %indvars.iv.next, %k
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+; IND-LABEL: trunciv
+; IND: vector.body:
+; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND: %[[VECIND:.*]] = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %[[STEPADD:.*]], %vector.body ]
+; IND: %[[STEPADD]] = add <2 x i32> %[[VECIND]], <i32 2, i32 2>
+; IND: %index.next = add i64 %index, 2
+; IND: %[[CMP:.*]] = icmp eq i64 %index.next
+; IND: br i1 %[[CMP]]
+define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %trunc.iv = trunc i64 %indvars.iv to i32
+ %arrayidx = getelementptr inbounds i32, i32* %a, i32 %trunc.iv
+ store i32 %trunc.iv, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %k
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/induction_plus.ll b/llvm/test/Transforms/LoopVectorize/induction_plus.ll
index 7c4c8f2..5e96d41 100644
--- a/llvm/test/Transforms/LoopVectorize/induction_plus.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction_plus.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
@@ -6,8 +6,11 @@
@array = common global [1024 x i32] zeroinitializer, align 16
;CHECK-LABEL: @array_at_plus_one(
-;CHECK: add i64 %index, 12
-;CHECK: trunc i64
+;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+;CHECK: %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %step.add, %vector.body ]
+;CHECK: %vec.ind1 = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %step.add2, %vector.body ]
+;CHECK: add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
+;CHECK: add nsw <4 x i64> %vec.ind, <i64 12, i64 12, i64 12, i64 12>
;CHECK: ret i32
define i32 @array_at_plus_one(i32 %n) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 0