[llvm][mlir] Promote the experimental reduction intrinsics to be first-class intrinsics.
This change renames the intrinsics so that they no longer carry the "experimental" prefix; for the fadd/fmul reductions the legacy "v2" name component is dropped as well (e.g. llvm.experimental.vector.reduce.v2.fadd.* becomes llvm.vector.reduce.fadd.*).
The auto-upgrader handles IR and bitcode that still use the legacy intrinsic names.
Relevant llvm-dev mailing list thread: http://lists.llvm.org/pipermail/llvm-dev/2020-April/140729.html
Differential Revision: https://reviews.llvm.org/D88787
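
For illustration only (not part of the patch): the snippet below shows the old and new spellings of the fadd reduction side by side, using the same f32/v4f32 instance and fast-math start value that appear in the test hunks of this diff. The wrapper function @sum_v4f32 is a hypothetical example, not something from the patch; IR written against the legacy name should be rewritten to the new one by the auto-upgrader when the module is parsed or loaded, e.g. when run through opt -S.

; Legacy spelling, still accepted and auto-upgraded on load:
;   declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)

; First-class spelling after this change:
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)

; Hypothetical caller showing the new name in use (start value 0.0, fast-math):
define float @sum_v4f32(<4 x float> %v) {
  %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %v)
  ret float %r
}
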
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
index 2862779..2544474 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
@@ -65,7 +65,7 @@
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
- %2 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %wide.load)
+ %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
%3 = add i32 %2, %vec.phi
%index.next = add i32 %index, 4
%4 = icmp eq i32 %index.next, %n.vec
@@ -167,7 +167,7 @@
br i1 %3, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
- %4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %2)
+ %4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
@@ -267,7 +267,7 @@
br i1 %3, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
- %4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %2)
+ %4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
@@ -367,7 +367,7 @@
br i1 %3, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
- %4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %2)
+ %4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
@@ -467,7 +467,7 @@
br i1 %3, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
- %4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %2)
+ %4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
@@ -568,7 +568,7 @@
br i1 %3, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
- %4 = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2)
+ %4 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
@@ -665,7 +665,7 @@
br i1 %3, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
- %4 = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2)
+ %4 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
@@ -762,7 +762,7 @@
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
- %5 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %3)
+ %5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
@@ -852,7 +852,7 @@
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
- %l5 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %wide.load)
+ %l5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %wide.load)
%2 = icmp slt i32 %vec.phi, %l5
%3 = select i1 %2, i32 %vec.phi, i32 %l5
%index.next = add i32 %index, 4
@@ -958,7 +958,7 @@
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
- %5 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %3)
+ %5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
@@ -1048,7 +1048,7 @@
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
- %l5 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %wide.load)
+ %l5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %wide.load)
%2 = icmp sgt i32 %vec.phi, %l5
%3 = select i1 %2, i32 %vec.phi, i32 %l5
%index.next = add i32 %index, 4
@@ -1154,7 +1154,7 @@
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
- %5 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %3)
+ %5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
@@ -1244,7 +1244,7 @@
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
- %l5 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %wide.load)
+ %l5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %wide.load)
%2 = icmp ult i32 %vec.phi, %l5
%3 = select i1 %2, i32 %vec.phi, i32 %l5
%index.next = add i32 %index, 4
@@ -1350,7 +1350,7 @@
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
- %5 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %3)
+ %5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
@@ -1440,7 +1440,7 @@
%0 = getelementptr inbounds i32, i32* %x, i32 %index
%1 = bitcast i32* %0 to <4 x i32>*
%wide.load = load <4 x i32>, <4 x i32>* %1, align 4
- %l5 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %wide.load)
+ %l5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %wide.load)
%2 = icmp ugt i32 %vec.phi, %l5
%3 = select i1 %2, i32 %vec.phi, i32 %l5
%index.next = add i32 %index, 4
@@ -1553,7 +1553,7 @@
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
- %5 = call float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %3)
+ %5 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
@@ -1658,7 +1658,7 @@
br i1 %4, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
- %5 = call float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %3)
+ %5 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %3)
%cmp.n = icmp eq i32 %n.vec, %n
br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
@@ -1722,7 +1722,7 @@
%1 = bitcast i32* %0 to <4 x i32>*
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer
- %3 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %2)
+ %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
%4 = add i32 %3, %vec.phi
%index.next = add i32 %index, 4
%5 = icmp eq i32 %index.next, %n.vec
@@ -1777,7 +1777,7 @@
%wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%4 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
%5 = select <4 x i1> %active.lane.mask, <4 x i32> %4, <4 x i32> zeroinitializer
- %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5)
+ %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
%7 = add i32 %6, %vec.phi
%index.next = add i32 %index, 4
%8 = icmp eq i32 %index.next, %n.vec
@@ -1828,7 +1828,7 @@
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
%2 = sext <8 x i16> %wide.masked.load to <8 x i32>
%3 = select <8 x i1> %active.lane.mask, <8 x i32> %2, <8 x i32> zeroinitializer
- %4 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %3)
+ %4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
%5 = add i32 %4, %vec.phi
%index.next = add i32 %index, 8
%6 = icmp eq i32 %index.next, %n.vec
@@ -1885,7 +1885,7 @@
%5 = sext <8 x i16> %wide.masked.load14 to <8 x i32>
%6 = mul nsw <8 x i32> %5, %2
%7 = select <8 x i1> %active.lane.mask, <8 x i32> %6, <8 x i32> zeroinitializer
- %8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %7)
+ %8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
%9 = add i32 %8, %vec.phi
%index.next = add i32 %index, 8
%10 = icmp eq i32 %index.next, %n.vec
@@ -1936,7 +1936,7 @@
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
%2 = zext <16 x i8> %wide.masked.load to <16 x i32>
%3 = select <16 x i1> %active.lane.mask, <16 x i32> %2, <16 x i32> zeroinitializer
- %4 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %3)
+ %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
%5 = add i32 %4, %vec.phi
%index.next = add i32 %index, 16
%6 = icmp eq i32 %index.next, %n.vec
@@ -1993,7 +1993,7 @@
%5 = zext <16 x i8> %wide.masked.load14 to <16 x i32>
%6 = mul nuw nsw <16 x i32> %5, %2
%7 = select <16 x i1> %active.lane.mask, <16 x i32> %6, <16 x i32> zeroinitializer
- %8 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %7)
+ %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
%9 = add i32 %8, %vec.phi
%index.next = add i32 %index, 16
%10 = icmp eq i32 %index.next, %n.vec
@@ -2043,7 +2043,7 @@
%1 = bitcast i16* %0 to <8 x i16>*
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
%2 = select <8 x i1> %active.lane.mask, <8 x i16> %wide.masked.load, <8 x i16> zeroinitializer
- %3 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %2)
+ %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
%4 = add i16 %3, %vec.phi
%index.next = add i32 %index, 8
%5 = icmp eq i32 %index.next, %n.vec
@@ -2098,7 +2098,7 @@
%wide.masked.load16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %3, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
%4 = mul <8 x i16> %wide.masked.load16, %wide.masked.load
%5 = select <8 x i1> %active.lane.mask, <8 x i16> %4, <8 x i16> zeroinitializer
- %6 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %5)
+ %6 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %5)
%7 = add i16 %6, %vec.phi
%index.next = add i32 %index, 8
%8 = icmp eq i32 %index.next, %n.vec
@@ -2149,7 +2149,7 @@
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
%2 = zext <16 x i8> %wide.masked.load to <16 x i16>
%3 = select <16 x i1> %active.lane.mask, <16 x i16> %2, <16 x i16> zeroinitializer
- %4 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %3)
+ %4 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %3)
%5 = add i16 %4, %vec.phi
%index.next = add i32 %index, 16
%6 = icmp eq i32 %index.next, %n.vec
@@ -2206,7 +2206,7 @@
%5 = zext <16 x i8> %wide.masked.load18 to <16 x i16>
%6 = mul nuw <16 x i16> %5, %2
%7 = select <16 x i1> %active.lane.mask, <16 x i16> %6, <16 x i16> zeroinitializer
- %8 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %7)
+ %8 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %7)
%9 = add i16 %8, %vec.phi
%index.next = add i32 %index, 16
%10 = icmp eq i32 %index.next, %n.vec
@@ -2256,7 +2256,7 @@
%1 = bitcast i8* %0 to <16 x i8>*
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
%2 = select <16 x i1> %active.lane.mask, <16 x i8> %wide.masked.load, <16 x i8> zeroinitializer
- %3 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %2)
+ %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
%4 = add i8 %3, %vec.phi
%index.next = add i32 %index, 16
%5 = icmp eq i32 %index.next, %n.vec
@@ -2311,7 +2311,7 @@
%wide.masked.load15 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
%4 = mul <16 x i8> %wide.masked.load15, %wide.masked.load
%5 = select <16 x i1> %active.lane.mask, <16 x i8> %4, <16 x i8> zeroinitializer
- %6 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %5)
+ %6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %5)
%7 = add i8 %6, %vec.phi
%index.next = add i32 %index, 16
%8 = icmp eq i32 %index.next, %n.vec
@@ -2364,7 +2364,7 @@
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%2 = sext <4 x i32> %wide.masked.load to <4 x i64>
%3 = select <4 x i1> %active.lane.mask, <4 x i64> %2, <4 x i64> zeroinitializer
- %4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %3)
+ %4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %3)
%5 = add i64 %4, %vec.phi
%index.next = add i32 %index, 4
%6 = icmp eq i32 %index.next, %n.vec
@@ -2423,7 +2423,7 @@
%5 = sext <4 x i32> %wide.masked.load14 to <4 x i64>
%6 = mul nsw <4 x i64> %5, %2
%7 = select <4 x i1> %active.lane.mask, <4 x i64> %6, <4 x i64> zeroinitializer
- %8 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %7)
+ %8 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %7)
%9 = add i64 %8, %vec.phi
%index.next = add i32 %index, 4
%10 = icmp eq i32 %index.next, %n.vec
@@ -2482,7 +2482,7 @@
%5 = sext <8 x i16> %wide.masked.load14 to <8 x i64>
%6 = mul nsw <8 x i64> %5, %2
%7 = select <8 x i1> %active.lane.mask, <8 x i64> %6, <8 x i64> zeroinitializer
- %8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %7)
+ %8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %7)
%9 = add i64 %8, %vec.phi
%index.next = add i32 %index, 8
%10 = icmp eq i32 %index.next, %n.vec
@@ -2497,26 +2497,26 @@
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) #2
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) #3
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #3
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) #1
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #2
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) #3
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) #3
-declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) #3
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) #3
-declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) #3
-declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) #3
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) #3
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #3
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) #3
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #3
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #3
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #3
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>)
-declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
-declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)
-declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>)
-declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
-declare float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
+declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)