; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s

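; Each rounding intrinsic tested below, apart from nearbyint, selects to a
; single predicated MVE instruction, so its vectorized loop can be tail
; predicated into dlstp/letp form.

; llvm.round is selected to a single vrinta.f32, so this loop is tail predicated.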
define arm_aapcs_vfpcc void @round(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: round:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB0_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrinta.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB0_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.round.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

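; llvm.rint is selected to a single vrintx.f32, so this loop is tail predicated.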
define arm_aapcs_vfpcc void @rint(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: rint:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB1_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintx.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB1_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.rint.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

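; llvm.trunc is selected to a single vrintz.f32, so this loop is tail predicated.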
define arm_aapcs_vfpcc void @trunc(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: trunc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB2_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintz.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB2_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.trunc.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

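; llvm.ceil is selected to a single vrintp.f32, so this loop is tail predicated.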
define arm_aapcs_vfpcc void @ceil(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: ceil:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB3_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintp.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB3_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

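; llvm.floor is selected to a single vrintm.f32, so this loop is tail predicated.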
define arm_aapcs_vfpcc void @floor(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: floor:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB4_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintm.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB4_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.floor.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; nearbyint shouldn't be tail predicated because it is lowered into multiple
; instructions (four scalar vrintr.f32 operations), so the loop below uses a
; plain dls/le hardware loop with VPT-predicated loads and stores instead.
define arm_aapcs_vfpcc void @nearbyint(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: nearbyint:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:    adds r3, r2, #3
; CHECK-NEXT:    vdup.32 q1, r2
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:    adr r3, .LCPI5_0
; CHECK-NEXT:    vldrw.u32 q0, [r3]
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB5_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vadd.i32 q2, q0, r12
; CHECK-NEXT:    vdup.32 q3, r12
; CHECK-NEXT:    vcmp.u32 hi, q3, q2
; CHECK-NEXT:    add.w r12, r12, #4
; CHECK-NEXT:    vpnot
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vcmpt.u32 cs, q1, q2
; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
; CHECK-NEXT:    vrintr.f32 s15, s11
; CHECK-NEXT:    vrintr.f32 s14, s10
; CHECK-NEXT:    vrintr.f32 s13, s9
; CHECK-NEXT:    vrintr.f32 s12, s8
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q3, [r1], #16
; CHECK-NEXT:    le lr, .LBB5_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI5_0:
; CHECK-NEXT:    .long 0 @ 0x0
; CHECK-NEXT:    .long 1 @ 0x1
; CHECK-NEXT:    .long 2 @ 0x2
; CHECK-NEXT:    .long 3 @ 0x3
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1

declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) #2

declare <4 x float> @llvm.trunc.v4f32(<4 x float>) #3

declare <4 x float> @llvm.rint.v4f32(<4 x float>) #3

declare <4 x float> @llvm.round.v4f32(<4 x float>) #3

declare <4 x float> @llvm.ceil.v4f32(<4 x float>) #3

declare <4 x float> @llvm.floor.v4f32(<4 x float>) #3

declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #1

declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) #4