; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -disable-mve-tail-predication=false -o - %s | FileCheck %s
; llvm.round lowers to a single predicated MVE instruction (vrinta.f32),
; so the whole loop can be tail-predicated (dlstp.32 / letp).
define arm_aapcs_vfpcc void @round(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: round:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vrinta.f32 q0, q0
; CHECK-NEXT: vstrw.32 q0, [r1], #16
; CHECK-NEXT: letp lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.round.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; llvm.rint lowers to a single predicated MVE instruction (vrintx.f32),
; so the whole loop can be tail-predicated (dlstp.32 / letp).
define arm_aapcs_vfpcc void @rint(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: rint:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB1_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vrintx.f32 q0, q0
; CHECK-NEXT: vstrw.32 q0, [r1], #16
; CHECK-NEXT: letp lr, .LBB1_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.rint.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; llvm.trunc lowers to a single predicated MVE instruction (vrintz.f32),
; so the whole loop can be tail-predicated (dlstp.32 / letp).
define arm_aapcs_vfpcc void @trunc(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: trunc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB2_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vrintz.f32 q0, q0
; CHECK-NEXT: vstrw.32 q0, [r1], #16
; CHECK-NEXT: letp lr, .LBB2_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.trunc.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; llvm.ceil lowers to a single predicated MVE instruction (vrintp.f32),
; so the whole loop can be tail-predicated (dlstp.32 / letp).
define arm_aapcs_vfpcc void @ceil(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: ceil:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB3_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vrintp.f32 q0, q0
; CHECK-NEXT: vstrw.32 q0, [r1], #16
; CHECK-NEXT: letp lr, .LBB3_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; llvm.floor lowers to a single predicated MVE instruction (vrintm.f32),
; so the whole loop can be tail-predicated (dlstp.32 / letp).
define arm_aapcs_vfpcc void @floor(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: floor:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB4_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vrintm.f32 q0, q0
; CHECK-NEXT: vstrw.32 q0, [r1], #16
; CHECK-NEXT: letp lr, .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.floor.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; nearbyint shouldn't be tail predicated because it's lowered into multiple instructions
; (per-lane scalar vrintr.f32), so the loop keeps an explicit lane mask with dls/le.
define arm_aapcs_vfpcc void @nearbyint(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: nearbyint:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: adds r3, r2, #3
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: adr r3, .LCPI5_0
; CHECK-NEXT: sub.w r12, r2, #1
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: vdup.32 q1, r12
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB5_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q2, q0, r2
; CHECK-NEXT: vdup.32 q3, r2
; CHECK-NEXT: vcmp.u32 hi, q3, q2
; CHECK-NEXT: adds r2, #4
; CHECK-NEXT: vpnot
; CHECK-NEXT: vpstt
; CHECK-NEXT: vcmpt.u32 cs, q1, q2
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vrintr.f32 s15, s11
; CHECK-NEXT: vrintr.f32 s14, s10
; CHECK-NEXT: vrintr.f32 s13, s9
; CHECK-NEXT: vrintr.f32 s12, s8
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q3, [r1], #16
; CHECK-NEXT: le lr, .LBB5_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI5_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Intrinsic declarations. Attribute groups #0-#4 are referenced here but are
; defined elsewhere in the file (outside this chunk).
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1

declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) #2

declare <4 x float> @llvm.trunc.v4f32(<4 x float>) #3

declare <4 x float> @llvm.rint.v4f32(<4 x float>) #3

declare <4 x float> @llvm.round.v4f32(<4 x float>) #3

declare <4 x float> @llvm.ceil.v4f32(<4 x float>) #3

declare <4 x float> @llvm.floor.v4f32(<4 x float>) #3

declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #1

declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) #4