blob: 02bd955c23363eeab873991766a735d0b814463c [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
David Green9cf920e2020-03-20 08:25:19 +00003
; C[i..i+3] = A[i..i+3] + splat(B), with the splat as the second fadd operand.
; The entry block asserts (n & 7) == 0 via llvm.assume and skips the loop when
; n <= 0. Expected assembly: B is moved from s0 to r3 once, and the loop uses
; the vector-by-scalar form "vadd.f32 q0, q0, r3".
define arm_aapcs_vfpcc void @test_fadd(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fadd:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:  .LBB0_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vadd.f32 q0, q0, r3
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    bne .LBB0_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds float, float* %A, i32 %index
  %2 = bitcast float* %1 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %2, align 4
  %3 = fadd fast <4 x float> %wide.load, %broadcast.splat11
  %4 = getelementptr inbounds float, float* %C, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  store <4 x float> %3, <4 x float>* %5, align 4
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
48
; Same loop as test_fadd but with the fadd operands reversed: splat(B) is the
; first operand and the loaded vector the second. The expected assembly is
; unchanged: the loop still uses "vadd.f32 q0, q0, r3".
define arm_aapcs_vfpcc void @test_fadd_r(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fadd_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:  .LBB1_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vadd.f32 q0, q0, r3
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    bne .LBB1_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds float, float* %A, i32 %index
  %2 = bitcast float* %1 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %2, align 4
  %3 = fadd fast <4 x float> %broadcast.splat11, %wide.load
  %4 = getelementptr inbounds float, float* %C, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  store <4 x float> %3, <4 x float>* %5, align 4
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
93
; C[i..i+3] = A[i..i+3] * splat(B), with the splat as the second fmul operand.
; The entry block asserts (n & 7) == 0 via llvm.assume and skips the loop when
; n <= 0. Expected assembly: B is moved from s0 to r3 once, and the loop uses
; the vector-by-scalar form "vmul.f32 q0, q0, r3".
define arm_aapcs_vfpcc void @test_fmul(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fmul:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:  .LBB2_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vmul.f32 q0, q0, r3
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    bne .LBB2_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds float, float* %A, i32 %index
  %2 = bitcast float* %1 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %2, align 4
  %3 = fmul fast <4 x float> %wide.load, %broadcast.splat11
  %4 = getelementptr inbounds float, float* %C, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  store <4 x float> %3, <4 x float>* %5, align 4
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
138
; Same loop as test_fmul but with the fmul operands reversed: splat(B) is the
; first operand and the loaded vector the second. The expected assembly is
; unchanged: the loop still uses "vmul.f32 q0, q0, r3".
define arm_aapcs_vfpcc void @test_fmul_r(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fmul_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:  .LBB3_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vmul.f32 q0, q0, r3
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    bne .LBB3_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds float, float* %A, i32 %index
  %2 = bitcast float* %1 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %2, align 4
  %3 = fmul fast <4 x float> %broadcast.splat11, %wide.load
  %4 = getelementptr inbounds float, float* %C, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  store <4 x float> %3, <4 x float>* %5, align 4
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
183
; C[i..i+3] = A[i..i+3] - splat(B): the splat is the subtrahend.
; The entry block asserts (n & 7) == 0 via llvm.assume and skips the loop when
; n <= 0. Expected assembly: B is moved from s0 to r3 once, and the loop uses
; the vector-by-scalar form "vsub.f32 q0, q0, r3".
define arm_aapcs_vfpcc void @test_fsub(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fsub:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:  .LBB4_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vsub.f32 q0, q0, r3
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    bne .LBB4_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds float, float* %A, i32 %index
  %2 = bitcast float* %1 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %2, align 4
  %3 = fsub fast <4 x float> %wide.load, %broadcast.splat11
  %4 = getelementptr inbounds float, float* %C, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  store <4 x float> %3, <4 x float>* %5, align 4
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
228
; C[i..i+3] = splat(B) - A[i..i+3]: the splat is the minuend.
; Unlike test_fsub, the expected assembly does not use a scalar-register
; operand in the loop: B is broadcast with "vdup.32 q0, r3" before the loop
; and the loop uses the vector-vector form "vsub.f32 q1, q0, q1".
define arm_aapcs_vfpcc void @test_fsub_r(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fsub_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vdup.32 q0, r3
; CHECK-NEXT:  .LBB5_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vsub.f32 q1, q0, q1
; CHECK-NEXT:    vstrb.8 q1, [r1], #16
; CHECK-NEXT:    bne .LBB5_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds float, float* %A, i32 %index
  %2 = bitcast float* %1 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %2, align 4
  %3 = fsub fast <4 x float> %broadcast.splat11, %wide.load
  %4 = getelementptr inbounds float, float* %C, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  store <4 x float> %3, <4 x float>* %5, align 4
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
274
275
; D[i..i+3] = B[i..i+3] * A[i..i+3] + splat(C): the splat is the addend and is
; the second fadd operand. The entry block asserts (n & 7) == 0 via llvm.assume
; and skips the loop when n <= 0. Expected assembly: C is moved from s0 to r12
; once, and the loop uses "vfmas.f32 q1, q0, r12".
define arm_aapcs_vfpcc void @test_fmas(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmas:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:  .LBB6_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vfmas.f32 q1, q0, r12
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB6_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds float, float* %A, i32 %index
  %2 = bitcast float* %1 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %2, align 4
  %3 = getelementptr inbounds float, float* %B, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.load12 = load <4 x float>, <4 x float>* %4, align 4
  %5 = fmul fast <4 x float> %wide.load12, %wide.load
  %6 = fadd fast <4 x float> %5, %broadcast.splat14
  %7 = getelementptr inbounds float, float* %D, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  store <4 x float> %6, <4 x float>* %8, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
325
; Same loop as test_fmas but with the fadd operands reversed: splat(C) is the
; first operand and the product the second. The expected assembly is
; unchanged: the loop still uses "vfmas.f32 q1, q0, r12".
define arm_aapcs_vfpcc void @test_fmas_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmas_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:  .LBB7_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vfmas.f32 q1, q0, r12
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB7_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds float, float* %A, i32 %index
  %2 = bitcast float* %1 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %2, align 4
  %3 = getelementptr inbounds float, float* %B, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.load12 = load <4 x float>, <4 x float>* %4, align 4
  %5 = fmul fast <4 x float> %wide.load12, %wide.load
  %6 = fadd fast <4 x float> %broadcast.splat14, %5
  %7 = getelementptr inbounds float, float* %D, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  store <4 x float> %6, <4 x float>* %8, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
375
; D[i..i+3] = A[i..i+3] * splat(C) + B[i..i+3]: the splat is a multiplicand
; (second fmul operand) and the addend is a loaded vector. The entry block
; asserts (n & 7) == 0 via llvm.assume and skips the loop when n <= 0.
; Expected assembly: C is moved from s0 to r12 once, and the loop uses
; "vfma.f32 q1, q0, r12".
define arm_aapcs_vfpcc void @test_fma(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fma:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:  .LBB8_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vfma.f32 q1, q0, r12
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB8_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds float, float* %A, i32 %index
  %2 = bitcast float* %1 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %2, align 4
  %3 = fmul fast <4 x float> %wide.load, %broadcast.splat13
  %4 = getelementptr inbounds float, float* %B, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  %wide.load14 = load <4 x float>, <4 x float>* %5, align 4
  %6 = fadd fast <4 x float> %3, %wide.load14
  %7 = getelementptr inbounds float, float* %D, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  store <4 x float> %6, <4 x float>* %8, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
425
; Same loop as test_fma but with the fmul operands reversed: splat(C) is the
; first operand and the loaded vector the second. The expected assembly is
; unchanged: the loop still uses "vfma.f32 q1, q0, r12".
define arm_aapcs_vfpcc void @test_fma_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fma_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:  .LBB9_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vfma.f32 q1, q0, r12
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB9_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds float, float* %A, i32 %index
  %2 = bitcast float* %1 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %2, align 4
  %3 = fmul fast <4 x float> %broadcast.splat13, %wide.load
  %4 = getelementptr inbounds float, float* %B, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  %wide.load14 = load <4 x float>, <4 x float>* %5, align 4
  %6 = fadd fast <4 x float> %3, %wide.load14
  %7 = getelementptr inbounds float, float* %D, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  store <4 x float> %6, <4 x float>* %8, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
475
476
; D[i..i+3] = B[i..i+3] * A[i..i+3] - splat(C): the splat is subtracted from
; the product. The entry block asserts (n & 7) == 0 via llvm.assume and skips
; the loop when n <= 0. Expected assembly: C is broadcast with vdup.32 and
; negated with vneg.f32 before the loop; each iteration copies that vector
; (vmov q3, q0) and accumulates into it with "vfma.f32 q3, q2, q1".
define arm_aapcs_vfpcc void @test_fmss(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmss:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    vdup.32 q0, r12
; CHECK-NEXT:    vneg.f32 q0, q0
; CHECK-NEXT:  .LBB10_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vfma.f32 q3, q2, q1
; CHECK-NEXT:    vstrb.8 q3, [r2], #16
; CHECK-NEXT:    bne .LBB10_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds float, float* %A, i32 %index
  %2 = bitcast float* %1 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %2, align 4
  %3 = getelementptr inbounds float, float* %B, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.load12 = load <4 x float>, <4 x float>* %4, align 4
  %5 = fmul fast <4 x float> %wide.load12, %wide.load
  %6 = fsub fast <4 x float> %5, %broadcast.splat14
  %7 = getelementptr inbounds float, float* %D, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  store <4 x float> %6, <4 x float>* %8, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
529
; D[i..i+3] = splat(C) - B[i..i+3] * A[i..i+3]: the product is subtracted from
; the splat. The entry block asserts (n & 7) == 0 via llvm.assume and skips
; the loop when n <= 0. Expected assembly: C is broadcast with vdup.32 before
; the loop; each iteration copies that vector (vmov q3, q0) and applies
; "vfms.f32 q3, q2, q1".
define arm_aapcs_vfpcc void @test_fmss_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmss_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    vdup.32 q0, r12
; CHECK-NEXT:  .LBB11_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vfms.f32 q3, q2, q1
; CHECK-NEXT:    vstrb.8 q3, [r2], #16
; CHECK-NEXT:    bne .LBB11_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds float, float* %A, i32 %index
  %2 = bitcast float* %1 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %2, align 4
  %3 = getelementptr inbounds float, float* %B, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.load12 = load <4 x float>, <4 x float>* %4, align 4
  %5 = fmul fast <4 x float> %wide.load12, %wide.load
  %6 = fsub fast <4 x float> %broadcast.splat14, %5
  %7 = getelementptr inbounds float, float* %D, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  store <4 x float> %6, <4 x float>* %8, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
581
; D[i..i+3] = A[i..i+3] * splat(C) - B[i..i+3]: the splat is a multiplicand
; (second fmul operand) and a loaded vector is subtracted from the product.
; The entry block asserts (n & 7) == 0 via llvm.assume and skips the loop when
; n <= 0. Expected assembly: C is moved from s0 to r12 once; in the loop the
; vector loaded through r1 is negated (vneg.f32) and then accumulated into
; with "vfma.f32 q0, q1, r12".
define arm_aapcs_vfpcc void @test_fms(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fms:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:  .LBB12_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vneg.f32 q0, q0
; CHECK-NEXT:    vfma.f32 q0, q1, r12
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    bne .LBB12_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds float, float* %A, i32 %index
  %2 = bitcast float* %1 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %2, align 4
  %3 = fmul fast <4 x float> %wide.load, %broadcast.splat13
  %4 = getelementptr inbounds float, float* %B, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  %wide.load14 = load <4 x float>, <4 x float>* %5, align 4
  %6 = fsub fast <4 x float> %3, %wide.load14
  %7 = getelementptr inbounds float, float* %D, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  store <4 x float> %6, <4 x float>* %8, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
632
; Same loop as test_fms but with the fmul operands reversed: splat(C) is the
; first operand and the loaded vector the second. The expected assembly is
; unchanged: vneg.f32 on the vector loaded through r1, then
; "vfma.f32 q0, q1, r12".
define arm_aapcs_vfpcc void @test_fms_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fms_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:  .LBB13_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    vneg.f32 q0, q0
; CHECK-NEXT:    vfma.f32 q0, q1, r12
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    bne .LBB13_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds float, float* %A, i32 %index
  %2 = bitcast float* %1 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %2, align 4
  %3 = fmul fast <4 x float> %broadcast.splat13, %wide.load
  %4 = getelementptr inbounds float, float* %B, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  %wide.load14 = load <4 x float>, <4 x float>* %5, align 4
  %6 = fsub fast <4 x float> %3, %wide.load14
  %7 = getelementptr inbounds float, float* %D, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  store <4 x float> %6, <4 x float>* %8, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
683
684
; test_nested: a two-level loop nest over %numRows x %numCols floats.
; For every outer iteration one scalar is loaded through the %pOutT1 cursor
; and splatted; the inner loop then walks the current row four floats at a
; time and updates it in place:
;   pInT1[j..j+3] = pInT1[j..j+3] - pPRT_in[j..j+3] * splat
; %pPRT_pDst and %in are never referenced in the body, and %l only feeds an
; llvm.assume.
; The expected assembly below keeps the outer loop as a low-overhead loop
; (dls/le on lr), leaves the inner loop as a plain subs/bne loop, and shows
; the fmul+fsub pair selected as a single vfms.f32.
685define dso_local void @test_nested(float* noalias nocapture %pInT1, float* noalias nocapture readonly %pOutT1, float* noalias nocapture readonly %pPRT_in, float* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l, float %in) local_unnamed_addr #0 {
686; CHECK-LABEL: test_nested:
687; CHECK: @ %bb.0: @ %for.body.us.preheader
688; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
689; CHECK-NEXT: push {r4, r5, r6, r7, lr}
690; CHECK-NEXT: ldrd lr, r12, [sp, #20]
691; CHECK-NEXT: lsl.w r3, r12, #2
692; CHECK-NEXT: dls lr, lr
693; CHECK-NEXT: .LBB14_1: @ %for.body.us
694; CHECK-NEXT: @ =>This Loop Header: Depth=1
695; CHECK-NEXT: @ Child Loop BB14_2 Depth 2
David Greenb3499f52020-03-20 09:23:57 +0000696; CHECK-NEXT: ldr r4, [r1]
David Green9cf920e2020-03-20 08:25:19 +0000697; CHECK-NEXT: mov r5, r12
David Green9cf920e2020-03-20 08:25:19 +0000698; CHECK-NEXT: vdup.32 q0, r4
699; CHECK-NEXT: movs r4, #0
700; CHECK-NEXT: .LBB14_2: @ %vector.body
701; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1
702; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
703; CHECK-NEXT: adds r6, r0, r4
704; CHECK-NEXT: adds r7, r2, r4
705; CHECK-NEXT: vldrw.u32 q1, [r7]
706; CHECK-NEXT: vldrw.u32 q2, [r6]
707; CHECK-NEXT: adds r4, #16
708; CHECK-NEXT: subs r5, #4
709; CHECK-NEXT: vfms.f32 q2, q1, q0
710; CHECK-NEXT: vstrw.32 q2, [r6]
711; CHECK-NEXT: bne .LBB14_2
712; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us
713; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1
714; CHECK-NEXT: add r0, r3
715; CHECK-NEXT: add r2, r3
716; CHECK-NEXT: adds r1, #4
717; CHECK-NEXT: le lr, .LBB14_1
718; CHECK-NEXT: @ %bb.4: @ %for.end14
719; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
720for.body.us.preheader:
; Preconditions handed to the optimizer via llvm.assume: numRows > 0,
; numCols > 0, numCols is a multiple of 8, and l < numCols (signed).
721 %cmp = icmp sgt i32 %numRows, 0
722 tail call void @llvm.assume(i1 %cmp)
723 %cmp1 = icmp sgt i32 %numCols, 0
724 tail call void @llvm.assume(i1 %cmp1)
725 %rem = and i32 %numCols, 7
726 %cmp2 = icmp eq i32 %rem, 0
727 tail call void @llvm.assume(i1 %cmp2)
728 %cmp3 = icmp slt i32 %l, %numCols
729 tail call void @llvm.assume(i1 %cmp3)
730 br label %for.body.us
731
; Outer loop: %i.037.us counts rows up to %numRows. The %pInT1 and %pPRT_in
; cursors advance by %numCols floats per row, the %pOutT1 cursor by one float.
; The scalar read through the %pOutT1 cursor is splatted once per row.
732for.body.us: ; preds = %for.cond6.for.end_crit_edge.us, %for.body.us.preheader
733 %pInT1.addr.038.us = phi float* [ %scevgep40, %for.cond6.for.end_crit_edge.us ], [ %pInT1, %for.body.us.preheader ]
734 %i.037.us = phi i32 [ %inc13.us, %for.cond6.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ]
735 %pOutT1.addr.036.us = phi float* [ %incdec.ptr.us, %for.cond6.for.end_crit_edge.us ], [ %pOutT1, %for.body.us.preheader ]
736 %pPRT_in.addr.035.us = phi float* [ %scevgep, %for.cond6.for.end_crit_edge.us ], [ %pPRT_in, %for.body.us.preheader ]
737 %scevgep = getelementptr float, float* %pPRT_in.addr.035.us, i32 %numCols
738 %0 = load float, float* %pOutT1.addr.036.us, align 4
739 %broadcast.splatinsert47 = insertelement <4 x float> undef, float %0, i32 0
740 %broadcast.splat48 = shufflevector <4 x float> %broadcast.splatinsert47, <4 x float> undef, <4 x i32> zeroinitializer
741 br label %vector.body
742
; Inner loop: %index steps by 4 from 0 until it equals %numCols. Each step
; loads one <4 x float> from each row, subtracts the scaled %pPRT_in vector
; from the %pInT1 vector, and stores the result back to the %pInT1 row.
743vector.body: ; preds = %vector.body, %for.body.us
744 %index = phi i32 [ 0, %for.body.us ], [ %index.next, %vector.body ]
745 %next.gep = getelementptr float, float* %pInT1.addr.038.us, i32 %index
746 %next.gep45 = getelementptr float, float* %pPRT_in.addr.035.us, i32 %index
747 %1 = bitcast float* %next.gep to <4 x float>*
748 %wide.load = load <4 x float>, <4 x float>* %1, align 4
749 %2 = bitcast float* %next.gep45 to <4 x float>*
750 %wide.load46 = load <4 x float>, <4 x float>* %2, align 4
751 %3 = fmul fast <4 x float> %wide.load46, %broadcast.splat48
752 %4 = fsub fast <4 x float> %wide.load, %3
753 store <4 x float> %4, <4 x float>* %1, align 4
754 %index.next = add i32 %index, 4
755 %5 = icmp eq i32 %index.next, %numCols
756 br i1 %5, label %for.cond6.for.end_crit_edge.us, label %vector.body
757
; Outer-loop latch: bump the row cursors and exit once %numRows rows are done.
758for.cond6.for.end_crit_edge.us: ; preds = %vector.body
759 %incdec.ptr.us = getelementptr inbounds float, float* %pOutT1.addr.036.us, i32 1
760 %scevgep40 = getelementptr float, float* %pInT1.addr.038.us, i32 %numCols
761 %inc13.us = add nuw nsw i32 %i.037.us, 1
762 %exitcond41 = icmp eq i32 %inc13.us, %numRows
763 br i1 %exitcond41, label %for.end14, label %for.body.us
764
765for.end14: ; preds = %for.cond6.for.end_crit_edge.us
766 ret void
767}
768
; Instance record read by the functions that follow. Going by the value names
; used at the access sites: field 0 (i16) is loaded as %numTaps3, field 1
; (float*) as %pState1, and field 2 (float*) as %pCoeffs2.
769%struct.arm_fir_instance_f32 = type { i16, float*, float* }
; arm_fir_f32_1_4_mve: four-coefficient filter kernel followed by a state
; buffer shift.
;   Phase 1 (if.then .. while.end) runs only when numTaps - 1 is
;   unsigned-less-than 4. It copies %pSrc into the state buffer starting at
;   state[numTaps - 1] and writes to %pDst the sum of four overlapping state
;   vectors, each scaled by one of the coefficients at pCoeffs[0..3]. Full
;   vectors are handled in %while.body; the last blockSize & 3 lanes are
;   handled in %while.end with vctp32-masked stores.
;   Phase 2 (if.end .. if.end61) always runs. It copies from
;   state + blockSize to the start of the state buffer: numTaps / 4 full
;   vectors in %while.body51, then a masked store of numTaps & 3 lanes.
; In the expected assembly both while loops become wls/le low-overhead loops,
; the masked stores become vctp.32 + vpst + vstrwt.32, and the splatted
; coefficients are consumed as general-purpose register operands of
; vmul.f32/vfma.f32.
770define void @arm_fir_f32_1_4_mve(%struct.arm_fir_instance_f32* nocapture readonly %S, float* nocapture readonly %pSrc, float* %pDst, i32 %blockSize) {
771; CHECK-LABEL: arm_fir_f32_1_4_mve:
772; CHECK: @ %bb.0: @ %entry
David Green37b9cc82020-03-25 11:35:53 +0000773; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
774; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
David Greenfa152552020-05-13 14:35:32 +0100775; CHECK-NEXT: .pad #16
776; CHECK-NEXT: sub sp, #16
777; CHECK-NEXT: ldrh r5, [r0]
778; CHECK-NEXT: mov r6, r3
779; CHECK-NEXT: ldr.w r12, [r0, #4]
780; CHECK-NEXT: sub.w lr, r5, #1
781; CHECK-NEXT: cmp.w lr, #3
David Green9cf920e2020-03-20 08:25:19 +0000782; CHECK-NEXT: bhi .LBB15_6
783; CHECK-NEXT: @ %bb.1: @ %if.then
David Greenfa152552020-05-13 14:35:32 +0100784; CHECK-NEXT: ldr r4, [r0, #8]
785; CHECK-NEXT: ldr r3, [r4, #12]
786; CHECK-NEXT: ldm.w r4, {r7, r8, r9}
787; CHECK-NEXT: add.w r4, r12, lr, lsl #2
788; CHECK-NEXT: lsr.w lr, r6, #2
David Green9cf920e2020-03-20 08:25:19 +0000789; CHECK-NEXT: wls lr, lr, .LBB15_5
790; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
David Greenfa152552020-05-13 14:35:32 +0100791; CHECK-NEXT: strd r6, r5, [sp, #8] @ 8-byte Folded Spill
792; CHECK-NEXT: bic r5, r6, #3
793; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
794; CHECK-NEXT: mov.w r10, #0
795; CHECK-NEXT: mov r0, r5
796; CHECK-NEXT: add.w r5, r2, r5, lsl #2
797; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
David Green9cf920e2020-03-20 08:25:19 +0000798; CHECK-NEXT: .LBB15_3: @ %while.body
799; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
David Greenfa152552020-05-13 14:35:32 +0100800; CHECK-NEXT: add.w r11, r1, r10
801; CHECK-NEXT: add.w r5, r4, r10
802; CHECK-NEXT: vldrw.u32 q0, [r11]
803; CHECK-NEXT: add.w r6, r12, r10
804; CHECK-NEXT: vstrw.32 q0, [r5]
805; CHECK-NEXT: add.w r5, r2, r10
806; CHECK-NEXT: vldrw.u32 q0, [r6]
807; CHECK-NEXT: vldrw.u32 q1, [r6, #4]
808; CHECK-NEXT: vldrw.u32 q2, [r6, #12]
809; CHECK-NEXT: add.w r10, r10, #16
810; CHECK-NEXT: vmul.f32 q0, q0, r7
811; CHECK-NEXT: vfma.f32 q0, q1, r8
812; CHECK-NEXT: vldrw.u32 q1, [r6, #8]
813; CHECK-NEXT: vfma.f32 q0, q1, r9
814; CHECK-NEXT: vfma.f32 q0, q2, r3
815; CHECK-NEXT: vstrw.32 q0, [r5]
David Green9cf920e2020-03-20 08:25:19 +0000816; CHECK-NEXT: le lr, .LBB15_3
817; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit
David Greenfa152552020-05-13 14:35:32 +0100818; CHECK-NEXT: ldr r5, [sp, #12] @ 4-byte Reload
819; CHECK-NEXT: add r4, r10
820; CHECK-NEXT: add.w r12, r12, r0, lsl #2
821; CHECK-NEXT: add.w r1, r1, r0, lsl #2
822; CHECK-NEXT: ldm.w sp, {r0, r2, r6} @ 12-byte Folded Reload
David Green9cf920e2020-03-20 08:25:19 +0000823; CHECK-NEXT: .LBB15_5: @ %while.end
David Greenfa152552020-05-13 14:35:32 +0100824; CHECK-NEXT: and lr, r6, #3
825; CHECK-NEXT: vldrw.u32 q0, [r1]
826; CHECK-NEXT: vctp.32 lr
David Green9cf920e2020-03-20 08:25:19 +0000827; CHECK-NEXT: vpst
David Greenfa152552020-05-13 14:35:32 +0100828; CHECK-NEXT: vstrwt.32 q0, [r4]
829; CHECK-NEXT: vldrw.u32 q0, [r12]
830; CHECK-NEXT: vldrw.u32 q1, [r12, #4]
David Green37b9cc82020-03-25 11:35:53 +0000831; CHECK-NEXT: vmul.f32 q0, q0, r7
David Greenfa152552020-05-13 14:35:32 +0100832; CHECK-NEXT: vfma.f32 q0, q1, r8
833; CHECK-NEXT: vldrw.u32 q1, [r12, #8]
David Green37b9cc82020-03-25 11:35:53 +0000834; CHECK-NEXT: vfma.f32 q0, q1, r9
David Greenfa152552020-05-13 14:35:32 +0100835; CHECK-NEXT: vldrw.u32 q1, [r12, #12]
836; CHECK-NEXT: vfma.f32 q0, q1, r3
David Green9cf920e2020-03-20 08:25:19 +0000837; CHECK-NEXT: vpst
David Green37b9cc82020-03-25 11:35:53 +0000838; CHECK-NEXT: vstrwt.32 q0, [r2]
David Greenfa152552020-05-13 14:35:32 +0100839; CHECK-NEXT: ldr.w r12, [r0, #4]
David Green9cf920e2020-03-20 08:25:19 +0000840; CHECK-NEXT: .LBB15_6: @ %if.end
David Greenfa152552020-05-13 14:35:32 +0100841; CHECK-NEXT: add.w r0, r12, r6, lsl #2
842; CHECK-NEXT: lsr.w lr, r5, #2
David Green9cf920e2020-03-20 08:25:19 +0000843; CHECK-NEXT: wls lr, lr, .LBB15_10
844; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader
David Greenfa152552020-05-13 14:35:32 +0100845; CHECK-NEXT: bic r2, r5, #3
846; CHECK-NEXT: mov r3, r12
847; CHECK-NEXT: adds r1, r2, r6
848; CHECK-NEXT: add.w r1, r12, r1, lsl #2
David Green9cf920e2020-03-20 08:25:19 +0000849; CHECK-NEXT: .LBB15_8: @ %while.body51
850; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
851; CHECK-NEXT: vldrw.u32 q0, [r0], #16
852; CHECK-NEXT: vstrb.8 q0, [r3], #16
853; CHECK-NEXT: le lr, .LBB15_8
854; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit
David Greenfa152552020-05-13 14:35:32 +0100855; CHECK-NEXT: add.w r12, r12, r2, lsl #2
David Green9cf920e2020-03-20 08:25:19 +0000856; CHECK-NEXT: mov r0, r1
857; CHECK-NEXT: .LBB15_10: @ %while.end55
David Greenfa152552020-05-13 14:35:32 +0100858; CHECK-NEXT: ands r1, r5, #3
David Green9cf920e2020-03-20 08:25:19 +0000859; CHECK-NEXT: beq .LBB15_12
860; CHECK-NEXT: @ %bb.11: @ %if.then59
861; CHECK-NEXT: vldrw.u32 q0, [r0]
862; CHECK-NEXT: vctp.32 r1
863; CHECK-NEXT: vpst
David Greenfa152552020-05-13 14:35:32 +0100864; CHECK-NEXT: vstrwt.32 q0, [r12]
David Green9cf920e2020-03-20 08:25:19 +0000865; CHECK-NEXT: .LBB15_12: @ %if.end61
David Greenfa152552020-05-13 14:35:32 +0100866; CHECK-NEXT: add sp, #16
David Green37b9cc82020-03-25 11:35:53 +0000867; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
David Green9cf920e2020-03-20 08:25:19 +0000868entry:
; Load the three fields of %S. %sub = numTaps - 1 lets one unsigned compare
; select the 1..4-tap path: numTaps == 0 wraps to -1 and fails the test.
869 %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1
870 %0 = load float*, float** %pState1, align 4
871 %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2
872 %1 = load float*, float** %pCoeffs2, align 4
873 %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0
874 %2 = load i16, i16* %numTaps3, align 4
875 %conv = zext i16 %2 to i32
876 %sub = add nsw i32 %conv, -1
877 %cmp = icmp ult i32 %sub, 4
878 br i1 %cmp, label %if.then, label %if.end
879
; Fetch the four coefficients at %1[0..3] (all four are loaded regardless of
; numTaps), splat each into a vector, and skip the main loop when
; blockSize / 4 is zero.
880if.then: ; preds = %entry
881 %arrayidx = getelementptr inbounds float, float* %0, i32 %sub
882 %incdec.ptr = getelementptr inbounds float, float* %1, i32 1
883 %3 = load float, float* %1, align 4
884 %incdec.ptr6 = getelementptr inbounds float, float* %1, i32 2
885 %4 = load float, float* %incdec.ptr, align 4
886 %incdec.ptr7 = getelementptr inbounds float, float* %1, i32 3
887 %5 = load float, float* %incdec.ptr6, align 4
888 %6 = load float, float* %incdec.ptr7, align 4
889 %shr = lshr i32 %blockSize, 2
890 %cmp9146 = icmp eq i32 %shr, 0
891 %.pre161 = insertelement <4 x float> undef, float %3, i32 0
892 %.pre162 = shufflevector <4 x float> %.pre161, <4 x float> undef, <4 x i32> zeroinitializer
893 %.pre163 = insertelement <4 x float> undef, float %4, i32 0
894 %.pre164 = shufflevector <4 x float> %.pre163, <4 x float> undef, <4 x i32> zeroinitializer
895 %.pre165 = insertelement <4 x float> undef, float %5, i32 0
896 %.pre166 = shufflevector <4 x float> %.pre165, <4 x float> undef, <4 x i32> zeroinitializer
897 %.pre167 = insertelement <4 x float> undef, float %6, i32 0
898 %.pre168 = shufflevector <4 x float> %.pre167, <4 x float> undef, <4 x i32> zeroinitializer
899 br i1 %cmp9146, label %while.end, label %while.body.lr.ph
900
; %7 is blockSize rounded down to a multiple of 4; it is used to form the
; post-loop pointers.
901while.body.lr.ph: ; preds = %if.then
902 %7 = and i32 %blockSize, -4
903 %scevgep158 = getelementptr float, float* %pDst, i32 %7
904 br label %while.body
905
; Main loop, one <4 x float> of output per iteration: copy four inputs from
; the source cursor into the state cursor, then sum four state loads taken at
; float offsets 0..3 from %pSamples, each multiplied by its coefficient splat.
; All four cursors advance by 4 floats; %blkCnt counts down from blockSize / 4.
906while.body: ; preds = %while.body.lr.ph, %while.body
907 %pStateCur.0151 = phi float* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.body ]
908 %pSamples.0150 = phi float* [ %0, %while.body.lr.ph ], [ %add.ptr24, %while.body ]
909 %pOutput.0149 = phi float* [ %pDst, %while.body.lr.ph ], [ %add.ptr23, %while.body ]
910 %pTempSrc.0148 = phi float* [ %pSrc, %while.body.lr.ph ], [ %add.ptr11, %while.body ]
911 %blkCnt.0147 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ]
912 %8 = bitcast float* %pTempSrc.0148 to <4 x float>*
913 %9 = load <4 x float>, <4 x float>* %8, align 4
914 %10 = bitcast float* %pStateCur.0151 to <4 x float>*
915 store <4 x float> %9, <4 x float>* %10, align 4
916 %add.ptr = getelementptr inbounds float, float* %pStateCur.0151, i32 4
917 %add.ptr11 = getelementptr inbounds float, float* %pTempSrc.0148, i32 4
918 %11 = bitcast float* %pSamples.0150 to <4 x float>*
919 %12 = load <4 x float>, <4 x float>* %11, align 4
920 %13 = fmul fast <4 x float> %12, %.pre162
921 %arrayidx12 = getelementptr inbounds float, float* %pSamples.0150, i32 1
922 %14 = bitcast float* %arrayidx12 to <4 x float>*
923 %15 = load <4 x float>, <4 x float>* %14, align 4
924 %mul = fmul fast <4 x float> %15, %.pre164
925 %add = fadd fast <4 x float> %mul, %13
926 %arrayidx13 = getelementptr inbounds float, float* %pSamples.0150, i32 2
927 %16 = bitcast float* %arrayidx13 to <4 x float>*
928 %17 = load <4 x float>, <4 x float>* %16, align 4
929 %mul16 = fmul fast <4 x float> %17, %.pre166
930 %add17 = fadd fast <4 x float> %add, %mul16
931 %arrayidx18 = getelementptr inbounds float, float* %pSamples.0150, i32 3
932 %18 = bitcast float* %arrayidx18 to <4 x float>*
933 %19 = load <4 x float>, <4 x float>* %18, align 4
934 %mul21 = fmul fast <4 x float> %19, %.pre168
935 %add22 = fadd fast <4 x float> %add17, %mul21
936 %20 = bitcast float* %pOutput.0149 to <4 x float>*
937 store <4 x float> %add22, <4 x float>* %20, align 4
938 %add.ptr23 = getelementptr inbounds float, float* %pOutput.0149, i32 4
939 %add.ptr24 = getelementptr inbounds float, float* %pSamples.0150, i32 4
940 %dec = add nsw i32 %blkCnt.0147, -1
941 %cmp9 = icmp eq i32 %dec, 0
942 br i1 %cmp9, label %while.end.loopexit, label %while.body
943
944while.end.loopexit: ; preds = %while.body
945 %scevgep157 = getelementptr float, float* %pSrc, i32 %7
946 %scevgep159 = getelementptr float, float* %0, i32 %7
947 br label %while.end
948
; Tail, reached unconditionally from both predecessors: the same copy and
; four-term sum as the main loop, but both stores are masked by
; vctp32(blockSize & 3). The loads are ordinary full-width vector loads.
; The state pointer is then re-read from %S for phase 2.
949while.end: ; preds = %if.then, %while.end.loopexit
950 %pTempSrc.0.lcssa = phi float* [ %scevgep157, %while.end.loopexit ], [ %pSrc, %if.then ]
951 %pOutput.0.lcssa = phi float* [ %scevgep158, %while.end.loopexit ], [ %pDst, %if.then ]
952 %pSamples.0.lcssa = phi float* [ %scevgep159, %while.end.loopexit ], [ %0, %if.then ]
953 %pStateCur.0.lcssa = phi float* [ %add.ptr, %while.end.loopexit ], [ %arrayidx, %if.then ]
954 %and = and i32 %blockSize, 3
955 %21 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %and)
956 %22 = bitcast float* %pTempSrc.0.lcssa to <4 x float>*
957 %23 = load <4 x float>, <4 x float>* %22, align 4
958 %24 = bitcast float* %pStateCur.0.lcssa to <4 x float>*
959 tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %23, <4 x float>* %24, i32 4, <4 x i1> %21)
960 %25 = bitcast float* %pSamples.0.lcssa to <4 x float>*
961 %26 = load <4 x float>, <4 x float>* %25, align 4
962 %27 = fmul fast <4 x float> %26, %.pre162
963 %arrayidx29 = getelementptr inbounds float, float* %pSamples.0.lcssa, i32 1
964 %28 = bitcast float* %arrayidx29 to <4 x float>*
965 %29 = load <4 x float>, <4 x float>* %28, align 4
966 %mul32 = fmul fast <4 x float> %29, %.pre164
967 %add33 = fadd fast <4 x float> %mul32, %27
968 %arrayidx34 = getelementptr inbounds float, float* %pSamples.0.lcssa, i32 2
969 %30 = bitcast float* %arrayidx34 to <4 x float>*
970 %31 = load <4 x float>, <4 x float>* %30, align 4
971 %mul37 = fmul fast <4 x float> %31, %.pre166
972 %add38 = fadd fast <4 x float> %add33, %mul37
973 %arrayidx39 = getelementptr inbounds float, float* %pSamples.0.lcssa, i32 3
974 %32 = bitcast float* %arrayidx39 to <4 x float>*
975 %33 = load <4 x float>, <4 x float>* %32, align 4
976 %mul42 = fmul fast <4 x float> %33, %.pre168
977 %add43 = fadd fast <4 x float> %add38, %mul42
978 %34 = bitcast float* %pOutput.0.lcssa to <4 x float>*
979 tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %add43, <4 x float>* %34, i32 4, <4 x i1> %21)
980 %.pre = load float*, float** %pState1, align 4
981 br label %if.end
982
; Phase 2: shift the state buffer. The source starts at %35 + blockSize and
; the destination at %35; the loop runs numTaps / 4 times.
983if.end: ; preds = %while.end, %entry
984 %35 = phi float* [ %.pre, %while.end ], [ %0, %entry ]
985 %arrayidx45 = getelementptr inbounds float, float* %35, i32 %blockSize
986 %shr47 = lshr i32 %conv, 2
987 %cmp49141 = icmp eq i32 %shr47, 0
988 br i1 %cmp49141, label %while.end55, label %while.body51.preheader
989
; %36 is %conv with its low two bits cleared. %conv is a zero-extended i16,
; which is why the mask is 65532 rather than -4.
990while.body51.preheader: ; preds = %if.end
991 %36 = and i32 %conv, 65532
992 %37 = add i32 %36, %blockSize
993 %scevgep = getelementptr float, float* %35, i32 %37
994 br label %while.body51
995
; Copy loop: one <4 x float> per iteration, both cursors advancing by 4 floats.
996while.body51: ; preds = %while.body51.preheader, %while.body51
997 %pTempSrc.1144 = phi float* [ %add.ptr52, %while.body51 ], [ %arrayidx45, %while.body51.preheader ]
998 %pTempDest.0143 = phi float* [ %add.ptr53, %while.body51 ], [ %35, %while.body51.preheader ]
999 %blkCnt.1142 = phi i32 [ %dec54, %while.body51 ], [ %shr47, %while.body51.preheader ]
1000 %38 = bitcast float* %pTempSrc.1144 to <4 x float>*
1001 %39 = load <4 x float>, <4 x float>* %38, align 4
1002 %40 = bitcast float* %pTempDest.0143 to <4 x float>*
1003 store <4 x float> %39, <4 x float>* %40, align 4
1004 %add.ptr52 = getelementptr inbounds float, float* %pTempSrc.1144, i32 4
1005 %add.ptr53 = getelementptr inbounds float, float* %pTempDest.0143, i32 4
1006 %dec54 = add nsw i32 %blkCnt.1142, -1
1007 %cmp49 = icmp eq i32 %dec54, 0
1008 br i1 %cmp49, label %while.end55.loopexit, label %while.body51
1009
1010while.end55.loopexit: ; preds = %while.body51
1011 %scevgep156 = getelementptr float, float* %35, i32 %36
1012 br label %while.end55
1013
; Unlike the phase 1 tail, this remainder is skipped entirely when
; numTaps & 3 is zero.
1014while.end55: ; preds = %while.end55.loopexit, %if.end
1015 %pTempDest.0.lcssa = phi float* [ %35, %if.end ], [ %scevgep156, %while.end55.loopexit ]
1016 %pTempSrc.1.lcssa = phi float* [ %arrayidx45, %if.end ], [ %scevgep, %while.end55.loopexit ]
1017 %and56 = and i32 %conv, 3
1018 %cmp57 = icmp eq i32 %and56, 0
1019 br i1 %cmp57, label %if.end61, label %if.then59
1020
; Remainder: full-width load, store masked by vctp32(numTaps & 3).
1021if.then59: ; preds = %while.end55
1022 %41 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %and56)
1023 %42 = bitcast float* %pTempSrc.1.lcssa to <4 x float>*
1024 %43 = load <4 x float>, <4 x float>* %42, align 4
1025 %44 = bitcast float* %pTempDest.0.lcssa to <4 x float>*
1026 tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %43, <4 x float>* %44, i32 4, <4 x i1> %41)
1027 br label %if.end61
1028
1029if.end61: ; preds = %while.end55, %if.then59
1030 ret void
1031}
1032
1033
1034define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* nocapture readonly %pSrc, float* nocapture %pDst, i32 %blockSize) {
1035; CHECK-LABEL: fir:
1036; CHECK: @ %bb.0: @ %entry
1037; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
1038; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
1039; CHECK-NEXT: .pad #4
1040; CHECK-NEXT: sub sp, #4
David Greenb3499f52020-03-20 09:23:57 +00001041; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
1042; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
David Green892af452020-04-22 13:30:22 +01001043; CHECK-NEXT: .pad #32
1044; CHECK-NEXT: sub sp, #32
David Green9cf920e2020-03-20 08:25:19 +00001045; CHECK-NEXT: cmp r3, #8
1046; CHECK-NEXT: blo.w .LBB16_12
1047; CHECK-NEXT: @ %bb.1: @ %if.then
1048; CHECK-NEXT: movs r7, #0
1049; CHECK-NEXT: cmp.w r7, r3, lsr #2
1050; CHECK-NEXT: beq.w .LBB16_12
1051; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
David Green892af452020-04-22 13:30:22 +01001052; CHECK-NEXT: ldrh r5, [r0]
1053; CHECK-NEXT: lsr.w r9, r3, #2
1054; CHECK-NEXT: ldrd r8, r12, [r0, #4]
1055; CHECK-NEXT: movs r3, #1
1056; CHECK-NEXT: sub.w r0, r5, #8
David Green9cf920e2020-03-20 08:25:19 +00001057; CHECK-NEXT: add.w r7, r0, r0, lsr #29
David Greenb3499f52020-03-20 09:23:57 +00001058; CHECK-NEXT: and r0, r0, #7
David Green892af452020-04-22 13:30:22 +01001059; CHECK-NEXT: asrs r6, r7, #3
1060; CHECK-NEXT: cmp r6, #1
David Green9cf920e2020-03-20 08:25:19 +00001061; CHECK-NEXT: it gt
David Green892af452020-04-22 13:30:22 +01001062; CHECK-NEXT: asrgt r3, r7, #3
1063; CHECK-NEXT: add.w r7, r8, r5, lsl #2
David Green9cf920e2020-03-20 08:25:19 +00001064; CHECK-NEXT: sub.w r11, r7, #4
David Green892af452020-04-22 13:30:22 +01001065; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
1066; CHECK-NEXT: rsbs r3, r5, #0
1067; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
1068; CHECK-NEXT: add.w r3, r12, #32
1069; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
David Greenb3499f52020-03-20 09:23:57 +00001070; CHECK-NEXT: adds r0, #1
David Green892af452020-04-22 13:30:22 +01001071; CHECK-NEXT: str r5, [sp, #16] @ 4-byte Spill
1072; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
1073; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
David Green9cf920e2020-03-20 08:25:19 +00001074; CHECK-NEXT: b .LBB16_4
1075; CHECK-NEXT: .LBB16_3: @ %while.end
1076; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
David Green892af452020-04-22 13:30:22 +01001077; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
1078; CHECK-NEXT: subs.w r9, r9, #1
1079; CHECK-NEXT: ldrd r11, r1, [sp, #24] @ 8-byte Folded Reload
David Green9cf920e2020-03-20 08:25:19 +00001080; CHECK-NEXT: vstrb.8 q0, [r2], #16
David Greenb3499f52020-03-20 09:23:57 +00001081; CHECK-NEXT: add.w r0, r8, r0, lsl #2
David Green892af452020-04-22 13:30:22 +01001082; CHECK-NEXT: add.w r8, r0, #16
David Greenb3499f52020-03-20 09:23:57 +00001083; CHECK-NEXT: beq .LBB16_12
David Green9cf920e2020-03-20 08:25:19 +00001084; CHECK-NEXT: .LBB16_4: @ %while.body
1085; CHECK-NEXT: @ =>This Loop Header: Depth=1
1086; CHECK-NEXT: @ Child Loop BB16_6 Depth 2
1087; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
David Greenb3499f52020-03-20 09:23:57 +00001088; CHECK-NEXT: vldrw.u32 q0, [r1], #16
David Green892af452020-04-22 13:30:22 +01001089; CHECK-NEXT: ldrd r0, r7, [r12]
1090; CHECK-NEXT: ldrd r4, r6, [r12, #8]
1091; CHECK-NEXT: ldrd r5, r3, [r12, #16]
1092; CHECK-NEXT: ldrd lr, r10, [r12, #24]
David Greenb3499f52020-03-20 09:23:57 +00001093; CHECK-NEXT: vstrb.8 q0, [r11], #16
David Green892af452020-04-22 13:30:22 +01001094; CHECK-NEXT: vldrw.u32 q0, [r8], #32
Jean-Michel Gorius7019cea2020-05-21 16:30:48 +02001095; CHECK-NEXT: strd r11, r1, [sp, #24] @ 8-byte Folded Spill
David Green892af452020-04-22 13:30:22 +01001096; CHECK-NEXT: vldrw.u32 q1, [r8, #-28]
David Greenb3499f52020-03-20 09:23:57 +00001097; CHECK-NEXT: vmul.f32 q0, q0, r0
David Green892af452020-04-22 13:30:22 +01001098; CHECK-NEXT: vldrw.u32 q6, [r8, #-24]
1099; CHECK-NEXT: vldrw.u32 q4, [r8, #-20]
1100; CHECK-NEXT: vfma.f32 q0, q1, r7
1101; CHECK-NEXT: vldrw.u32 q5, [r8, #-16]
1102; CHECK-NEXT: vfma.f32 q0, q6, r4
1103; CHECK-NEXT: vldrw.u32 q2, [r8, #-12]
1104; CHECK-NEXT: vfma.f32 q0, q4, r6
1105; CHECK-NEXT: vldrw.u32 q3, [r8, #-8]
1106; CHECK-NEXT: vfma.f32 q0, q5, r5
David Green892af452020-04-22 13:30:22 +01001107; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
Jean-Michel Gorius7019cea2020-05-21 16:30:48 +02001108; CHECK-NEXT: vfma.f32 q0, q2, r3
1109; CHECK-NEXT: vldrw.u32 q1, [r8, #-4]
David Green892af452020-04-22 13:30:22 +01001110; CHECK-NEXT: vfma.f32 q0, q3, lr
David Green892af452020-04-22 13:30:22 +01001111; CHECK-NEXT: cmp r0, #16
Jean-Michel Gorius7019cea2020-05-21 16:30:48 +02001112; CHECK-NEXT: vfma.f32 q0, q1, r10
David Greenb3499f52020-03-20 09:23:57 +00001113; CHECK-NEXT: blo .LBB16_7
David Green9cf920e2020-03-20 08:25:19 +00001114; CHECK-NEXT: @ %bb.5: @ %for.body.preheader
1115; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
David Green892af452020-04-22 13:30:22 +01001116; CHECK-NEXT: ldr.w lr, [sp] @ 4-byte Reload
David Green9cf920e2020-03-20 08:25:19 +00001117; CHECK-NEXT: dls lr, lr
David Green892af452020-04-22 13:30:22 +01001118; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
David Green9cf920e2020-03-20 08:25:19 +00001119; CHECK-NEXT: .LBB16_6: @ %for.body
1120; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
1121; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
David Green892af452020-04-22 13:30:22 +01001122; CHECK-NEXT: ldm.w r7, {r0, r3, r4, r5, r6, r10, r11}
1123; CHECK-NEXT: vldrw.u32 q1, [r8], #32
1124; CHECK-NEXT: vldrw.u32 q6, [r8, #-24]
1125; CHECK-NEXT: vldrw.u32 q4, [r8, #-20]
David Green9cf920e2020-03-20 08:25:19 +00001126; CHECK-NEXT: vfma.f32 q0, q1, r0
David Green892af452020-04-22 13:30:22 +01001127; CHECK-NEXT: vldrw.u32 q1, [r8, #-28]
1128; CHECK-NEXT: vldrw.u32 q5, [r8, #-16]
1129; CHECK-NEXT: vldrw.u32 q2, [r8, #-12]
David Greenb3499f52020-03-20 09:23:57 +00001130; CHECK-NEXT: vfma.f32 q0, q1, r3
David Green892af452020-04-22 13:30:22 +01001131; CHECK-NEXT: vldrw.u32 q3, [r8, #-8]
David Greenb3499f52020-03-20 09:23:57 +00001132; CHECK-NEXT: vfma.f32 q0, q6, r4
David Green892af452020-04-22 13:30:22 +01001133; CHECK-NEXT: ldr r1, [r7, #28]
1134; CHECK-NEXT: vfma.f32 q0, q4, r5
1135; CHECK-NEXT: vldrw.u32 q1, [r8, #-4]
1136; CHECK-NEXT: vfma.f32 q0, q5, r6
1137; CHECK-NEXT: adds r7, #32
1138; CHECK-NEXT: vfma.f32 q0, q2, r10
1139; CHECK-NEXT: vfma.f32 q0, q3, r11
1140; CHECK-NEXT: vfma.f32 q0, q1, r1
David Green9cf920e2020-03-20 08:25:19 +00001141; CHECK-NEXT: le lr, .LBB16_6
David Greenb3499f52020-03-20 09:23:57 +00001142; CHECK-NEXT: b .LBB16_8
1143; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1
David Green892af452020-04-22 13:30:22 +01001144; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
David Greenb3499f52020-03-20 09:23:57 +00001145; CHECK-NEXT: .LBB16_8: @ %for.end
David Green9cf920e2020-03-20 08:25:19 +00001146; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
David Green892af452020-04-22 13:30:22 +01001147; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
David Greenb3499f52020-03-20 09:23:57 +00001148; CHECK-NEXT: cmp r0, #0
1149; CHECK-NEXT: beq .LBB16_3
1150; CHECK-NEXT: @ %bb.9: @ %while.body76.preheader
David Green9cf920e2020-03-20 08:25:19 +00001151; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
David Green892af452020-04-22 13:30:22 +01001152; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
1153; CHECK-NEXT: mov r4, r8
David Green9cf920e2020-03-20 08:25:19 +00001154; CHECK-NEXT: .LBB16_10: @ %while.body76
1155; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
1156; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
David Green892af452020-04-22 13:30:22 +01001157; CHECK-NEXT: ldr r1, [r7], #4
1158; CHECK-NEXT: vldrw.u32 q1, [r4], #4
1159; CHECK-NEXT: subs r0, #1
David Greenb3499f52020-03-20 09:23:57 +00001160; CHECK-NEXT: vfma.f32 q0, q1, r1
David Green892af452020-04-22 13:30:22 +01001161; CHECK-NEXT: cmp r0, #1
David Green9cf920e2020-03-20 08:25:19 +00001162; CHECK-NEXT: bgt .LBB16_10
1163; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit
1164; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
David Green892af452020-04-22 13:30:22 +01001165; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
David Greenb3499f52020-03-20 09:23:57 +00001166; CHECK-NEXT: add.w r8, r8, r0, lsl #2
David Green9cf920e2020-03-20 08:25:19 +00001167; CHECK-NEXT: b .LBB16_3
1168; CHECK-NEXT: .LBB16_12: @ %if.end
David Green892af452020-04-22 13:30:22 +01001169; CHECK-NEXT: add sp, #32
David Greenb3499f52020-03-20 09:23:57 +00001170; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
David Green9cf920e2020-03-20 08:25:19 +00001171; CHECK-NEXT: add sp, #4
1172; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
1173entry:
1174 %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1
1175 %0 = load float*, float** %pState1, align 4
1176 %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2
1177 %1 = load float*, float** %pCoeffs2, align 4
1178 %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0
1179 %2 = load i16, i16* %numTaps3, align 4
1180 %conv = zext i16 %2 to i32
1181 %cmp = icmp ugt i32 %blockSize, 7
1182 br i1 %cmp, label %if.then, label %if.end
1183
1184if.then: ; preds = %entry
1185 %shr = lshr i32 %blockSize, 2
1186 %cmp5217 = icmp eq i32 %shr, 0
1187 br i1 %cmp5217, label %if.end, label %while.body.lr.ph
1188
1189while.body.lr.ph: ; preds = %if.then
1190 %sub = add nsw i32 %conv, -1
1191 %arrayidx = getelementptr inbounds float, float* %0, i32 %sub
1192 %incdec.ptr = getelementptr inbounds float, float* %1, i32 1
1193 %incdec.ptr7 = getelementptr inbounds float, float* %1, i32 2
1194 %incdec.ptr8 = getelementptr inbounds float, float* %1, i32 3
1195 %incdec.ptr9 = getelementptr inbounds float, float* %1, i32 4
1196 %incdec.ptr10 = getelementptr inbounds float, float* %1, i32 5
1197 %incdec.ptr11 = getelementptr inbounds float, float* %1, i32 6
1198 %incdec.ptr12 = getelementptr inbounds float, float* %1, i32 7
1199 %sub37 = add nsw i32 %conv, -8
1200 %div = sdiv i32 %sub37, 8
1201 %pCoeffsCur.0199 = getelementptr inbounds float, float* %1, i32 8
1202 %cmp38201 = icmp ugt i16 %2, 15
1203 %and = and i32 %sub37, 7
1204 %cmp74210 = icmp eq i32 %and, 0
1205 %idx.neg = sub nsw i32 0, %conv
1206 %3 = icmp sgt i32 %div, 1
1207 %smax = select i1 %3, i32 %div, i32 1
1208 br label %while.body
1209
1210while.body: ; preds = %while.body.lr.ph, %while.end
1211 %blkCnt.0222 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec84, %while.end ]
1212 %pStateCur.0221 = phi float* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.end ]
1213 %pSamples.0220 = phi float* [ %0, %while.body.lr.ph ], [ %add.ptr83, %while.end ]
1214 %pTempSrc.0219 = phi float* [ %pSrc, %while.body.lr.ph ], [ %add.ptr14, %while.end ]
1215 %pOutput.0218 = phi float* [ %pDst, %while.body.lr.ph ], [ %add.ptr81, %while.end ]
1216 %4 = load float, float* %1, align 4
1217 %5 = load float, float* %incdec.ptr, align 4
1218 %6 = load float, float* %incdec.ptr7, align 4
1219 %7 = load float, float* %incdec.ptr8, align 4
1220 %8 = load float, float* %incdec.ptr9, align 4
1221 %9 = load float, float* %incdec.ptr10, align 4
1222 %10 = load float, float* %incdec.ptr11, align 4
1223 %11 = load float, float* %incdec.ptr12, align 4
1224 %12 = bitcast float* %pTempSrc.0219 to <4 x float>*
1225 %13 = load <4 x float>, <4 x float>* %12, align 4
1226 %14 = bitcast float* %pStateCur.0221 to <4 x float>*
1227 store <4 x float> %13, <4 x float>* %14, align 4
1228 %add.ptr = getelementptr inbounds float, float* %pStateCur.0221, i32 4
1229 %add.ptr14 = getelementptr inbounds float, float* %pTempSrc.0219, i32 4
1230 %15 = bitcast float* %pSamples.0220 to <4 x float>*
1231 %16 = load <4 x float>, <4 x float>* %15, align 4
1232 %.splatinsert = insertelement <4 x float> undef, float %4, i32 0
1233 %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
1234 %17 = fmul fast <4 x float> %16, %.splat
1235 %arrayidx15 = getelementptr inbounds float, float* %pSamples.0220, i32 1
1236 %18 = bitcast float* %arrayidx15 to <4 x float>*
1237 %19 = load <4 x float>, <4 x float>* %18, align 4
1238 %.splatinsert16 = insertelement <4 x float> undef, float %5, i32 0
1239 %.splat17 = shufflevector <4 x float> %.splatinsert16, <4 x float> undef, <4 x i32> zeroinitializer
1240 %20 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %19, <4 x float> %.splat17, <4 x float> %17)
1241 %arrayidx18 = getelementptr inbounds float, float* %pSamples.0220, i32 2
1242 %21 = bitcast float* %arrayidx18 to <4 x float>*
1243 %22 = load <4 x float>, <4 x float>* %21, align 4
1244 %.splatinsert19 = insertelement <4 x float> undef, float %6, i32 0
1245 %.splat20 = shufflevector <4 x float> %.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer
1246 %23 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %22, <4 x float> %.splat20, <4 x float> %20)
1247 %arrayidx21 = getelementptr inbounds float, float* %pSamples.0220, i32 3
1248 %24 = bitcast float* %arrayidx21 to <4 x float>*
1249 %25 = load <4 x float>, <4 x float>* %24, align 4
1250 %.splatinsert22 = insertelement <4 x float> undef, float %7, i32 0
1251 %.splat23 = shufflevector <4 x float> %.splatinsert22, <4 x float> undef, <4 x i32> zeroinitializer
1252 %26 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %25, <4 x float> %.splat23, <4 x float> %23)
1253 %arrayidx24 = getelementptr inbounds float, float* %pSamples.0220, i32 4
1254 %27 = bitcast float* %arrayidx24 to <4 x float>*
1255 %28 = load <4 x float>, <4 x float>* %27, align 4
1256 %.splatinsert25 = insertelement <4 x float> undef, float %8, i32 0
1257 %.splat26 = shufflevector <4 x float> %.splatinsert25, <4 x float> undef, <4 x i32> zeroinitializer
1258 %29 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %28, <4 x float> %.splat26, <4 x float> %26)
1259 %arrayidx27 = getelementptr inbounds float, float* %pSamples.0220, i32 5
1260 %30 = bitcast float* %arrayidx27 to <4 x float>*
1261 %31 = load <4 x float>, <4 x float>* %30, align 4
1262 %.splatinsert28 = insertelement <4 x float> undef, float %9, i32 0
1263 %.splat29 = shufflevector <4 x float> %.splatinsert28, <4 x float> undef, <4 x i32> zeroinitializer
1264 %32 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %31, <4 x float> %.splat29, <4 x float> %29)
1265 %arrayidx30 = getelementptr inbounds float, float* %pSamples.0220, i32 6
1266 %33 = bitcast float* %arrayidx30 to <4 x float>*
1267 %34 = load <4 x float>, <4 x float>* %33, align 4
1268 %.splatinsert31 = insertelement <4 x float> undef, float %10, i32 0
1269 %.splat32 = shufflevector <4 x float> %.splatinsert31, <4 x float> undef, <4 x i32> zeroinitializer
1270 %35 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %34, <4 x float> %.splat32, <4 x float> %32)
1271 %arrayidx33 = getelementptr inbounds float, float* %pSamples.0220, i32 7
1272 %36 = bitcast float* %arrayidx33 to <4 x float>*
1273 %37 = load <4 x float>, <4 x float>* %36, align 4
1274 %.splatinsert34 = insertelement <4 x float> undef, float %11, i32 0
1275 %.splat35 = shufflevector <4 x float> %.splatinsert34, <4 x float> undef, <4 x i32> zeroinitializer
1276 %38 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %37, <4 x float> %.splat35, <4 x float> %35)
1277 %pSamples.1200 = getelementptr inbounds float, float* %pSamples.0220, i32 8
1278 br i1 %cmp38201, label %for.body, label %for.end
1279
1280for.body: ; preds = %while.body, %for.body
1281 %pSamples.1207 = phi float* [ %pSamples.1, %for.body ], [ %pSamples.1200, %while.body ]
1282 %pCoeffsCur.0206 = phi float* [ %pCoeffsCur.0, %for.body ], [ %pCoeffsCur.0199, %while.body ]
1283 %.pn205 = phi float* [ %pCoeffsCur.0206, %for.body ], [ %1, %while.body ]
1284 %i.0204 = phi i32 [ %inc, %for.body ], [ 0, %while.body ]
1285 %vecAcc0.0203 = phi <4 x float> [ %70, %for.body ], [ %38, %while.body ]
1286 %pSamples.0.pn202 = phi float* [ %pSamples.1207, %for.body ], [ %pSamples.0220, %while.body ]
1287 %incdec.ptr40 = getelementptr inbounds float, float* %.pn205, i32 9
1288 %39 = load float, float* %pCoeffsCur.0206, align 4
1289 %incdec.ptr41 = getelementptr inbounds float, float* %.pn205, i32 10
1290 %40 = load float, float* %incdec.ptr40, align 4
1291 %incdec.ptr42 = getelementptr inbounds float, float* %.pn205, i32 11
1292 %41 = load float, float* %incdec.ptr41, align 4
1293 %incdec.ptr43 = getelementptr inbounds float, float* %.pn205, i32 12
1294 %42 = load float, float* %incdec.ptr42, align 4
1295 %incdec.ptr44 = getelementptr inbounds float, float* %.pn205, i32 13
1296 %43 = load float, float* %incdec.ptr43, align 4
1297 %incdec.ptr45 = getelementptr inbounds float, float* %.pn205, i32 14
1298 %44 = load float, float* %incdec.ptr44, align 4
1299 %incdec.ptr46 = getelementptr inbounds float, float* %.pn205, i32 15
1300 %45 = load float, float* %incdec.ptr45, align 4
1301 %46 = load float, float* %incdec.ptr46, align 4
1302 %47 = bitcast float* %pSamples.1207 to <4 x float>*
1303 %48 = load <4 x float>, <4 x float>* %47, align 4
1304 %.splatinsert48 = insertelement <4 x float> undef, float %39, i32 0
1305 %.splat49 = shufflevector <4 x float> %.splatinsert48, <4 x float> undef, <4 x i32> zeroinitializer
1306 %49 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %48, <4 x float> %.splat49, <4 x float> %vecAcc0.0203)
1307 %arrayidx50 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 9
1308 %50 = bitcast float* %arrayidx50 to <4 x float>*
1309 %51 = load <4 x float>, <4 x float>* %50, align 4
1310 %.splatinsert51 = insertelement <4 x float> undef, float %40, i32 0
1311 %.splat52 = shufflevector <4 x float> %.splatinsert51, <4 x float> undef, <4 x i32> zeroinitializer
1312 %52 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %51, <4 x float> %.splat52, <4 x float> %49)
1313 %arrayidx53 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 10
1314 %53 = bitcast float* %arrayidx53 to <4 x float>*
1315 %54 = load <4 x float>, <4 x float>* %53, align 4
1316 %.splatinsert54 = insertelement <4 x float> undef, float %41, i32 0
1317 %.splat55 = shufflevector <4 x float> %.splatinsert54, <4 x float> undef, <4 x i32> zeroinitializer
1318 %55 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %54, <4 x float> %.splat55, <4 x float> %52)
1319 %arrayidx56 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 11
1320 %56 = bitcast float* %arrayidx56 to <4 x float>*
1321 %57 = load <4 x float>, <4 x float>* %56, align 4
1322 %.splatinsert57 = insertelement <4 x float> undef, float %42, i32 0
1323 %.splat58 = shufflevector <4 x float> %.splatinsert57, <4 x float> undef, <4 x i32> zeroinitializer
1324 %58 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %57, <4 x float> %.splat58, <4 x float> %55)
1325 %arrayidx59 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 12
1326 %59 = bitcast float* %arrayidx59 to <4 x float>*
1327 %60 = load <4 x float>, <4 x float>* %59, align 4
1328 %.splatinsert60 = insertelement <4 x float> undef, float %43, i32 0
1329 %.splat61 = shufflevector <4 x float> %.splatinsert60, <4 x float> undef, <4 x i32> zeroinitializer
1330 %61 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %60, <4 x float> %.splat61, <4 x float> %58)
1331 %arrayidx62 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 13
1332 %62 = bitcast float* %arrayidx62 to <4 x float>*
1333 %63 = load <4 x float>, <4 x float>* %62, align 4
1334 %.splatinsert63 = insertelement <4 x float> undef, float %44, i32 0
1335 %.splat64 = shufflevector <4 x float> %.splatinsert63, <4 x float> undef, <4 x i32> zeroinitializer
1336 %64 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %63, <4 x float> %.splat64, <4 x float> %61)
1337 %arrayidx65 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 14
1338 %65 = bitcast float* %arrayidx65 to <4 x float>*
1339 %66 = load <4 x float>, <4 x float>* %65, align 4
1340 %.splatinsert66 = insertelement <4 x float> undef, float %45, i32 0
1341 %.splat67 = shufflevector <4 x float> %.splatinsert66, <4 x float> undef, <4 x i32> zeroinitializer
1342 %67 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %66, <4 x float> %.splat67, <4 x float> %64)
1343 %arrayidx68 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 15
1344 %68 = bitcast float* %arrayidx68 to <4 x float>*
1345 %69 = load <4 x float>, <4 x float>* %68, align 4
1346 %.splatinsert69 = insertelement <4 x float> undef, float %46, i32 0
1347 %.splat70 = shufflevector <4 x float> %.splatinsert69, <4 x float> undef, <4 x i32> zeroinitializer
1348 %70 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %69, <4 x float> %.splat70, <4 x float> %67)
1349 %inc = add nuw nsw i32 %i.0204, 1
1350 %pCoeffsCur.0 = getelementptr inbounds float, float* %pCoeffsCur.0206, i32 8
1351 %pSamples.1 = getelementptr inbounds float, float* %pSamples.1207, i32 8
1352 %exitcond = icmp eq i32 %inc, %smax
1353 br i1 %exitcond, label %for.end, label %for.body
1354
1355for.end: ; preds = %for.body, %while.body
1356 %vecAcc0.0.lcssa = phi <4 x float> [ %38, %while.body ], [ %70, %for.body ]
1357 %pCoeffsCur.0.lcssa = phi float* [ %pCoeffsCur.0199, %while.body ], [ %pCoeffsCur.0, %for.body ]
1358 %pSamples.1.lcssa = phi float* [ %pSamples.1200, %while.body ], [ %pSamples.1, %for.body ]
1359 br i1 %cmp74210, label %while.end, label %while.body76
1360
1361while.body76: ; preds = %for.end, %while.body76
1362 %pCoeffsCur.1214 = phi float* [ %incdec.ptr77, %while.body76 ], [ %pCoeffsCur.0.lcssa, %for.end ]
1363 %vecAcc0.1213 = phi <4 x float> [ %74, %while.body76 ], [ %vecAcc0.0.lcssa, %for.end ]
1364 %numCnt.0212 = phi i32 [ %dec, %while.body76 ], [ %and, %for.end ]
1365 %pSamples.2211 = phi float* [ %incdec.ptr80, %while.body76 ], [ %pSamples.1.lcssa, %for.end ]
1366 %incdec.ptr77 = getelementptr inbounds float, float* %pCoeffsCur.1214, i32 1
1367 %71 = load float, float* %pCoeffsCur.1214, align 4
1368 %72 = bitcast float* %pSamples.2211 to <4 x float>*
1369 %73 = load <4 x float>, <4 x float>* %72, align 4
1370 %.splatinsert78 = insertelement <4 x float> undef, float %71, i32 0
1371 %.splat79 = shufflevector <4 x float> %.splatinsert78, <4 x float> undef, <4 x i32> zeroinitializer
1372 %74 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %73, <4 x float> %.splat79, <4 x float> %vecAcc0.1213)
1373 %incdec.ptr80 = getelementptr inbounds float, float* %pSamples.2211, i32 1
1374 %dec = add nsw i32 %numCnt.0212, -1
1375 %cmp74 = icmp sgt i32 %numCnt.0212, 1
1376 br i1 %cmp74, label %while.body76, label %while.end.loopexit
1377
1378while.end.loopexit: ; preds = %while.body76
1379 %scevgep = getelementptr float, float* %pSamples.1.lcssa, i32 %and
1380 br label %while.end
1381
1382while.end: ; preds = %while.end.loopexit, %for.end
1383 %pSamples.2.lcssa = phi float* [ %pSamples.1.lcssa, %for.end ], [ %scevgep, %while.end.loopexit ]
1384 %vecAcc0.1.lcssa = phi <4 x float> [ %vecAcc0.0.lcssa, %for.end ], [ %74, %while.end.loopexit ]
1385 %75 = bitcast float* %pOutput.0218 to <4 x float>*
1386 store <4 x float> %vecAcc0.1.lcssa, <4 x float>* %75, align 4
1387 %add.ptr81 = getelementptr inbounds float, float* %pOutput.0218, i32 4
1388 %add.ptr82 = getelementptr inbounds float, float* %pSamples.2.lcssa, i32 4
1389 %add.ptr83 = getelementptr inbounds float, float* %add.ptr82, i32 %idx.neg
1390 %dec84 = add nsw i32 %blkCnt.0222, -1
1391 %cmp5 = icmp eq i32 %dec84, 0
1392 br i1 %cmp5, label %if.end, label %while.body
1393
1394if.end: ; preds = %while.end, %if.then, %entry
1395 ret void
1396}
1397
1398declare void @llvm.assume(i1) ; takes one i1 condition, returns nothing; test_fadd calls it on (n & 7) == 0
1399declare <4 x i1> @llvm.arm.mve.vctp32(i32) ; ARM MVE intrinsic: i32 in, <4 x i1> out; no call site is visible in this part of the file
1400declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) ; 4-lane float fma; the loops above call it as (loaded samples, splatted coefficient, running accumulator)
1401declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) ; stores a <4 x float> through a pointer under a <4 x i1> mask; the i32 must be an immediate (presumably the alignment; confirm against the LangRef)