blob: 9b3398ef5126921877ea41de32a7c95bf6073c95 [file] [log] [blame]
David Green0ac4f6b2020-02-17 11:41:16 +00001; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
3
; Test: multiply two <4 x i32> vectors lane by lane and add-reduce the products
; to a single i32 with llvm.experimental.vector.reduce.add.v4i32.
; Expected assembly: one vmul.i32 on q0/q1 followed by vaddv.u32 into r0.
4define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
5; CHECK-LABEL: add_v4i32_v4i32:
6; CHECK: @ %bb.0: @ %entry
7; CHECK-NEXT: vmul.i32 q0, q0, q1
8; CHECK-NEXT: vaddv.u32 r0, q0
9; CHECK-NEXT: bx lr
10entry:
11 %m = mul <4 x i32> %x, %y
12 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
13 ret i32 %z
14}
15
; Test: zero-extend both <4 x i32> inputs to <4 x i64>, multiply them and
; add-reduce the four 64-bit products to an i64.
; Expected assembly is scalarised: the first two lanes are multiplied with
; umull and summed with adds/adcs, the remaining two lanes are masked with the
; constant-pool vector .LCPI1_0 (0xffffffff, 0, 0xffffffff, 0) via vand and
; accumulated into r0/r1 with umlal. q4 (d8, d9) is used as scratch, so it is
; saved and restored with vpush/vpop.
16define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
17; CHECK-LABEL: add_v4i32_v4i64_zext:
18; CHECK: @ %bb.0: @ %entry
19; CHECK-NEXT: .vsave {d8, d9}
20; CHECK-NEXT: vpush {d8, d9}
21; CHECK-NEXT: vmov.f32 s8, s4
22; CHECK-NEXT: vmov.f32 s12, s0
23; CHECK-NEXT: vmov.f32 s10, s5
24; CHECK-NEXT: vmov.f32 s14, s1
25; CHECK-NEXT: vmov r0, s8
26; CHECK-NEXT: vmov r1, s12
27; CHECK-NEXT: umull r0, r1, r1, r0
28; CHECK-NEXT: vmov.32 q4[0], r0
29; CHECK-NEXT: vmov r0, s10
30; CHECK-NEXT: vmov.32 q4[1], r1
31; CHECK-NEXT: vmov r1, s14
32; CHECK-NEXT: vmov.f32 s8, s6
33; CHECK-NEXT: vmov.f32 s12, s2
34; CHECK-NEXT: vmov.f32 s10, s7
35; CHECK-NEXT: vmov.f32 s14, s3
36; CHECK-NEXT: umull r0, r1, r1, r0
37; CHECK-NEXT: vmov.32 q4[2], r0
38; CHECK-NEXT: vmov.32 q4[3], r1
39; CHECK-NEXT: vmov r0, s18
40; CHECK-NEXT: vmov r3, s16
41; CHECK-NEXT: vmov r2, s17
42; CHECK-NEXT: adds r0, r0, r3
43; CHECK-NEXT: adcs r1, r2
44; CHECK-NEXT: adr r2, .LCPI1_0
45; CHECK-NEXT: vldrw.u32 q1, [r2]
46; CHECK-NEXT: vand q2, q2, q1
47; CHECK-NEXT: vand q0, q3, q1
48; CHECK-NEXT: vmov r2, s8
49; CHECK-NEXT: vmov r3, s0
50; CHECK-NEXT: umlal r0, r1, r3, r2
51; CHECK-NEXT: vmov r2, s10
52; CHECK-NEXT: vmov r3, s2
53; CHECK-NEXT: umlal r0, r1, r3, r2
54; CHECK-NEXT: vpop {d8, d9}
55; CHECK-NEXT: bx lr
56; CHECK-NEXT: .p2align 4
57; CHECK-NEXT: @ %bb.1:
58; CHECK-NEXT: .LCPI1_0:
59; CHECK-NEXT: .long 4294967295 @ 0xffffffff
60; CHECK-NEXT: .long 0 @ 0x0
61; CHECK-NEXT: .long 4294967295 @ 0xffffffff
62; CHECK-NEXT: .long 0 @ 0x0
63entry:
64 %xx = zext <4 x i32> %x to <4 x i64>
65 %yy = zext <4 x i32> %y to <4 x i64>
66 %m = mul <4 x i64> %xx, %yy
67 %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
68 ret i64 %z
69}
70
; Test: sign-extend both <4 x i32> inputs to <4 x i64>, multiply them and
; add-reduce the four 64-bit products to an i64.
; Expected assembly is scalarised like the zero-extending variant, but uses
; the signed forms: two smull products summed with adds/adcs, then two smlal
; accumulations into r0/r1. No mask constant is loaded. q4 (d8, d9) is used as
; scratch and saved/restored with vpush/vpop.
71define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
72; CHECK-LABEL: add_v4i32_v4i64_sext:
73; CHECK: @ %bb.0: @ %entry
74; CHECK-NEXT: .vsave {d8, d9}
75; CHECK-NEXT: vpush {d8, d9}
76; CHECK-NEXT: vmov.f32 s8, s4
77; CHECK-NEXT: vmov.f32 s12, s0
78; CHECK-NEXT: vmov.f32 s10, s5
79; CHECK-NEXT: vmov.f32 s14, s1
80; CHECK-NEXT: vmov r0, s8
81; CHECK-NEXT: vmov r1, s12
82; CHECK-NEXT: smull r0, r1, r1, r0
83; CHECK-NEXT: vmov.32 q4[0], r0
84; CHECK-NEXT: vmov r0, s10
85; CHECK-NEXT: vmov.32 q4[1], r1
86; CHECK-NEXT: vmov r1, s14
87; CHECK-NEXT: vmov.f32 s8, s6
88; CHECK-NEXT: vmov.f32 s10, s7
89; CHECK-NEXT: vmov.f32 s4, s2
90; CHECK-NEXT: vmov.f32 s6, s3
91; CHECK-NEXT: smull r0, r1, r1, r0
92; CHECK-NEXT: vmov.32 q4[2], r0
93; CHECK-NEXT: vmov.32 q4[3], r1
94; CHECK-NEXT: vmov r0, s18
95; CHECK-NEXT: vmov r3, s16
96; CHECK-NEXT: vmov r2, s17
97; CHECK-NEXT: adds r0, r0, r3
98; CHECK-NEXT: vmov r3, s4
99; CHECK-NEXT: adcs r1, r2
100; CHECK-NEXT: vmov r2, s8
101; CHECK-NEXT: smlal r0, r1, r3, r2
102; CHECK-NEXT: vmov r2, s10
103; CHECK-NEXT: vmov r3, s6
104; CHECK-NEXT: smlal r0, r1, r3, r2
105; CHECK-NEXT: vpop {d8, d9}
106; CHECK-NEXT: bx lr
107entry:
108 %xx = sext <4 x i32> %x to <4 x i64>
109 %yy = sext <4 x i32> %y to <4 x i64>
110 %m = mul <4 x i64> %xx, %yy
111 %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
112 ret i64 %z
113}
114
; Test: zero-extend both <2 x i32> inputs to <2 x i64>, multiply them and
; add-reduce the two products to an i64.
; Expected assembly: the lanes are read from s0/s2 and s4/s6 into core
; registers, the first product is formed with umull and the second is
; accumulated on top of it with umlal, leaving the sum in r0/r1.
115define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
116; CHECK-LABEL: add_v2i32_v2i64_zext:
117; CHECK: @ %bb.0: @ %entry
118; CHECK-NEXT: vmov r0, s4
119; CHECK-NEXT: vmov r1, s0
120; CHECK-NEXT: vmov r2, s6
121; CHECK-NEXT: vmov r3, s2
122; CHECK-NEXT: umull r0, r1, r1, r0
123; CHECK-NEXT: umlal r0, r1, r3, r2
124; CHECK-NEXT: bx lr
125entry:
126 %xx = zext <2 x i32> %x to <2 x i64>
127 %yy = zext <2 x i32> %y to <2 x i64>
128 %m = mul <2 x i64> %xx, %yy
129 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
130 ret i64 %z
131}
132
; Test: sign-extend both <2 x i32> inputs to <2 x i64>, multiply them and
; add-reduce the two products to an i64.
; Expected assembly: same shape as the zero-extending variant, using the
; signed smull for the first product and smlal to accumulate the second into
; r0/r1.
133define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
134; CHECK-LABEL: add_v2i32_v2i64_sext:
135; CHECK: @ %bb.0: @ %entry
136; CHECK-NEXT: vmov r0, s4
137; CHECK-NEXT: vmov r1, s0
138; CHECK-NEXT: vmov r2, s6
139; CHECK-NEXT: vmov r3, s2
140; CHECK-NEXT: smull r0, r1, r1, r0
141; CHECK-NEXT: smlal r0, r1, r3, r2
142; CHECK-NEXT: bx lr
143entry:
144 %xx = sext <2 x i32> %x to <2 x i64>
145 %yy = sext <2 x i32> %y to <2 x i64>
146 %m = mul <2 x i64> %xx, %yy
147 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
148 ret i64 %z
149}
150
; Test: zero-extend both <8 x i16> inputs to <8 x i32>, multiply them and
; add-reduce the eight products to an i32.
; Expected assembly: each input is split into lanes 4-7 and lanes 0-3, which
; are moved one at a time (vmov.u16 / vmov.32) into 32-bit-lane vectors and
; widened with vmovlb.u16. Each half is multiplied with vmul.i32, the two
; partial products are combined with vadd.i32 and the result is reduced with
; a single vaddv.u32.
151define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
152; CHECK-LABEL: add_v8i16_v8i32_zext:
153; CHECK: @ %bb.0: @ %entry
154; CHECK-NEXT: vmov.u16 r0, q1[4]
155; CHECK-NEXT: vmov.32 q2[0], r0
156; CHECK-NEXT: vmov.u16 r0, q1[5]
157; CHECK-NEXT: vmov.32 q2[1], r0
158; CHECK-NEXT: vmov.u16 r0, q1[6]
159; CHECK-NEXT: vmov.32 q2[2], r0
160; CHECK-NEXT: vmov.u16 r0, q1[7]
161; CHECK-NEXT: vmov.32 q2[3], r0
162; CHECK-NEXT: vmov.u16 r0, q0[4]
163; CHECK-NEXT: vmov.32 q3[0], r0
164; CHECK-NEXT: vmov.u16 r0, q0[5]
165; CHECK-NEXT: vmov.32 q3[1], r0
166; CHECK-NEXT: vmov.u16 r0, q0[6]
167; CHECK-NEXT: vmov.32 q3[2], r0
168; CHECK-NEXT: vmov.u16 r0, q0[7]
169; CHECK-NEXT: vmov.32 q3[3], r0
170; CHECK-NEXT: vmovlb.u16 q2, q2
171; CHECK-NEXT: vmovlb.u16 q3, q3
172; CHECK-NEXT: vmov.u16 r0, q1[0]
173; CHECK-NEXT: vmul.i32 q2, q3, q2
174; CHECK-NEXT: vmov.32 q3[0], r0
175; CHECK-NEXT: vmov.u16 r0, q1[1]
176; CHECK-NEXT: vmov.32 q3[1], r0
177; CHECK-NEXT: vmov.u16 r0, q1[2]
178; CHECK-NEXT: vmov.32 q3[2], r0
179; CHECK-NEXT: vmov.u16 r0, q1[3]
180; CHECK-NEXT: vmov.32 q3[3], r0
181; CHECK-NEXT: vmov.u16 r0, q0[0]
182; CHECK-NEXT: vmovlb.u16 q1, q3
183; CHECK-NEXT: vmov.32 q3[0], r0
184; CHECK-NEXT: vmov.u16 r0, q0[1]
185; CHECK-NEXT: vmov.32 q3[1], r0
186; CHECK-NEXT: vmov.u16 r0, q0[2]
187; CHECK-NEXT: vmov.32 q3[2], r0
188; CHECK-NEXT: vmov.u16 r0, q0[3]
189; CHECK-NEXT: vmov.32 q3[3], r0
190; CHECK-NEXT: vmovlb.u16 q0, q3
191; CHECK-NEXT: vmul.i32 q0, q0, q1
192; CHECK-NEXT: vadd.i32 q0, q0, q2
193; CHECK-NEXT: vaddv.u32 r0, q0
194; CHECK-NEXT: bx lr
195entry:
196 %xx = zext <8 x i16> %x to <8 x i32>
197 %yy = zext <8 x i16> %y to <8 x i32>
198 %m = mul <8 x i32> %xx, %yy
199 %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
200 ret i32 %z
201}
202
; Test: sign-extend both <8 x i16> inputs to <8 x i32>, multiply them and
; add-reduce the eight products to an i32.
; Expected assembly: identical in shape to the zero-extending variant (lanes
; 4-7 and 0-3 rebuilt with vmov.u16 / vmov.32, two vmul.i32, one vadd.i32, one
; vaddv.u32), except that the widening uses the signed vmovlb.s16.
203define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
204; CHECK-LABEL: add_v8i16_v8i32_sext:
205; CHECK: @ %bb.0: @ %entry
206; CHECK-NEXT: vmov.u16 r0, q1[4]
207; CHECK-NEXT: vmov.32 q2[0], r0
208; CHECK-NEXT: vmov.u16 r0, q1[5]
209; CHECK-NEXT: vmov.32 q2[1], r0
210; CHECK-NEXT: vmov.u16 r0, q1[6]
211; CHECK-NEXT: vmov.32 q2[2], r0
212; CHECK-NEXT: vmov.u16 r0, q1[7]
213; CHECK-NEXT: vmov.32 q2[3], r0
214; CHECK-NEXT: vmov.u16 r0, q0[4]
215; CHECK-NEXT: vmov.32 q3[0], r0
216; CHECK-NEXT: vmov.u16 r0, q0[5]
217; CHECK-NEXT: vmov.32 q3[1], r0
218; CHECK-NEXT: vmov.u16 r0, q0[6]
219; CHECK-NEXT: vmov.32 q3[2], r0
220; CHECK-NEXT: vmov.u16 r0, q0[7]
221; CHECK-NEXT: vmov.32 q3[3], r0
222; CHECK-NEXT: vmovlb.s16 q2, q2
223; CHECK-NEXT: vmovlb.s16 q3, q3
224; CHECK-NEXT: vmov.u16 r0, q1[0]
225; CHECK-NEXT: vmul.i32 q2, q3, q2
226; CHECK-NEXT: vmov.32 q3[0], r0
227; CHECK-NEXT: vmov.u16 r0, q1[1]
228; CHECK-NEXT: vmov.32 q3[1], r0
229; CHECK-NEXT: vmov.u16 r0, q1[2]
230; CHECK-NEXT: vmov.32 q3[2], r0
231; CHECK-NEXT: vmov.u16 r0, q1[3]
232; CHECK-NEXT: vmov.32 q3[3], r0
233; CHECK-NEXT: vmov.u16 r0, q0[0]
234; CHECK-NEXT: vmovlb.s16 q1, q3
235; CHECK-NEXT: vmov.32 q3[0], r0
236; CHECK-NEXT: vmov.u16 r0, q0[1]
237; CHECK-NEXT: vmov.32 q3[1], r0
238; CHECK-NEXT: vmov.u16 r0, q0[2]
239; CHECK-NEXT: vmov.32 q3[2], r0
240; CHECK-NEXT: vmov.u16 r0, q0[3]
241; CHECK-NEXT: vmov.32 q3[3], r0
242; CHECK-NEXT: vmovlb.s16 q0, q3
243; CHECK-NEXT: vmul.i32 q0, q0, q1
244; CHECK-NEXT: vadd.i32 q0, q0, q2
245; CHECK-NEXT: vaddv.u32 r0, q0
246; CHECK-NEXT: bx lr
247entry:
248 %xx = sext <8 x i16> %x to <8 x i32>
249 %yy = sext <8 x i16> %y to <8 x i32>
250 %m = mul <8 x i32> %xx, %yy
251 %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
252 ret i32 %z
253}
254
; Test: zero-extend both <4 x i16> inputs to <4 x i32>, multiply them and
; add-reduce the four products to an i32.
; Expected assembly: both operands are widened in place with vmovlb.u16, then
; multiplied with vmul.i32 and reduced with vaddv.u32.
255define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
256; CHECK-LABEL: add_v4i16_v4i32_zext:
257; CHECK: @ %bb.0: @ %entry
258; CHECK-NEXT: vmovlb.u16 q1, q1
259; CHECK-NEXT: vmovlb.u16 q0, q0
260; CHECK-NEXT: vmul.i32 q0, q0, q1
261; CHECK-NEXT: vaddv.u32 r0, q0
262; CHECK-NEXT: bx lr
263entry:
264 %xx = zext <4 x i16> %x to <4 x i32>
265 %yy = zext <4 x i16> %y to <4 x i32>
266 %m = mul <4 x i32> %xx, %yy
267 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
268 ret i32 %z
269}
270
; Test: sign-extend both <4 x i16> inputs to <4 x i32>, multiply them and
; add-reduce the four products to an i32.
; Expected assembly: both operands are widened in place with vmovlb.s16, then
; multiplied with vmul.i32 and reduced with vaddv.u32.
271define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
272; CHECK-LABEL: add_v4i16_v4i32_sext:
273; CHECK: @ %bb.0: @ %entry
274; CHECK-NEXT: vmovlb.s16 q1, q1
275; CHECK-NEXT: vmovlb.s16 q0, q0
276; CHECK-NEXT: vmul.i32 q0, q0, q1
277; CHECK-NEXT: vaddv.u32 r0, q0
278; CHECK-NEXT: bx lr
279entry:
280 %xx = sext <4 x i16> %x to <4 x i32>
281 %yy = sext <4 x i16> %y to <4 x i32>
282 %m = mul <4 x i32> %xx, %yy
283 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
284 ret i32 %z
285}
286
; Test: multiply two <8 x i16> vectors lane by lane and add-reduce the
; products to a single i16 (no extension of the inputs).
; Expected assembly: vmul.i16 followed by vaddv.u16; the trailing uxth
; zero-extends the 16-bit result for the zeroext return value.
287define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
288; CHECK-LABEL: add_v8i16_v8i16:
289; CHECK: @ %bb.0: @ %entry
290; CHECK-NEXT: vmul.i16 q0, q0, q1
291; CHECK-NEXT: vaddv.u16 r0, q0
292; CHECK-NEXT: uxth r0, r0
293; CHECK-NEXT: bx lr
294entry:
295 %m = mul <8 x i16> %x, %y
296 %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
297 ret i16 %z
298}
299
; Test: zero-extend both <8 x i16> inputs to <8 x i64>, multiply them and
; add-reduce the eight 64-bit products to an i64.
; Expected assembly is scalarised and works on the lanes two at a time: each
; pair is extracted with vmov.u16 into lanes 0 and 2 of a vector, masked with
; the constant-pool vector .LCPI10_0 (0xffff, 0, 0xffff, 0) via vand, and
; multiplied with umull. The running 64-bit sum is carried through adds/adcs
; (using r12 as a temporary for the low word), and the final pair of lanes is
; folded in with two umlal instructions into r0/r1. q4 and q5 (d8-d11) are
; used as scratch and saved/restored with vpush/vpop.
300define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
301; CHECK-LABEL: add_v8i16_v8i64_zext:
302; CHECK: @ %bb.0: @ %entry
303; CHECK-NEXT: .vsave {d8, d9, d10, d11}
304; CHECK-NEXT: vpush {d8, d9, d10, d11}
305; CHECK-NEXT: vmov.u16 r0, q1[0]
306; CHECK-NEXT: vmov.u16 r1, q0[0]
307; CHECK-NEXT: vmov.32 q3[0], r0
308; CHECK-NEXT: vmov.u16 r0, q1[1]
309; CHECK-NEXT: vmov.32 q3[2], r0
310; CHECK-NEXT: adr r0, .LCPI10_0
311; CHECK-NEXT: vldrw.u32 q2, [r0]
312; CHECK-NEXT: vmov.32 q4[0], r1
313; CHECK-NEXT: vmov.u16 r1, q0[1]
314; CHECK-NEXT: vmov.32 q4[2], r1
315; CHECK-NEXT: vand q3, q3, q2
316; CHECK-NEXT: vand q4, q4, q2
317; CHECK-NEXT: vmov r0, s12
318; CHECK-NEXT: vmov r1, s16
319; CHECK-NEXT: umull r0, r1, r1, r0
320; CHECK-NEXT: vmov.32 q5[0], r0
321; CHECK-NEXT: vmov r0, s14
322; CHECK-NEXT: vmov.32 q5[1], r1
323; CHECK-NEXT: vmov r1, s18
324; CHECK-NEXT: umull r0, r1, r1, r0
325; CHECK-NEXT: vmov.32 q5[2], r0
326; CHECK-NEXT: vmov.32 q5[3], r1
327; CHECK-NEXT: vmov r0, s22
328; CHECK-NEXT: vmov r3, s20
329; CHECK-NEXT: vmov r2, s21
330; CHECK-NEXT: adds.w r12, r3, r0
331; CHECK-NEXT: vmov.u16 r3, q0[2]
332; CHECK-NEXT: adcs r1, r2
333; CHECK-NEXT: vmov.u16 r2, q1[2]
334; CHECK-NEXT: vmov.32 q3[0], r2
335; CHECK-NEXT: vmov.u16 r2, q1[3]
336; CHECK-NEXT: vmov.32 q4[0], r3
337; CHECK-NEXT: vmov.u16 r3, q0[3]
338; CHECK-NEXT: vmov.32 q3[2], r2
339; CHECK-NEXT: vmov.32 q4[2], r3
340; CHECK-NEXT: vand q3, q3, q2
341; CHECK-NEXT: vand q4, q4, q2
342; CHECK-NEXT: vmov r2, s12
343; CHECK-NEXT: vmov r3, s16
344; CHECK-NEXT: umull r2, r3, r3, r2
345; CHECK-NEXT: vmov.32 q5[0], r2
346; CHECK-NEXT: vmov r2, s14
347; CHECK-NEXT: vmov.32 q5[1], r3
348; CHECK-NEXT: vmov r3, s18
349; CHECK-NEXT: umull r2, r3, r3, r2
350; CHECK-NEXT: vmov.32 q5[2], r2
351; CHECK-NEXT: vmov.32 q5[3], r3
352; CHECK-NEXT: vmov r0, s20
353; CHECK-NEXT: vmov r2, s21
354; CHECK-NEXT: adds.w r0, r0, r12
355; CHECK-NEXT: adcs r1, r2
356; CHECK-NEXT: vmov r2, s22
357; CHECK-NEXT: adds.w r12, r0, r2
358; CHECK-NEXT: vmov.u16 r2, q1[4]
359; CHECK-NEXT: adcs r1, r3
360; CHECK-NEXT: vmov.u16 r3, q0[4]
361; CHECK-NEXT: vmov.32 q3[0], r2
362; CHECK-NEXT: vmov.u16 r2, q1[5]
363; CHECK-NEXT: vmov.32 q4[0], r3
364; CHECK-NEXT: vmov.u16 r3, q0[5]
365; CHECK-NEXT: vmov.32 q3[2], r2
366; CHECK-NEXT: vmov.32 q4[2], r3
367; CHECK-NEXT: vand q3, q3, q2
368; CHECK-NEXT: vand q4, q4, q2
369; CHECK-NEXT: vmov r2, s12
370; CHECK-NEXT: vmov r3, s16
371; CHECK-NEXT: umull r2, r3, r3, r2
372; CHECK-NEXT: vmov.32 q5[0], r2
373; CHECK-NEXT: vmov r2, s14
374; CHECK-NEXT: vmov.32 q5[1], r3
375; CHECK-NEXT: vmov r3, s18
376; CHECK-NEXT: umull r2, r3, r3, r2
377; CHECK-NEXT: vmov.32 q5[2], r2
378; CHECK-NEXT: vmov.32 q5[3], r3
379; CHECK-NEXT: vmov r0, s20
380; CHECK-NEXT: vmov r2, s21
381; CHECK-NEXT: adds.w r0, r0, r12
382; CHECK-NEXT: adcs r1, r2
383; CHECK-NEXT: vmov r2, s22
384; CHECK-NEXT: adds r0, r0, r2
385; CHECK-NEXT: vmov.u16 r2, q1[6]
386; CHECK-NEXT: vmov.32 q3[0], r2
387; CHECK-NEXT: vmov.u16 r2, q1[7]
388; CHECK-NEXT: adcs r1, r3
389; CHECK-NEXT: vmov.32 q3[2], r2
390; CHECK-NEXT: vmov.u16 r3, q0[6]
391; CHECK-NEXT: vand q1, q3, q2
392; CHECK-NEXT: vmov.32 q3[0], r3
393; CHECK-NEXT: vmov.u16 r3, q0[7]
394; CHECK-NEXT: vmov.32 q3[2], r3
395; CHECK-NEXT: vmov r2, s4
396; CHECK-NEXT: vand q0, q3, q2
397; CHECK-NEXT: vmov r3, s0
398; CHECK-NEXT: umlal r0, r1, r3, r2
399; CHECK-NEXT: vmov r2, s6
400; CHECK-NEXT: vmov r3, s2
401; CHECK-NEXT: umlal r0, r1, r3, r2
402; CHECK-NEXT: vpop {d8, d9, d10, d11}
403; CHECK-NEXT: bx lr
404; CHECK-NEXT: .p2align 4
405; CHECK-NEXT: @ %bb.1:
406; CHECK-NEXT: .LCPI10_0:
407; CHECK-NEXT: .long 65535 @ 0xffff
408; CHECK-NEXT: .long 0 @ 0x0
409; CHECK-NEXT: .long 65535 @ 0xffff
410; CHECK-NEXT: .long 0 @ 0x0
411entry:
412 %xx = zext <8 x i16> %x to <8 x i64>
413 %yy = zext <8 x i16> %y to <8 x i64>
414 %m = mul <8 x i64> %xx, %yy
415 %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
416 ret i64 %z
417}
418
; Test: sign-extend both <8 x i16> inputs to <8 x i64>, multiply them and
; add-reduce the eight 64-bit products to an i64.
; Expected assembly is fully scalarised: every lane is extracted with
; vmov.u16 and sign-extended in a core register with sxth. Lanes 0-5 are
; multiplied with smull and summed with adds/adc/adcs (r12 is used as a
; temporary), while lanes 6 and 7 are folded in with smlal into r0/r1. Only
; q2 is used as vector scratch, so no vpush/vpop is expected.
419define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
420; CHECK-LABEL: add_v8i16_v8i64_sext:
421; CHECK: @ %bb.0: @ %entry
422; CHECK-NEXT: vmov.u16 r0, q1[0]
423; CHECK-NEXT: vmov.u16 r1, q0[0]
424; CHECK-NEXT: sxth r0, r0
425; CHECK-NEXT: sxth r1, r1
426; CHECK-NEXT: smull r0, r1, r1, r0
427; CHECK-NEXT: vmov.32 q2[0], r0
428; CHECK-NEXT: vmov.u16 r0, q1[1]
429; CHECK-NEXT: vmov.32 q2[1], r1
430; CHECK-NEXT: vmov.u16 r1, q0[1]
431; CHECK-NEXT: sxth r0, r0
432; CHECK-NEXT: sxth r1, r1
433; CHECK-NEXT: smull r0, r1, r1, r0
434; CHECK-NEXT: vmov.32 q2[2], r0
435; CHECK-NEXT: vmov.32 q2[3], r1
436; CHECK-NEXT: vmov r2, s10
437; CHECK-NEXT: vmov r3, s8
438; CHECK-NEXT: vmov r0, s9
439; CHECK-NEXT: adds r2, r2, r3
440; CHECK-NEXT: vmov.u16 r3, q0[2]
441; CHECK-NEXT: adc.w r12, r0, r1
442; CHECK-NEXT: vmov.u16 r1, q1[2]
443; CHECK-NEXT: sxth r1, r1
444; CHECK-NEXT: sxth r3, r3
445; CHECK-NEXT: smull r1, r3, r3, r1
446; CHECK-NEXT: vmov.32 q2[0], r1
447; CHECK-NEXT: vmov.u16 r1, q1[3]
448; CHECK-NEXT: vmov.32 q2[1], r3
449; CHECK-NEXT: vmov.u16 r3, q0[3]
450; CHECK-NEXT: sxth r1, r1
451; CHECK-NEXT: sxth r3, r3
452; CHECK-NEXT: smull r1, r3, r3, r1
453; CHECK-NEXT: vmov.32 q2[2], r1
454; CHECK-NEXT: vmov.32 q2[3], r3
455; CHECK-NEXT: vmov r0, s8
456; CHECK-NEXT: vmov r1, s9
457; CHECK-NEXT: adds r0, r0, r2
458; CHECK-NEXT: vmov r2, s10
459; CHECK-NEXT: adc.w r1, r1, r12
460; CHECK-NEXT: adds.w r12, r0, r2
461; CHECK-NEXT: vmov.u16 r2, q1[4]
462; CHECK-NEXT: adcs r1, r3
463; CHECK-NEXT: vmov.u16 r3, q0[4]
464; CHECK-NEXT: sxth r2, r2
465; CHECK-NEXT: sxth r3, r3
466; CHECK-NEXT: smull r2, r3, r3, r2
467; CHECK-NEXT: vmov.32 q2[0], r2
468; CHECK-NEXT: vmov.u16 r2, q1[5]
469; CHECK-NEXT: vmov.32 q2[1], r3
470; CHECK-NEXT: vmov.u16 r3, q0[5]
471; CHECK-NEXT: sxth r2, r2
472; CHECK-NEXT: sxth r3, r3
473; CHECK-NEXT: smull r2, r3, r3, r2
474; CHECK-NEXT: vmov.32 q2[2], r2
475; CHECK-NEXT: vmov.32 q2[3], r3
476; CHECK-NEXT: vmov r0, s8
477; CHECK-NEXT: vmov r2, s9
478; CHECK-NEXT: adds.w r0, r0, r12
479; CHECK-NEXT: adcs r1, r2
480; CHECK-NEXT: vmov r2, s10
481; CHECK-NEXT: adds r0, r0, r2
482; CHECK-NEXT: vmov.u16 r2, q1[6]
483; CHECK-NEXT: adcs r1, r3
484; CHECK-NEXT: vmov.u16 r3, q0[6]
485; CHECK-NEXT: sxth r2, r2
486; CHECK-NEXT: sxth r3, r3
487; CHECK-NEXT: smlal r0, r1, r3, r2
488; CHECK-NEXT: vmov.u16 r2, q1[7]
489; CHECK-NEXT: vmov.u16 r3, q0[7]
490; CHECK-NEXT: sxth r2, r2
491; CHECK-NEXT: sxth r3, r3
492; CHECK-NEXT: smlal r0, r1, r3, r2
493; CHECK-NEXT: bx lr
494entry:
495 %xx = sext <8 x i16> %x to <8 x i64>
496 %yy = sext <8 x i16> %y to <8 x i64>
497 %m = mul <8 x i64> %xx, %yy
498 %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
499 ret i64 %z
500}
501
; Test: zero-extend both <2 x i16> inputs to <2 x i64>, multiply them and
; add-reduce the two products to an i64.
; Expected assembly: both operands are masked with the constant-pool vector
; .LCPI12_0 (0xffff, 0, 0xffff, 0) via vand, the lanes are read from s0/s2 and
; s4/s6, and the sum is formed with umull followed by umlal into r0/r1.
502define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
503; CHECK-LABEL: add_v2i16_v2i64_zext:
504; CHECK: @ %bb.0: @ %entry
505; CHECK-NEXT: adr r0, .LCPI12_0
506; CHECK-NEXT: vldrw.u32 q2, [r0]
507; CHECK-NEXT: vand q1, q1, q2
508; CHECK-NEXT: vand q0, q0, q2
509; CHECK-NEXT: vmov r0, s4
510; CHECK-NEXT: vmov r1, s0
511; CHECK-NEXT: vmov r2, s6
512; CHECK-NEXT: vmov r3, s2
513; CHECK-NEXT: umull r0, r1, r1, r0
514; CHECK-NEXT: umlal r0, r1, r3, r2
515; CHECK-NEXT: bx lr
516; CHECK-NEXT: .p2align 4
517; CHECK-NEXT: @ %bb.1:
518; CHECK-NEXT: .LCPI12_0:
519; CHECK-NEXT: .long 65535 @ 0xffff
520; CHECK-NEXT: .long 0 @ 0x0
521; CHECK-NEXT: .long 65535 @ 0xffff
522; CHECK-NEXT: .long 0 @ 0x0
523entry:
524 %xx = zext <2 x i16> %x to <2 x i64>
525 %yy = zext <2 x i16> %y to <2 x i64>
526 %m = mul <2 x i64> %xx, %yy
527 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
528 ret i64 %z
529}
530
; Test: sign-extend both <2 x i16> inputs to <2 x i64>, multiply them and
; add-reduce the two products to an i64.
; Expected assembly: the lanes are read from s0/s2 and s4/s6 into core
; registers, each is sign-extended with sxth, and the sum is formed with smull
; followed by smlal into r0/r1. No mask constant is loaded.
531define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
532; CHECK-LABEL: add_v2i16_v2i64_sext:
533; CHECK: @ %bb.0: @ %entry
534; CHECK-NEXT: vmov r0, s4
535; CHECK-NEXT: vmov r1, s0
536; CHECK-NEXT: vmov r2, s6
537; CHECK-NEXT: vmov r3, s2
538; CHECK-NEXT: sxth r0, r0
539; CHECK-NEXT: sxth r1, r1
540; CHECK-NEXT: smull r0, r1, r1, r0
541; CHECK-NEXT: sxth r2, r2
542; CHECK-NEXT: sxth r3, r3
543; CHECK-NEXT: smlal r0, r1, r3, r2
544; CHECK-NEXT: bx lr
545entry:
546 %xx = sext <2 x i16> %x to <2 x i64>
547 %yy = sext <2 x i16> %y to <2 x i64>
548 %m = mul <2 x i64> %xx, %yy
549 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
550 ret i64 %z
551}
552
; Test: zero-extend both <16 x i8> inputs to <16 x i32>, multiply them and
; add-reduce the sixteen products to an i32.
; Expected assembly: the byte lanes are handled in four groups of four
; (12-15, 4-7, 8-11, 0-3). Each group is moved lane by lane (vmov.u8 /
; vmov.32) into a 32-bit-lane vector, masked with the #0xff splat held in q2
; via vand, and multiplied with vmul.i32. The four partial products are
; combined with vadd.i32 and reduced with one vaddv.u32. q4 and q5 (d8-d11)
; are used as scratch and saved/restored with vpush/vpop.
553define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
554; CHECK-LABEL: add_v16i8_v16i32_zext:
555; CHECK: @ %bb.0: @ %entry
556; CHECK-NEXT: .vsave {d8, d9, d10, d11}
557; CHECK-NEXT: vpush {d8, d9, d10, d11}
558; CHECK-NEXT: vmov.u8 r0, q1[12]
559; CHECK-NEXT: vmov.i32 q2, #0xff
560; CHECK-NEXT: vmov.32 q3[0], r0
561; CHECK-NEXT: vmov.u8 r0, q1[13]
562; CHECK-NEXT: vmov.32 q3[1], r0
563; CHECK-NEXT: vmov.u8 r0, q1[14]
564; CHECK-NEXT: vmov.32 q3[2], r0
565; CHECK-NEXT: vmov.u8 r0, q1[15]
566; CHECK-NEXT: vmov.32 q3[3], r0
567; CHECK-NEXT: vmov.u8 r0, q0[12]
568; CHECK-NEXT: vmov.32 q4[0], r0
569; CHECK-NEXT: vmov.u8 r0, q0[13]
570; CHECK-NEXT: vmov.32 q4[1], r0
571; CHECK-NEXT: vmov.u8 r0, q0[14]
572; CHECK-NEXT: vmov.32 q4[2], r0
573; CHECK-NEXT: vmov.u8 r0, q0[15]
574; CHECK-NEXT: vmov.32 q4[3], r0
575; CHECK-NEXT: vand q3, q3, q2
576; CHECK-NEXT: vand q4, q4, q2
577; CHECK-NEXT: vmov.u8 r0, q1[4]
578; CHECK-NEXT: vmul.i32 q3, q4, q3
579; CHECK-NEXT: vmov.32 q4[0], r0
580; CHECK-NEXT: vmov.u8 r0, q1[5]
581; CHECK-NEXT: vmov.32 q4[1], r0
582; CHECK-NEXT: vmov.u8 r0, q1[6]
583; CHECK-NEXT: vmov.32 q4[2], r0
584; CHECK-NEXT: vmov.u8 r0, q1[7]
585; CHECK-NEXT: vmov.32 q4[3], r0
586; CHECK-NEXT: vmov.u8 r0, q0[4]
587; CHECK-NEXT: vmov.32 q5[0], r0
588; CHECK-NEXT: vmov.u8 r0, q0[5]
589; CHECK-NEXT: vmov.32 q5[1], r0
590; CHECK-NEXT: vmov.u8 r0, q0[6]
591; CHECK-NEXT: vmov.32 q5[2], r0
592; CHECK-NEXT: vmov.u8 r0, q0[7]
593; CHECK-NEXT: vmov.32 q5[3], r0
594; CHECK-NEXT: vand q4, q4, q2
595; CHECK-NEXT: vand q5, q5, q2
596; CHECK-NEXT: vmov.u8 r0, q1[8]
597; CHECK-NEXT: vmul.i32 q4, q5, q4
598; CHECK-NEXT: vadd.i32 q3, q4, q3
599; CHECK-NEXT: vmov.32 q4[0], r0
600; CHECK-NEXT: vmov.u8 r0, q1[9]
601; CHECK-NEXT: vmov.32 q4[1], r0
602; CHECK-NEXT: vmov.u8 r0, q1[10]
603; CHECK-NEXT: vmov.32 q4[2], r0
604; CHECK-NEXT: vmov.u8 r0, q1[11]
605; CHECK-NEXT: vmov.32 q4[3], r0
606; CHECK-NEXT: vmov.u8 r0, q0[8]
607; CHECK-NEXT: vmov.32 q5[0], r0
608; CHECK-NEXT: vmov.u8 r0, q0[9]
609; CHECK-NEXT: vmov.32 q5[1], r0
610; CHECK-NEXT: vmov.u8 r0, q0[10]
611; CHECK-NEXT: vmov.32 q5[2], r0
612; CHECK-NEXT: vmov.u8 r0, q0[11]
613; CHECK-NEXT: vmov.32 q5[3], r0
614; CHECK-NEXT: vand q4, q4, q2
615; CHECK-NEXT: vand q5, q5, q2
616; CHECK-NEXT: vmov.u8 r0, q1[0]
617; CHECK-NEXT: vmul.i32 q4, q5, q4
618; CHECK-NEXT: vmov.32 q5[0], r0
619; CHECK-NEXT: vmov.u8 r0, q1[1]
620; CHECK-NEXT: vmov.32 q5[1], r0
621; CHECK-NEXT: vmov.u8 r0, q1[2]
622; CHECK-NEXT: vmov.32 q5[2], r0
623; CHECK-NEXT: vmov.u8 r0, q1[3]
624; CHECK-NEXT: vmov.32 q5[3], r0
625; CHECK-NEXT: vmov.u8 r0, q0[0]
626; CHECK-NEXT: vand q1, q5, q2
627; CHECK-NEXT: vmov.32 q5[0], r0
628; CHECK-NEXT: vmov.u8 r0, q0[1]
629; CHECK-NEXT: vmov.32 q5[1], r0
630; CHECK-NEXT: vmov.u8 r0, q0[2]
631; CHECK-NEXT: vmov.32 q5[2], r0
632; CHECK-NEXT: vmov.u8 r0, q0[3]
633; CHECK-NEXT: vmov.32 q5[3], r0
634; CHECK-NEXT: vand q0, q5, q2
635; CHECK-NEXT: vmul.i32 q0, q0, q1
636; CHECK-NEXT: vadd.i32 q0, q0, q4
637; CHECK-NEXT: vadd.i32 q0, q0, q3
638; CHECK-NEXT: vaddv.u32 r0, q0
639; CHECK-NEXT: vpop {d8, d9, d10, d11}
640; CHECK-NEXT: bx lr
641entry:
642 %xx = zext <16 x i8> %x to <16 x i32>
643 %yy = zext <16 x i8> %y to <16 x i32>
644 %m = mul <16 x i32> %xx, %yy
645 %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
646 ret i32 %z
647}
648
; Test: sign-extend both <16 x i8> inputs to <16 x i32>, multiply them and
; add-reduce the sixteen products to an i32.
; Expected assembly: the byte lanes are handled in four groups of four
; (12-15, 4-7, 8-11, 0-3). Each group is moved lane by lane (vmov.u8 /
; vmov.32) into a 32-bit-lane vector and sign-extended in two steps with
; vmovlb.s8 then vmovlb.s16 before the vmul.i32. The four partial products are
; combined with vadd.i32 and reduced with one vaddv.u32. q4 (d8, d9) is used
; as scratch and saved/restored with vpush/vpop.
649define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
650; CHECK-LABEL: add_v16i8_v16i32_sext:
651; CHECK: @ %bb.0: @ %entry
652; CHECK-NEXT: .vsave {d8, d9}
653; CHECK-NEXT: vpush {d8, d9}
654; CHECK-NEXT: vmov.u8 r0, q1[12]
655; CHECK-NEXT: vmov.32 q2[0], r0
656; CHECK-NEXT: vmov.u8 r0, q1[13]
657; CHECK-NEXT: vmov.32 q2[1], r0
658; CHECK-NEXT: vmov.u8 r0, q1[14]
659; CHECK-NEXT: vmov.32 q2[2], r0
660; CHECK-NEXT: vmov.u8 r0, q1[15]
661; CHECK-NEXT: vmov.32 q2[3], r0
662; CHECK-NEXT: vmov.u8 r0, q0[12]
663; CHECK-NEXT: vmov.32 q3[0], r0
664; CHECK-NEXT: vmov.u8 r0, q0[13]
665; CHECK-NEXT: vmov.32 q3[1], r0
666; CHECK-NEXT: vmov.u8 r0, q0[14]
667; CHECK-NEXT: vmov.32 q3[2], r0
668; CHECK-NEXT: vmov.u8 r0, q0[15]
669; CHECK-NEXT: vmov.32 q3[3], r0
670; CHECK-NEXT: vmovlb.s8 q2, q2
671; CHECK-NEXT: vmovlb.s8 q3, q3
672; CHECK-NEXT: vmovlb.s16 q2, q2
673; CHECK-NEXT: vmovlb.s16 q3, q3
674; CHECK-NEXT: vmov.u8 r0, q1[4]
675; CHECK-NEXT: vmul.i32 q2, q3, q2
676; CHECK-NEXT: vmov.32 q3[0], r0
677; CHECK-NEXT: vmov.u8 r0, q1[5]
678; CHECK-NEXT: vmov.32 q3[1], r0
679; CHECK-NEXT: vmov.u8 r0, q1[6]
680; CHECK-NEXT: vmov.32 q3[2], r0
681; CHECK-NEXT: vmov.u8 r0, q1[7]
682; CHECK-NEXT: vmov.32 q3[3], r0
683; CHECK-NEXT: vmov.u8 r0, q0[4]
684; CHECK-NEXT: vmov.32 q4[0], r0
685; CHECK-NEXT: vmov.u8 r0, q0[5]
686; CHECK-NEXT: vmov.32 q4[1], r0
687; CHECK-NEXT: vmov.u8 r0, q0[6]
688; CHECK-NEXT: vmov.32 q4[2], r0
689; CHECK-NEXT: vmov.u8 r0, q0[7]
690; CHECK-NEXT: vmov.32 q4[3], r0
691; CHECK-NEXT: vmovlb.s8 q3, q3
692; CHECK-NEXT: vmovlb.s8 q4, q4
693; CHECK-NEXT: vmovlb.s16 q3, q3
694; CHECK-NEXT: vmovlb.s16 q4, q4
695; CHECK-NEXT: vmov.u8 r0, q1[8]
696; CHECK-NEXT: vmul.i32 q3, q4, q3
697; CHECK-NEXT: vadd.i32 q2, q3, q2
698; CHECK-NEXT: vmov.32 q3[0], r0
699; CHECK-NEXT: vmov.u8 r0, q1[9]
700; CHECK-NEXT: vmov.32 q3[1], r0
701; CHECK-NEXT: vmov.u8 r0, q1[10]
702; CHECK-NEXT: vmov.32 q3[2], r0
703; CHECK-NEXT: vmov.u8 r0, q1[11]
704; CHECK-NEXT: vmov.32 q3[3], r0
705; CHECK-NEXT: vmov.u8 r0, q0[8]
706; CHECK-NEXT: vmov.32 q4[0], r0
707; CHECK-NEXT: vmov.u8 r0, q0[9]
708; CHECK-NEXT: vmov.32 q4[1], r0
709; CHECK-NEXT: vmov.u8 r0, q0[10]
710; CHECK-NEXT: vmov.32 q4[2], r0
711; CHECK-NEXT: vmov.u8 r0, q0[11]
712; CHECK-NEXT: vmov.32 q4[3], r0
713; CHECK-NEXT: vmovlb.s8 q3, q3
714; CHECK-NEXT: vmovlb.s8 q4, q4
715; CHECK-NEXT: vmovlb.s16 q3, q3
716; CHECK-NEXT: vmovlb.s16 q4, q4
717; CHECK-NEXT: vmov.u8 r0, q1[0]
718; CHECK-NEXT: vmul.i32 q3, q4, q3
719; CHECK-NEXT: vmov.32 q4[0], r0
720; CHECK-NEXT: vmov.u8 r0, q1[1]
721; CHECK-NEXT: vmov.32 q4[1], r0
722; CHECK-NEXT: vmov.u8 r0, q1[2]
723; CHECK-NEXT: vmov.32 q4[2], r0
724; CHECK-NEXT: vmov.u8 r0, q1[3]
725; CHECK-NEXT: vmov.32 q4[3], r0
726; CHECK-NEXT: vmov.u8 r0, q0[0]
727; CHECK-NEXT: vmovlb.s8 q1, q4
728; CHECK-NEXT: vmov.32 q4[0], r0
729; CHECK-NEXT: vmov.u8 r0, q0[1]
730; CHECK-NEXT: vmovlb.s16 q1, q1
731; CHECK-NEXT: vmov.32 q4[1], r0
732; CHECK-NEXT: vmov.u8 r0, q0[2]
733; CHECK-NEXT: vmov.32 q4[2], r0
734; CHECK-NEXT: vmov.u8 r0, q0[3]
735; CHECK-NEXT: vmov.32 q4[3], r0
736; CHECK-NEXT: vmovlb.s8 q0, q4
737; CHECK-NEXT: vmovlb.s16 q0, q0
738; CHECK-NEXT: vmul.i32 q0, q0, q1
739; CHECK-NEXT: vadd.i32 q0, q0, q3
740; CHECK-NEXT: vadd.i32 q0, q0, q2
741; CHECK-NEXT: vaddv.u32 r0, q0
742; CHECK-NEXT: vpop {d8, d9}
743; CHECK-NEXT: bx lr
744entry:
745 %xx = sext <16 x i8> %x to <16 x i32>
746 %yy = sext <16 x i8> %y to <16 x i32>
747 %m = mul <16 x i32> %xx, %yy
748 %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
749 ret i32 %z
750}
751
; Test: zero-extend both <4 x i8> inputs to <4 x i32>, multiply them and
; add-reduce the four products to an i32.
; Expected assembly: both operands are masked with a #0xff splat (vmov.i32 +
; vand), then multiplied with vmul.i32 and reduced with vaddv.u32.
752define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
753; CHECK-LABEL: add_v4i8_v4i32_zext:
754; CHECK: @ %bb.0: @ %entry
755; CHECK-NEXT: vmov.i32 q2, #0xff
756; CHECK-NEXT: vand q1, q1, q2
757; CHECK-NEXT: vand q0, q0, q2
758; CHECK-NEXT: vmul.i32 q0, q0, q1
759; CHECK-NEXT: vaddv.u32 r0, q0
760; CHECK-NEXT: bx lr
761entry:
762 %xx = zext <4 x i8> %x to <4 x i32>
763 %yy = zext <4 x i8> %y to <4 x i32>
764 %m = mul <4 x i32> %xx, %yy
765 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
766 ret i32 %z
767}
768
; Test: sign-extend both <4 x i8> inputs to <4 x i32>, multiply them and
; add-reduce the four products to an i32.
; Expected assembly: both operands are sign-extended in two steps (vmovlb.s8
; then vmovlb.s16), multiplied with vmul.i32 and reduced with vaddv.u32.
769define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
770; CHECK-LABEL: add_v4i8_v4i32_sext:
771; CHECK: @ %bb.0: @ %entry
772; CHECK-NEXT: vmovlb.s8 q1, q1
773; CHECK-NEXT: vmovlb.s8 q0, q0
774; CHECK-NEXT: vmovlb.s16 q1, q1
775; CHECK-NEXT: vmovlb.s16 q0, q0
776; CHECK-NEXT: vmul.i32 q0, q0, q1
777; CHECK-NEXT: vaddv.u32 r0, q0
778; CHECK-NEXT: bx lr
779entry:
780 %xx = sext <4 x i8> %x to <4 x i32>
781 %yy = sext <4 x i8> %y to <4 x i32>
782 %m = mul <4 x i32> %xx, %yy
783 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
784 ret i32 %z
785}
786
; Test: zero-extend both <16 x i8> inputs to <16 x i16>, multiply them and
; add-reduce the sixteen products to an i16.
; Expected assembly: each input is split into byte lanes 8-15 and 0-7, which
; are moved lane by lane (vmov.u8 / vmov.16) into 16-bit-lane vectors and
; widened with vmovlb.u8. Each half is multiplied with vmul.i16, the two
; partial products are combined with vadd.i16 and reduced with vaddv.u16; the
; trailing uxth zero-extends the result for the zeroext return value.
787define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
788; CHECK-LABEL: add_v16i8_v16i16_zext:
789; CHECK: @ %bb.0: @ %entry
790; CHECK-NEXT: vmov.u8 r0, q1[8]
791; CHECK-NEXT: vmov.16 q2[0], r0
792; CHECK-NEXT: vmov.u8 r0, q1[9]
793; CHECK-NEXT: vmov.16 q2[1], r0
794; CHECK-NEXT: vmov.u8 r0, q1[10]
795; CHECK-NEXT: vmov.16 q2[2], r0
796; CHECK-NEXT: vmov.u8 r0, q1[11]
797; CHECK-NEXT: vmov.16 q2[3], r0
798; CHECK-NEXT: vmov.u8 r0, q1[12]
799; CHECK-NEXT: vmov.16 q2[4], r0
800; CHECK-NEXT: vmov.u8 r0, q1[13]
801; CHECK-NEXT: vmov.16 q2[5], r0
802; CHECK-NEXT: vmov.u8 r0, q1[14]
803; CHECK-NEXT: vmov.16 q2[6], r0
804; CHECK-NEXT: vmov.u8 r0, q1[15]
805; CHECK-NEXT: vmov.16 q2[7], r0
806; CHECK-NEXT: vmov.u8 r0, q0[8]
807; CHECK-NEXT: vmov.16 q3[0], r0
808; CHECK-NEXT: vmov.u8 r0, q0[9]
809; CHECK-NEXT: vmov.16 q3[1], r0
810; CHECK-NEXT: vmov.u8 r0, q0[10]
811; CHECK-NEXT: vmov.16 q3[2], r0
812; CHECK-NEXT: vmov.u8 r0, q0[11]
813; CHECK-NEXT: vmov.16 q3[3], r0
814; CHECK-NEXT: vmov.u8 r0, q0[12]
815; CHECK-NEXT: vmov.16 q3[4], r0
816; CHECK-NEXT: vmov.u8 r0, q0[13]
817; CHECK-NEXT: vmov.16 q3[5], r0
818; CHECK-NEXT: vmov.u8 r0, q0[14]
819; CHECK-NEXT: vmov.16 q3[6], r0
820; CHECK-NEXT: vmov.u8 r0, q0[15]
821; CHECK-NEXT: vmov.16 q3[7], r0
822; CHECK-NEXT: vmovlb.u8 q2, q2
823; CHECK-NEXT: vmovlb.u8 q3, q3
824; CHECK-NEXT: vmov.u8 r0, q1[0]
825; CHECK-NEXT: vmul.i16 q2, q3, q2
826; CHECK-NEXT: vmov.16 q3[0], r0
827; CHECK-NEXT: vmov.u8 r0, q1[1]
828; CHECK-NEXT: vmov.16 q3[1], r0
829; CHECK-NEXT: vmov.u8 r0, q1[2]
830; CHECK-NEXT: vmov.16 q3[2], r0
831; CHECK-NEXT: vmov.u8 r0, q1[3]
832; CHECK-NEXT: vmov.16 q3[3], r0
833; CHECK-NEXT: vmov.u8 r0, q1[4]
834; CHECK-NEXT: vmov.16 q3[4], r0
835; CHECK-NEXT: vmov.u8 r0, q1[5]
836; CHECK-NEXT: vmov.16 q3[5], r0
837; CHECK-NEXT: vmov.u8 r0, q1[6]
838; CHECK-NEXT: vmov.16 q3[6], r0
839; CHECK-NEXT: vmov.u8 r0, q1[7]
840; CHECK-NEXT: vmov.16 q3[7], r0
841; CHECK-NEXT: vmov.u8 r0, q0[0]
842; CHECK-NEXT: vmovlb.u8 q1, q3
843; CHECK-NEXT: vmov.16 q3[0], r0
844; CHECK-NEXT: vmov.u8 r0, q0[1]
845; CHECK-NEXT: vmov.16 q3[1], r0
846; CHECK-NEXT: vmov.u8 r0, q0[2]
847; CHECK-NEXT: vmov.16 q3[2], r0
848; CHECK-NEXT: vmov.u8 r0, q0[3]
849; CHECK-NEXT: vmov.16 q3[3], r0
850; CHECK-NEXT: vmov.u8 r0, q0[4]
851; CHECK-NEXT: vmov.16 q3[4], r0
852; CHECK-NEXT: vmov.u8 r0, q0[5]
853; CHECK-NEXT: vmov.16 q3[5], r0
854; CHECK-NEXT: vmov.u8 r0, q0[6]
855; CHECK-NEXT: vmov.16 q3[6], r0
856; CHECK-NEXT: vmov.u8 r0, q0[7]
857; CHECK-NEXT: vmov.16 q3[7], r0
858; CHECK-NEXT: vmovlb.u8 q0, q3
859; CHECK-NEXT: vmul.i16 q0, q0, q1
860; CHECK-NEXT: vadd.i16 q0, q0, q2
861; CHECK-NEXT: vaddv.u16 r0, q0
862; CHECK-NEXT: uxth r0, r0
863; CHECK-NEXT: bx lr
864entry:
865 %xx = zext <16 x i8> %x to <16 x i16>
866 %yy = zext <16 x i8> %y to <16 x i16>
867 %m = mul <16 x i16> %xx, %yy
868 %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
869 ret i16 %z
870}
871
; Test: sign-extend both <16 x i8> inputs to <16 x i16>, multiply them and
; add-reduce the sixteen products to an i16.
; Expected assembly: identical in shape to the zero-extending variant (byte
; lanes 8-15 and 0-7 rebuilt with vmov.u8 / vmov.16, two vmul.i16, one
; vadd.i16, one vaddv.u16), except that the widening uses vmovlb.s8 and the
; result is sign-extended with sxth for the signext return value.
872define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
873; CHECK-LABEL: add_v16i8_v16i16_sext:
874; CHECK: @ %bb.0: @ %entry
875; CHECK-NEXT: vmov.u8 r0, q1[8]
876; CHECK-NEXT: vmov.16 q2[0], r0
877; CHECK-NEXT: vmov.u8 r0, q1[9]
878; CHECK-NEXT: vmov.16 q2[1], r0
879; CHECK-NEXT: vmov.u8 r0, q1[10]
880; CHECK-NEXT: vmov.16 q2[2], r0
881; CHECK-NEXT: vmov.u8 r0, q1[11]
882; CHECK-NEXT: vmov.16 q2[3], r0
883; CHECK-NEXT: vmov.u8 r0, q1[12]
884; CHECK-NEXT: vmov.16 q2[4], r0
885; CHECK-NEXT: vmov.u8 r0, q1[13]
886; CHECK-NEXT: vmov.16 q2[5], r0
887; CHECK-NEXT: vmov.u8 r0, q1[14]
888; CHECK-NEXT: vmov.16 q2[6], r0
889; CHECK-NEXT: vmov.u8 r0, q1[15]
890; CHECK-NEXT: vmov.16 q2[7], r0
891; CHECK-NEXT: vmov.u8 r0, q0[8]
892; CHECK-NEXT: vmov.16 q3[0], r0
893; CHECK-NEXT: vmov.u8 r0, q0[9]
894; CHECK-NEXT: vmov.16 q3[1], r0
895; CHECK-NEXT: vmov.u8 r0, q0[10]
896; CHECK-NEXT: vmov.16 q3[2], r0
897; CHECK-NEXT: vmov.u8 r0, q0[11]
898; CHECK-NEXT: vmov.16 q3[3], r0
899; CHECK-NEXT: vmov.u8 r0, q0[12]
900; CHECK-NEXT: vmov.16 q3[4], r0
901; CHECK-NEXT: vmov.u8 r0, q0[13]
902; CHECK-NEXT: vmov.16 q3[5], r0
903; CHECK-NEXT: vmov.u8 r0, q0[14]
904; CHECK-NEXT: vmov.16 q3[6], r0
905; CHECK-NEXT: vmov.u8 r0, q0[15]
906; CHECK-NEXT: vmov.16 q3[7], r0
907; CHECK-NEXT: vmovlb.s8 q2, q2
908; CHECK-NEXT: vmovlb.s8 q3, q3
909; CHECK-NEXT: vmov.u8 r0, q1[0]
910; CHECK-NEXT: vmul.i16 q2, q3, q2
911; CHECK-NEXT: vmov.16 q3[0], r0
912; CHECK-NEXT: vmov.u8 r0, q1[1]
913; CHECK-NEXT: vmov.16 q3[1], r0
914; CHECK-NEXT: vmov.u8 r0, q1[2]
915; CHECK-NEXT: vmov.16 q3[2], r0
916; CHECK-NEXT: vmov.u8 r0, q1[3]
917; CHECK-NEXT: vmov.16 q3[3], r0
918; CHECK-NEXT: vmov.u8 r0, q1[4]
919; CHECK-NEXT: vmov.16 q3[4], r0
920; CHECK-NEXT: vmov.u8 r0, q1[5]
921; CHECK-NEXT: vmov.16 q3[5], r0
922; CHECK-NEXT: vmov.u8 r0, q1[6]
923; CHECK-NEXT: vmov.16 q3[6], r0
924; CHECK-NEXT: vmov.u8 r0, q1[7]
925; CHECK-NEXT: vmov.16 q3[7], r0
926; CHECK-NEXT: vmov.u8 r0, q0[0]
927; CHECK-NEXT: vmovlb.s8 q1, q3
928; CHECK-NEXT: vmov.16 q3[0], r0
929; CHECK-NEXT: vmov.u8 r0, q0[1]
930; CHECK-NEXT: vmov.16 q3[1], r0
931; CHECK-NEXT: vmov.u8 r0, q0[2]
932; CHECK-NEXT: vmov.16 q3[2], r0
933; CHECK-NEXT: vmov.u8 r0, q0[3]
934; CHECK-NEXT: vmov.16 q3[3], r0
935; CHECK-NEXT: vmov.u8 r0, q0[4]
936; CHECK-NEXT: vmov.16 q3[4], r0
937; CHECK-NEXT: vmov.u8 r0, q0[5]
938; CHECK-NEXT: vmov.16 q3[5], r0
939; CHECK-NEXT: vmov.u8 r0, q0[6]
940; CHECK-NEXT: vmov.16 q3[6], r0
941; CHECK-NEXT: vmov.u8 r0, q0[7]
942; CHECK-NEXT: vmov.16 q3[7], r0
943; CHECK-NEXT: vmovlb.s8 q0, q3
944; CHECK-NEXT: vmul.i16 q0, q0, q1
945; CHECK-NEXT: vadd.i16 q0, q0, q2
946; CHECK-NEXT: vaddv.u16 r0, q0
947; CHECK-NEXT: sxth r0, r0
948; CHECK-NEXT: bx lr
949entry:
950 %xx = sext <16 x i8> %x to <16 x i16>
951 %yy = sext <16 x i8> %y to <16 x i16>
952 %m = mul <16 x i16> %xx, %yy
953 %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
954 ret i16 %z
955}
956
; Multiply-reduce with zero-extension: both <8 x i8> operands are zext'd to
; <8 x i16>, multiplied element-wise and summed with vector.reduce.add.
; Expected assembly: vmovlb.u8 on each operand, one vmul.i16, one vaddv.u16,
; then uxth because the i16 result is returned zeroext.
957define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
958; CHECK-LABEL: add_v8i8_v8i16_zext:
959; CHECK: @ %bb.0: @ %entry
960; CHECK-NEXT: vmovlb.u8 q1, q1
961; CHECK-NEXT: vmovlb.u8 q0, q0
962; CHECK-NEXT: vmul.i16 q0, q0, q1
963; CHECK-NEXT: vaddv.u16 r0, q0
964; CHECK-NEXT: uxth r0, r0
965; CHECK-NEXT: bx lr
966entry:
967 %xx = zext <8 x i8> %x to <8 x i16>
968 %yy = zext <8 x i8> %y to <8 x i16>
969 %m = mul <8 x i16> %xx, %yy
970 %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
971 ret i16 %z
972}
973
; Multiply-reduce with sign-extension: both <8 x i8> operands are sext'd to
; <8 x i16>, multiplied element-wise and summed with vector.reduce.add.
; Expected assembly: vmovlb.s8 on each operand, one vmul.i16, one vaddv.u16,
; then sxth because the i16 result is returned signext.
974define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
975; CHECK-LABEL: add_v8i8_v8i16_sext:
976; CHECK: @ %bb.0: @ %entry
977; CHECK-NEXT: vmovlb.s8 q1, q1
978; CHECK-NEXT: vmovlb.s8 q0, q0
979; CHECK-NEXT: vmul.i16 q0, q0, q1
980; CHECK-NEXT: vaddv.u16 r0, q0
981; CHECK-NEXT: sxth r0, r0
982; CHECK-NEXT: bx lr
983entry:
984 %xx = sext <8 x i8> %x to <8 x i16>
985 %yy = sext <8 x i8> %y to <8 x i16>
986 %m = mul <8 x i16> %xx, %yy
987 %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
988 ret i16 %z
989}
990
; Multiply-reduce with no extension: <16 x i8> operands are multiplied and
; summed directly in i8.
; Expected assembly: vmul.i8, vaddv.u8, then uxtb because the i8 result is
; returned zeroext.
991define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
992; CHECK-LABEL: add_v16i8_v16i8:
993; CHECK: @ %bb.0: @ %entry
994; CHECK-NEXT: vmul.i8 q0, q0, q1
995; CHECK-NEXT: vaddv.u8 r0, q0
996; CHECK-NEXT: uxtb r0, r0
997; CHECK-NEXT: bx lr
998entry:
999 %m = mul <16 x i8> %x, %y
1000 %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %m)
1001 ret i8 %z
1002}
1003
; Multiply-reduce of <16 x i8> operands zero-extended all the way to i64.
; The expected assembly is fully scalarised: lanes are extracted two at a time
; with vmov.u8, placed in the low word of each 64-bit half of a q register,
; masked with the constant-pool vector .LCPI23_0 (0xff, 0, 0xff, 0), multiplied
; with umull and folded into a running 64-bit sum with adds/adc-style pairs.
; The first two products are combined with add/orr.w rather than adds/adc, and
; the last lane pair (14, 15) is accumulated straight into r0/r1 with umlal.
; The sequence spills d8-d11 and {r7, lr}.
1004define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
1005; CHECK-LABEL: add_v16i8_v16i64_zext:
1006; CHECK: @ %bb.0: @ %entry
1007; CHECK-NEXT: .save {r7, lr}
1008; CHECK-NEXT: push {r7, lr}
1009; CHECK-NEXT: .vsave {d8, d9, d10, d11}
1010; CHECK-NEXT: vpush {d8, d9, d10, d11}
1011; CHECK-NEXT: vmov.u8 r0, q1[0]
1012; CHECK-NEXT: vmov.u8 r1, q0[0]
1013; CHECK-NEXT: vmov.32 q3[0], r0
1014; CHECK-NEXT: vmov.u8 r0, q1[1]
1015; CHECK-NEXT: vmov.32 q3[2], r0
1016; CHECK-NEXT: adr r0, .LCPI23_0
1017; CHECK-NEXT: vldrw.u32 q2, [r0]
1018; CHECK-NEXT: vmov.32 q4[0], r1
1019; CHECK-NEXT: vmov.u8 r1, q0[1]
1020; CHECK-NEXT: vmov.32 q4[2], r1
1021; CHECK-NEXT: vand q3, q3, q2
1022; CHECK-NEXT: vand q4, q4, q2
1023; CHECK-NEXT: vmov r0, s14
1024; CHECK-NEXT: vmov r1, s18
1025; CHECK-NEXT: vmov r2, s12
1026; CHECK-NEXT: vmov r3, s16
1027; CHECK-NEXT: umull r12, r1, r1, r0
1028; CHECK-NEXT: vmov.u8 r0, q0[2]
1029; CHECK-NEXT: vmov.32 q4[0], r0
1030; CHECK-NEXT: vmov.u8 r0, q0[3]
1031; CHECK-NEXT: vmov.32 q4[2], r0
1032; CHECK-NEXT: umull r2, r3, r3, r2
1033; CHECK-NEXT: vand q4, q4, q2
1034; CHECK-NEXT: vmov r0, s16
1035; CHECK-NEXT: orr.w lr, r3, r1
1036; CHECK-NEXT: vmov.u8 r3, q1[2]
1037; CHECK-NEXT: vmov.32 q3[0], r3
1038; CHECK-NEXT: vmov.u8 r3, q1[3]
1039; CHECK-NEXT: vmov.32 q3[2], r3
1040; CHECK-NEXT: add r2, r12
1041; CHECK-NEXT: vand q3, q3, q2
1042; CHECK-NEXT: vmov r3, s12
1043; CHECK-NEXT: umull r0, r3, r0, r3
1044; CHECK-NEXT: vmov.32 q5[0], r0
1045; CHECK-NEXT: vmov r0, s14
1046; CHECK-NEXT: vmov.32 q5[1], r3
1047; CHECK-NEXT: vmov r3, s18
1048; CHECK-NEXT: umull r0, r3, r3, r0
1049; CHECK-NEXT: vmov.32 q5[2], r0
1050; CHECK-NEXT: vmov.32 q5[3], r3
1051; CHECK-NEXT: vmov r1, s20
1052; CHECK-NEXT: vmov r0, s21
1053; CHECK-NEXT: adds r1, r1, r2
1054; CHECK-NEXT: adc.w r2, lr, r0
1055; CHECK-NEXT: vmov r0, s22
1056; CHECK-NEXT: adds.w r12, r1, r0
1057; CHECK-NEXT: adc.w r1, r2, r3
1058; CHECK-NEXT: vmov.u8 r2, q1[4]
1059; CHECK-NEXT: vmov.u8 r3, q0[4]
1060; CHECK-NEXT: vmov.32 q3[0], r2
1061; CHECK-NEXT: vmov.u8 r2, q1[5]
1062; CHECK-NEXT: vmov.32 q4[0], r3
1063; CHECK-NEXT: vmov.u8 r3, q0[5]
1064; CHECK-NEXT: vmov.32 q3[2], r2
1065; CHECK-NEXT: vmov.32 q4[2], r3
1066; CHECK-NEXT: vand q3, q3, q2
1067; CHECK-NEXT: vand q4, q4, q2
1068; CHECK-NEXT: vmov r2, s12
1069; CHECK-NEXT: vmov r3, s16
1070; CHECK-NEXT: umull r2, r3, r3, r2
1071; CHECK-NEXT: vmov.32 q5[0], r2
1072; CHECK-NEXT: vmov r2, s14
1073; CHECK-NEXT: vmov.32 q5[1], r3
1074; CHECK-NEXT: vmov r3, s18
1075; CHECK-NEXT: umull r2, r3, r3, r2
1076; CHECK-NEXT: vmov.32 q5[2], r2
1077; CHECK-NEXT: vmov.32 q5[3], r3
1078; CHECK-NEXT: vmov r0, s20
1079; CHECK-NEXT: vmov r2, s21
1080; CHECK-NEXT: adds.w r0, r0, r12
1081; CHECK-NEXT: adcs r1, r2
1082; CHECK-NEXT: vmov r2, s22
1083; CHECK-NEXT: adds.w r12, r0, r2
1084; CHECK-NEXT: vmov.u8 r2, q1[6]
1085; CHECK-NEXT: adcs r1, r3
1086; CHECK-NEXT: vmov.u8 r3, q0[6]
1087; CHECK-NEXT: vmov.32 q3[0], r2
1088; CHECK-NEXT: vmov.u8 r2, q1[7]
1089; CHECK-NEXT: vmov.32 q4[0], r3
1090; CHECK-NEXT: vmov.u8 r3, q0[7]
1091; CHECK-NEXT: vmov.32 q3[2], r2
1092; CHECK-NEXT: vmov.32 q4[2], r3
1093; CHECK-NEXT: vand q3, q3, q2
1094; CHECK-NEXT: vand q4, q4, q2
1095; CHECK-NEXT: vmov r2, s12
1096; CHECK-NEXT: vmov r3, s16
1097; CHECK-NEXT: umull r2, r3, r3, r2
1098; CHECK-NEXT: vmov.32 q5[0], r2
1099; CHECK-NEXT: vmov r2, s14
1100; CHECK-NEXT: vmov.32 q5[1], r3
1101; CHECK-NEXT: vmov r3, s18
1102; CHECK-NEXT: umull r2, r3, r3, r2
1103; CHECK-NEXT: vmov.32 q5[2], r2
1104; CHECK-NEXT: vmov.32 q5[3], r3
1105; CHECK-NEXT: vmov r0, s20
1106; CHECK-NEXT: vmov r2, s21
1107; CHECK-NEXT: adds.w r0, r0, r12
1108; CHECK-NEXT: adcs r1, r2
1109; CHECK-NEXT: vmov r2, s22
1110; CHECK-NEXT: adds.w r12, r0, r2
1111; CHECK-NEXT: vmov.u8 r2, q1[8]
1112; CHECK-NEXT: adcs r1, r3
1113; CHECK-NEXT: vmov.u8 r3, q0[8]
1114; CHECK-NEXT: vmov.32 q3[0], r2
1115; CHECK-NEXT: vmov.u8 r2, q1[9]
1116; CHECK-NEXT: vmov.32 q4[0], r3
1117; CHECK-NEXT: vmov.u8 r3, q0[9]
1118; CHECK-NEXT: vmov.32 q3[2], r2
1119; CHECK-NEXT: vmov.32 q4[2], r3
1120; CHECK-NEXT: vand q3, q3, q2
1121; CHECK-NEXT: vand q4, q4, q2
1122; CHECK-NEXT: vmov r2, s12
1123; CHECK-NEXT: vmov r3, s16
1124; CHECK-NEXT: umull r2, r3, r3, r2
1125; CHECK-NEXT: vmov.32 q5[0], r2
1126; CHECK-NEXT: vmov r2, s14
1127; CHECK-NEXT: vmov.32 q5[1], r3
1128; CHECK-NEXT: vmov r3, s18
1129; CHECK-NEXT: umull r2, r3, r3, r2
1130; CHECK-NEXT: vmov.32 q5[2], r2
1131; CHECK-NEXT: vmov.32 q5[3], r3
1132; CHECK-NEXT: vmov r0, s20
1133; CHECK-NEXT: vmov r2, s21
1134; CHECK-NEXT: adds.w r0, r0, r12
1135; CHECK-NEXT: adcs r1, r2
1136; CHECK-NEXT: vmov r2, s22
1137; CHECK-NEXT: adds.w r12, r0, r2
1138; CHECK-NEXT: vmov.u8 r2, q1[10]
1139; CHECK-NEXT: adcs r1, r3
1140; CHECK-NEXT: vmov.u8 r3, q0[10]
1141; CHECK-NEXT: vmov.32 q3[0], r2
1142; CHECK-NEXT: vmov.u8 r2, q1[11]
1143; CHECK-NEXT: vmov.32 q4[0], r3
1144; CHECK-NEXT: vmov.u8 r3, q0[11]
1145; CHECK-NEXT: vmov.32 q3[2], r2
1146; CHECK-NEXT: vmov.32 q4[2], r3
1147; CHECK-NEXT: vand q3, q3, q2
1148; CHECK-NEXT: vand q4, q4, q2
1149; CHECK-NEXT: vmov r2, s12
1150; CHECK-NEXT: vmov r3, s16
1151; CHECK-NEXT: umull r2, r3, r3, r2
1152; CHECK-NEXT: vmov.32 q5[0], r2
1153; CHECK-NEXT: vmov r2, s14
1154; CHECK-NEXT: vmov.32 q5[1], r3
1155; CHECK-NEXT: vmov r3, s18
1156; CHECK-NEXT: umull r2, r3, r3, r2
1157; CHECK-NEXT: vmov.32 q5[2], r2
1158; CHECK-NEXT: vmov.32 q5[3], r3
1159; CHECK-NEXT: vmov r0, s20
1160; CHECK-NEXT: vmov r2, s21
1161; CHECK-NEXT: adds.w r0, r0, r12
1162; CHECK-NEXT: adcs r1, r2
1163; CHECK-NEXT: vmov r2, s22
1164; CHECK-NEXT: adds.w r12, r0, r2
1165; CHECK-NEXT: vmov.u8 r2, q1[12]
1166; CHECK-NEXT: adcs r1, r3
1167; CHECK-NEXT: vmov.u8 r3, q0[12]
1168; CHECK-NEXT: vmov.32 q3[0], r2
1169; CHECK-NEXT: vmov.u8 r2, q1[13]
1170; CHECK-NEXT: vmov.32 q4[0], r3
1171; CHECK-NEXT: vmov.u8 r3, q0[13]
1172; CHECK-NEXT: vmov.32 q3[2], r2
1173; CHECK-NEXT: vmov.32 q4[2], r3
1174; CHECK-NEXT: vand q3, q3, q2
1175; CHECK-NEXT: vand q4, q4, q2
1176; CHECK-NEXT: vmov r2, s12
1177; CHECK-NEXT: vmov r3, s16
1178; CHECK-NEXT: umull r2, r3, r3, r2
1179; CHECK-NEXT: vmov.32 q5[0], r2
1180; CHECK-NEXT: vmov r2, s14
1181; CHECK-NEXT: vmov.32 q5[1], r3
1182; CHECK-NEXT: vmov r3, s18
1183; CHECK-NEXT: umull r2, r3, r3, r2
1184; CHECK-NEXT: vmov.32 q5[2], r2
1185; CHECK-NEXT: vmov.32 q5[3], r3
1186; CHECK-NEXT: vmov r0, s20
1187; CHECK-NEXT: vmov r2, s21
1188; CHECK-NEXT: adds.w r0, r0, r12
1189; CHECK-NEXT: adcs r1, r2
1190; CHECK-NEXT: vmov r2, s22
1191; CHECK-NEXT: adds r0, r0, r2
1192; CHECK-NEXT: vmov.u8 r2, q1[14]
1193; CHECK-NEXT: vmov.32 q3[0], r2
1194; CHECK-NEXT: vmov.u8 r2, q1[15]
1195; CHECK-NEXT: adcs r1, r3
1196; CHECK-NEXT: vmov.32 q3[2], r2
1197; CHECK-NEXT: vmov.u8 r3, q0[14]
1198; CHECK-NEXT: vand q1, q3, q2
1199; CHECK-NEXT: vmov.32 q3[0], r3
1200; CHECK-NEXT: vmov.u8 r3, q0[15]
1201; CHECK-NEXT: vmov.32 q3[2], r3
1202; CHECK-NEXT: vmov r2, s4
1203; CHECK-NEXT: vand q0, q3, q2
1204; CHECK-NEXT: vmov r3, s0
1205; CHECK-NEXT: umlal r0, r1, r3, r2
1206; CHECK-NEXT: vmov r2, s6
1207; CHECK-NEXT: vmov r3, s2
1208; CHECK-NEXT: umlal r0, r1, r3, r2
1209; CHECK-NEXT: vpop {d8, d9, d10, d11}
1210; CHECK-NEXT: pop {r7, pc}
1211; CHECK-NEXT: .p2align 4
1212; CHECK-NEXT: @ %bb.1:
1213; CHECK-NEXT: .LCPI23_0:
1214; CHECK-NEXT: .long 255 @ 0xff
1215; CHECK-NEXT: .long 0 @ 0x0
1216; CHECK-NEXT: .long 255 @ 0xff
1217; CHECK-NEXT: .long 0 @ 0x0
1218entry:
1219 %xx = zext <16 x i8> %x to <16 x i64>
1220 %yy = zext <16 x i8> %y to <16 x i64>
1221 %m = mul <16 x i64> %xx, %yy
1222 %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
1223 ret i64 %z
1224}
1225
; Multiply-reduce of <16 x i8> operands sign-extended all the way to i64.
; The expected assembly is fully scalarised: every lane is extracted with
; vmov.u8 and sign-extended with sxtb, each lane pair is multiplied with smull,
; and the 64-bit products are folded into a running sum with adds/adc-style
; pairs. The last two lanes (14, 15) are accumulated directly into r0/r1 with
; smlal. Unlike the zext variant, no constant-pool mask and no register saves
; appear in the expected output.
1226define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
1227; CHECK-LABEL: add_v16i8_v16i64_sext:
1228; CHECK: @ %bb.0: @ %entry
1229; CHECK-NEXT: vmov.u8 r0, q1[0]
1230; CHECK-NEXT: vmov.u8 r1, q0[0]
1231; CHECK-NEXT: sxtb r0, r0
1232; CHECK-NEXT: sxtb r1, r1
1233; CHECK-NEXT: smull r0, r1, r1, r0
1234; CHECK-NEXT: vmov.32 q2[0], r0
1235; CHECK-NEXT: vmov.u8 r0, q1[1]
1236; CHECK-NEXT: vmov.32 q2[1], r1
1237; CHECK-NEXT: vmov.u8 r1, q0[1]
1238; CHECK-NEXT: sxtb r0, r0
1239; CHECK-NEXT: sxtb r1, r1
1240; CHECK-NEXT: smull r0, r1, r1, r0
1241; CHECK-NEXT: vmov.32 q2[2], r0
1242; CHECK-NEXT: vmov.32 q2[3], r1
1243; CHECK-NEXT: vmov r2, s10
1244; CHECK-NEXT: vmov r3, s8
1245; CHECK-NEXT: vmov r0, s9
1246; CHECK-NEXT: adds r2, r2, r3
1247; CHECK-NEXT: vmov.u8 r3, q0[2]
1248; CHECK-NEXT: adc.w r12, r0, r1
1249; CHECK-NEXT: vmov.u8 r1, q1[2]
1250; CHECK-NEXT: sxtb r1, r1
1251; CHECK-NEXT: sxtb r3, r3
1252; CHECK-NEXT: smull r1, r3, r3, r1
1253; CHECK-NEXT: vmov.32 q2[0], r1
1254; CHECK-NEXT: vmov.u8 r1, q1[3]
1255; CHECK-NEXT: vmov.32 q2[1], r3
1256; CHECK-NEXT: vmov.u8 r3, q0[3]
1257; CHECK-NEXT: sxtb r1, r1
1258; CHECK-NEXT: sxtb r3, r3
1259; CHECK-NEXT: smull r1, r3, r3, r1
1260; CHECK-NEXT: vmov.32 q2[2], r1
1261; CHECK-NEXT: vmov.32 q2[3], r3
1262; CHECK-NEXT: vmov r0, s8
1263; CHECK-NEXT: vmov r1, s9
1264; CHECK-NEXT: adds r0, r0, r2
1265; CHECK-NEXT: vmov r2, s10
1266; CHECK-NEXT: adc.w r1, r1, r12
1267; CHECK-NEXT: adds.w r12, r0, r2
1268; CHECK-NEXT: vmov.u8 r2, q1[4]
1269; CHECK-NEXT: adcs r1, r3
1270; CHECK-NEXT: vmov.u8 r3, q0[4]
1271; CHECK-NEXT: sxtb r2, r2
1272; CHECK-NEXT: sxtb r3, r3
1273; CHECK-NEXT: smull r2, r3, r3, r2
1274; CHECK-NEXT: vmov.32 q2[0], r2
1275; CHECK-NEXT: vmov.u8 r2, q1[5]
1276; CHECK-NEXT: vmov.32 q2[1], r3
1277; CHECK-NEXT: vmov.u8 r3, q0[5]
1278; CHECK-NEXT: sxtb r2, r2
1279; CHECK-NEXT: sxtb r3, r3
1280; CHECK-NEXT: smull r2, r3, r3, r2
1281; CHECK-NEXT: vmov.32 q2[2], r2
1282; CHECK-NEXT: vmov.32 q2[3], r3
1283; CHECK-NEXT: vmov r0, s8
1284; CHECK-NEXT: vmov r2, s9
1285; CHECK-NEXT: adds.w r0, r0, r12
1286; CHECK-NEXT: adcs r1, r2
1287; CHECK-NEXT: vmov r2, s10
1288; CHECK-NEXT: adds.w r12, r0, r2
1289; CHECK-NEXT: vmov.u8 r2, q1[6]
1290; CHECK-NEXT: adcs r1, r3
1291; CHECK-NEXT: vmov.u8 r3, q0[6]
1292; CHECK-NEXT: sxtb r2, r2
1293; CHECK-NEXT: sxtb r3, r3
1294; CHECK-NEXT: smull r2, r3, r3, r2
1295; CHECK-NEXT: vmov.32 q2[0], r2
1296; CHECK-NEXT: vmov.u8 r2, q1[7]
1297; CHECK-NEXT: vmov.32 q2[1], r3
1298; CHECK-NEXT: vmov.u8 r3, q0[7]
1299; CHECK-NEXT: sxtb r2, r2
1300; CHECK-NEXT: sxtb r3, r3
1301; CHECK-NEXT: smull r2, r3, r3, r2
1302; CHECK-NEXT: vmov.32 q2[2], r2
1303; CHECK-NEXT: vmov.32 q2[3], r3
1304; CHECK-NEXT: vmov r0, s8
1305; CHECK-NEXT: vmov r2, s9
1306; CHECK-NEXT: adds.w r0, r0, r12
1307; CHECK-NEXT: adcs r1, r2
1308; CHECK-NEXT: vmov r2, s10
1309; CHECK-NEXT: adds.w r12, r0, r2
1310; CHECK-NEXT: vmov.u8 r2, q1[8]
1311; CHECK-NEXT: adcs r1, r3
1312; CHECK-NEXT: vmov.u8 r3, q0[8]
1313; CHECK-NEXT: sxtb r2, r2
1314; CHECK-NEXT: sxtb r3, r3
1315; CHECK-NEXT: smull r2, r3, r3, r2
1316; CHECK-NEXT: vmov.32 q2[0], r2
1317; CHECK-NEXT: vmov.u8 r2, q1[9]
1318; CHECK-NEXT: vmov.32 q2[1], r3
1319; CHECK-NEXT: vmov.u8 r3, q0[9]
1320; CHECK-NEXT: sxtb r2, r2
1321; CHECK-NEXT: sxtb r3, r3
1322; CHECK-NEXT: smull r2, r3, r3, r2
1323; CHECK-NEXT: vmov.32 q2[2], r2
1324; CHECK-NEXT: vmov.32 q2[3], r3
1325; CHECK-NEXT: vmov r0, s8
1326; CHECK-NEXT: vmov r2, s9
1327; CHECK-NEXT: adds.w r0, r0, r12
1328; CHECK-NEXT: adcs r1, r2
1329; CHECK-NEXT: vmov r2, s10
1330; CHECK-NEXT: adds.w r12, r0, r2
1331; CHECK-NEXT: vmov.u8 r2, q1[10]
1332; CHECK-NEXT: adcs r1, r3
1333; CHECK-NEXT: vmov.u8 r3, q0[10]
1334; CHECK-NEXT: sxtb r2, r2
1335; CHECK-NEXT: sxtb r3, r3
1336; CHECK-NEXT: smull r2, r3, r3, r2
1337; CHECK-NEXT: vmov.32 q2[0], r2
1338; CHECK-NEXT: vmov.u8 r2, q1[11]
1339; CHECK-NEXT: vmov.32 q2[1], r3
1340; CHECK-NEXT: vmov.u8 r3, q0[11]
1341; CHECK-NEXT: sxtb r2, r2
1342; CHECK-NEXT: sxtb r3, r3
1343; CHECK-NEXT: smull r2, r3, r3, r2
1344; CHECK-NEXT: vmov.32 q2[2], r2
1345; CHECK-NEXT: vmov.32 q2[3], r3
1346; CHECK-NEXT: vmov r0, s8
1347; CHECK-NEXT: vmov r2, s9
1348; CHECK-NEXT: adds.w r0, r0, r12
1349; CHECK-NEXT: adcs r1, r2
1350; CHECK-NEXT: vmov r2, s10
1351; CHECK-NEXT: adds.w r12, r0, r2
1352; CHECK-NEXT: vmov.u8 r2, q1[12]
1353; CHECK-NEXT: adcs r1, r3
1354; CHECK-NEXT: vmov.u8 r3, q0[12]
1355; CHECK-NEXT: sxtb r2, r2
1356; CHECK-NEXT: sxtb r3, r3
1357; CHECK-NEXT: smull r2, r3, r3, r2
1358; CHECK-NEXT: vmov.32 q2[0], r2
1359; CHECK-NEXT: vmov.u8 r2, q1[13]
1360; CHECK-NEXT: vmov.32 q2[1], r3
1361; CHECK-NEXT: vmov.u8 r3, q0[13]
1362; CHECK-NEXT: sxtb r2, r2
1363; CHECK-NEXT: sxtb r3, r3
1364; CHECK-NEXT: smull r2, r3, r3, r2
1365; CHECK-NEXT: vmov.32 q2[2], r2
1366; CHECK-NEXT: vmov.32 q2[3], r3
1367; CHECK-NEXT: vmov r0, s8
1368; CHECK-NEXT: vmov r2, s9
1369; CHECK-NEXT: adds.w r0, r0, r12
1370; CHECK-NEXT: adcs r1, r2
1371; CHECK-NEXT: vmov r2, s10
1372; CHECK-NEXT: adds r0, r0, r2
1373; CHECK-NEXT: vmov.u8 r2, q1[14]
1374; CHECK-NEXT: adcs r1, r3
1375; CHECK-NEXT: vmov.u8 r3, q0[14]
1376; CHECK-NEXT: sxtb r2, r2
1377; CHECK-NEXT: sxtb r3, r3
1378; CHECK-NEXT: smlal r0, r1, r3, r2
1379; CHECK-NEXT: vmov.u8 r2, q1[15]
1380; CHECK-NEXT: vmov.u8 r3, q0[15]
1381; CHECK-NEXT: sxtb r2, r2
1382; CHECK-NEXT: sxtb r3, r3
1383; CHECK-NEXT: smlal r0, r1, r3, r2
1384; CHECK-NEXT: bx lr
1385entry:
1386 %xx = sext <16 x i8> %x to <16 x i64>
1387 %yy = sext <16 x i8> %y to <16 x i64>
1388 %m = mul <16 x i64> %xx, %yy
1389 %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
1390 ret i64 %z
1391}
1392
; Multiply-reduce of <2 x i8> operands zero-extended to <2 x i64>.
; Expected assembly: both operands are masked with the constant-pool vector
; .LCPI25_0 (0xff, 0, 0xff, 0), the two lanes are multiplied with umull, and
; the products are combined with a plain add for the low word and orrs for the
; high word.
1393define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
1394; CHECK-LABEL: add_v2i8_v2i64_zext:
1395; CHECK: @ %bb.0: @ %entry
1396; CHECK-NEXT: adr r0, .LCPI25_0
1397; CHECK-NEXT: vldrw.u32 q2, [r0]
1398; CHECK-NEXT: vand q1, q1, q2
1399; CHECK-NEXT: vand q0, q0, q2
1400; CHECK-NEXT: vmov r0, s6
1401; CHECK-NEXT: vmov r1, s2
1402; CHECK-NEXT: vmov r2, s4
1403; CHECK-NEXT: vmov r3, s0
1404; CHECK-NEXT: umull r0, r1, r1, r0
1405; CHECK-NEXT: umull r2, r3, r3, r2
1406; CHECK-NEXT: add r0, r2
1407; CHECK-NEXT: orrs r1, r3
1408; CHECK-NEXT: bx lr
1409; CHECK-NEXT: .p2align 4
1410; CHECK-NEXT: @ %bb.1:
1411; CHECK-NEXT: .LCPI25_0:
1412; CHECK-NEXT: .long 255 @ 0xff
1413; CHECK-NEXT: .long 0 @ 0x0
1414; CHECK-NEXT: .long 255 @ 0xff
1415; CHECK-NEXT: .long 0 @ 0x0
1416entry:
1417 %xx = zext <2 x i8> %x to <2 x i64>
1418 %yy = zext <2 x i8> %y to <2 x i64>
1419 %m = mul <2 x i64> %xx, %yy
1420 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
1421 ret i64 %z
1422}
1423
; Multiply-reduce of <2 x i8> operands sign-extended to <2 x i64>.
; Expected assembly: the two lanes of each operand are moved to core registers
; (s4/s0 and s6/s2), sign-extended with sxtb, then combined with one smull
; followed by one smlal accumulating into r0/r1.
1424define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
1425; CHECK-LABEL: add_v2i8_v2i64_sext:
1426; CHECK: @ %bb.0: @ %entry
1427; CHECK-NEXT: vmov r0, s4
1428; CHECK-NEXT: vmov r1, s0
1429; CHECK-NEXT: vmov r2, s6
1430; CHECK-NEXT: vmov r3, s2
1431; CHECK-NEXT: sxtb r0, r0
1432; CHECK-NEXT: sxtb r1, r1
1433; CHECK-NEXT: smull r0, r1, r1, r0
1434; CHECK-NEXT: sxtb r2, r2
1435; CHECK-NEXT: sxtb r3, r3
1436; CHECK-NEXT: smlal r0, r1, r3, r2
1437; CHECK-NEXT: bx lr
1438entry:
1439 %xx = sext <2 x i8> %x to <2 x i64>
1440 %yy = sext <2 x i8> %y to <2 x i64>
1441 %m = mul <2 x i64> %xx, %yy
1442 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
1443 ret i64 %z
1444}
1445
; Multiply-reduce on native <2 x i64> operands (no extension involved).
; Expected assembly: each 64-bit lane multiply is expanded in core registers
; as one umull plus two mla for the cross terms, and the two 64-bit products
; are summed with adds.w/adcs. The sequence saves {r4, lr}.
1446define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
1447; CHECK-LABEL: add_v2i64_v2i64:
1448; CHECK: @ %bb.0: @ %entry
1449; CHECK-NEXT: .save {r4, lr}
1450; CHECK-NEXT: push {r4, lr}
1451; CHECK-NEXT: vmov r0, s4
1452; CHECK-NEXT: vmov r1, s0
1453; CHECK-NEXT: vmov r2, s5
1454; CHECK-NEXT: vmov r4, s7
1455; CHECK-NEXT: umull r12, r3, r1, r0
1456; CHECK-NEXT: mla r1, r1, r2, r3
1457; CHECK-NEXT: vmov r2, s1
1458; CHECK-NEXT: vmov r3, s2
1459; CHECK-NEXT: vmov.32 q2[0], r12
1460; CHECK-NEXT: mla r1, r2, r0, r1
1461; CHECK-NEXT: vmov r2, s6
1462; CHECK-NEXT: vmov.32 q2[1], r1
1463; CHECK-NEXT: vmov r12, s8
1464; CHECK-NEXT: umull lr, r0, r3, r2
1465; CHECK-NEXT: mla r0, r3, r4, r0
1466; CHECK-NEXT: vmov r3, s3
1467; CHECK-NEXT: mla r2, r3, r2, r0
1468; CHECK-NEXT: adds.w r0, r12, lr
1469; CHECK-NEXT: adcs r1, r2
1470; CHECK-NEXT: pop {r4, pc}
1471entry:
1472 %m = mul <2 x i64> %x, %y
1473 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
1474 ret i64 %z
1475}
1476
; Accumulating variant: mul <4 x i32>, reduce.add, then add the scalar %a.
; Expected assembly: vmul.i32 followed by the accumulating reduction
; vaddva.u32 into r0, with no separate scalar add.
1477define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, <4 x i32> %y, i32 %a) {
1478; CHECK-LABEL: add_v4i32_v4i32_acc:
1479; CHECK: @ %bb.0: @ %entry
1480; CHECK-NEXT: vmul.i32 q0, q0, q1
1481; CHECK-NEXT: vaddva.u32 r0, q0
1482; CHECK-NEXT: bx lr
1483entry:
1484 %m = mul <4 x i32> %x, %y
1485 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
1486 %r = add i32 %z, %a
1487 ret i32 %r
1488}
1489
; Accumulating multiply-reduce of <4 x i32> operands zero-extended to i64.
; Expected assembly is scalarised: lanes 0 and 1 are multiplied with umull and
; summed with adds.w/adc.w; lanes 2 and 3 are masked with the constant-pool
; vector .LCPI29_0 (0xffffffff, 0, 0xffffffff, 0) and accumulated with umlal
; into lr/r3. The trailing adds.w/adcs on r0/r1 corresponds to the
; `add i64 %z, %a` (presumably %a arrives in r0/r1 - confirm against AAPCS).
; The sequence spills d8-d9 and {r7, lr}.
1490define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, <4 x i32> %y, i64 %a) {
1491; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
1492; CHECK: @ %bb.0: @ %entry
1493; CHECK-NEXT: .save {r7, lr}
1494; CHECK-NEXT: push {r7, lr}
1495; CHECK-NEXT: .vsave {d8, d9}
1496; CHECK-NEXT: vpush {d8, d9}
1497; CHECK-NEXT: vmov.f32 s8, s4
1498; CHECK-NEXT: vmov.f32 s12, s0
1499; CHECK-NEXT: vmov.f32 s10, s5
1500; CHECK-NEXT: vmov.f32 s14, s1
1501; CHECK-NEXT: vmov r2, s8
1502; CHECK-NEXT: vmov r3, s12
1503; CHECK-NEXT: umull r2, r3, r3, r2
1504; CHECK-NEXT: vmov.32 q4[0], r2
1505; CHECK-NEXT: vmov r2, s10
1506; CHECK-NEXT: vmov.32 q4[1], r3
1507; CHECK-NEXT: vmov r3, s14
1508; CHECK-NEXT: vmov.f32 s8, s6
1509; CHECK-NEXT: vmov.f32 s12, s2
1510; CHECK-NEXT: vmov.f32 s10, s7
1511; CHECK-NEXT: vmov.f32 s14, s3
1512; CHECK-NEXT: umull r2, r3, r3, r2
1513; CHECK-NEXT: vmov.32 q4[2], r2
1514; CHECK-NEXT: vmov.32 q4[3], r3
1515; CHECK-NEXT: vmov lr, s18
1516; CHECK-NEXT: vmov r2, s16
1517; CHECK-NEXT: vmov r12, s17
1518; CHECK-NEXT: adds.w lr, lr, r2
1519; CHECK-NEXT: adr r2, .LCPI29_0
1520; CHECK-NEXT: vldrw.u32 q1, [r2]
1521; CHECK-NEXT: adc.w r3, r3, r12
1522; CHECK-NEXT: vand q2, q2, q1
1523; CHECK-NEXT: vand q0, q3, q1
1524; CHECK-NEXT: vmov r12, s8
1525; CHECK-NEXT: vmov r2, s0
1526; CHECK-NEXT: umlal lr, r3, r2, r12
1527; CHECK-NEXT: vmov r12, s10
1528; CHECK-NEXT: vmov r2, s2
1529; CHECK-NEXT: umlal lr, r3, r2, r12
1530; CHECK-NEXT: adds.w r0, r0, lr
1531; CHECK-NEXT: adcs r1, r3
1532; CHECK-NEXT: vpop {d8, d9}
1533; CHECK-NEXT: pop {r7, pc}
1534; CHECK-NEXT: .p2align 4
1535; CHECK-NEXT: @ %bb.1:
1536; CHECK-NEXT: .LCPI29_0:
1537; CHECK-NEXT: .long 4294967295 @ 0xffffffff
1538; CHECK-NEXT: .long 0 @ 0x0
1539; CHECK-NEXT: .long 4294967295 @ 0xffffffff
1540; CHECK-NEXT: .long 0 @ 0x0
1541entry:
1542 %xx = zext <4 x i32> %x to <4 x i64>
1543 %yy = zext <4 x i32> %y to <4 x i64>
1544 %m = mul <4 x i64> %xx, %yy
1545 %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
1546 %r = add i64 %z, %a
1547 ret i64 %r
1548}
1549
; Accumulating multiply-reduce of <4 x i32> operands sign-extended to i64.
; Expected assembly is scalarised: lanes 0 and 1 are multiplied with smull and
; summed with adds.w/adc.w; lanes 2 and 3 are accumulated with smlal into
; lr/r3. The trailing adds.w/adcs on r0/r1 corresponds to the
; `add i64 %z, %a` (presumably %a arrives in r0/r1 - confirm against AAPCS).
; Unlike the zext variant no constant-pool mask is needed; d8-d9 and {r7, lr}
; are still saved.
1550define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, <4 x i32> %y, i64 %a) {
1551; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
1552; CHECK: @ %bb.0: @ %entry
1553; CHECK-NEXT: .save {r7, lr}
1554; CHECK-NEXT: push {r7, lr}
1555; CHECK-NEXT: .vsave {d8, d9}
1556; CHECK-NEXT: vpush {d8, d9}
1557; CHECK-NEXT: vmov.f32 s8, s4
1558; CHECK-NEXT: vmov.f32 s12, s0
1559; CHECK-NEXT: vmov.f32 s10, s5
1560; CHECK-NEXT: vmov.f32 s14, s1
1561; CHECK-NEXT: vmov r2, s8
1562; CHECK-NEXT: vmov r3, s12
1563; CHECK-NEXT: smull r2, r3, r3, r2
1564; CHECK-NEXT: vmov.32 q4[0], r2
1565; CHECK-NEXT: vmov r2, s10
1566; CHECK-NEXT: vmov.32 q4[1], r3
1567; CHECK-NEXT: vmov r3, s14
1568; CHECK-NEXT: vmov.f32 s8, s6
1569; CHECK-NEXT: vmov.f32 s10, s7
1570; CHECK-NEXT: vmov.f32 s4, s2
1571; CHECK-NEXT: vmov.f32 s6, s3
1572; CHECK-NEXT: smull r2, r3, r3, r2
1573; CHECK-NEXT: vmov.32 q4[2], r2
1574; CHECK-NEXT: vmov.32 q4[3], r3
1575; CHECK-NEXT: vmov lr, s18
1576; CHECK-NEXT: vmov r2, s16
1577; CHECK-NEXT: vmov r12, s17
1578; CHECK-NEXT: adds.w lr, lr, r2
1579; CHECK-NEXT: vmov r2, s4
1580; CHECK-NEXT: adc.w r3, r3, r12
1581; CHECK-NEXT: vmov r12, s8
1582; CHECK-NEXT: smlal lr, r3, r2, r12
1583; CHECK-NEXT: vmov r12, s10
1584; CHECK-NEXT: vmov r2, s6
1585; CHECK-NEXT: smlal lr, r3, r2, r12
1586; CHECK-NEXT: adds.w r0, r0, lr
1587; CHECK-NEXT: adcs r1, r3
1588; CHECK-NEXT: vpop {d8, d9}
1589; CHECK-NEXT: pop {r7, pc}
1590entry:
1591 %xx = sext <4 x i32> %x to <4 x i64>
1592 %yy = sext <4 x i32> %y to <4 x i64>
1593 %m = mul <4 x i64> %xx, %yy
1594 %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
1595 %r = add i64 %z, %a
1596 ret i64 %r
1597}
1598
; Accumulating multiply-reduce of <2 x i32> operands zero-extended to i64.
; Expected assembly: lane 0 is multiplied with umull, lane 1 is accumulated
; with umlal into r2/lr, and the result is added to r0/r1 with adds/adc.w
; (the `add i64 %z, %a`). {r7, lr} is saved because lr is used as a scratch
; register in the sequence.
1599define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, i64 %a) {
1600; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
1601; CHECK: @ %bb.0: @ %entry
1602; CHECK-NEXT: .save {r7, lr}
1603; CHECK-NEXT: push {r7, lr}
1604; CHECK-NEXT: vmov r2, s4
1605; CHECK-NEXT: vmov r3, s0
1606; CHECK-NEXT: vmov r12, s6
1607; CHECK-NEXT: umull r2, lr, r3, r2
1608; CHECK-NEXT: vmov r3, s2
1609; CHECK-NEXT: umlal r2, lr, r3, r12
1610; CHECK-NEXT: adds r0, r0, r2
1611; CHECK-NEXT: adc.w r1, r1, lr
1612; CHECK-NEXT: pop {r7, pc}
1613entry:
1614 %xx = zext <2 x i32> %x to <2 x i64>
1615 %yy = zext <2 x i32> %y to <2 x i64>
1616 %m = mul <2 x i64> %xx, %yy
1617 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
1618 %r = add i64 %z, %a
1619 ret i64 %r
1620}
1621
; Accumulating multiply-reduce of <2 x i32> operands sign-extended to i64.
; Expected assembly: lane 0 is multiplied with smull, lane 1 is accumulated
; with smlal into r2/lr, and the result is added to r0/r1 with adds/adc.w
; (the `add i64 %z, %a`). {r7, lr} is saved because lr is used as a scratch
; register in the sequence.
1622define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, i64 %a) {
1623; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
1624; CHECK: @ %bb.0: @ %entry
1625; CHECK-NEXT: .save {r7, lr}
1626; CHECK-NEXT: push {r7, lr}
1627; CHECK-NEXT: vmov r2, s4
1628; CHECK-NEXT: vmov r3, s0
1629; CHECK-NEXT: vmov r12, s6
1630; CHECK-NEXT: smull r2, lr, r3, r2
1631; CHECK-NEXT: vmov r3, s2
1632; CHECK-NEXT: smlal r2, lr, r3, r12
1633; CHECK-NEXT: adds r0, r0, r2
1634; CHECK-NEXT: adc.w r1, r1, lr
1635; CHECK-NEXT: pop {r7, pc}
1636entry:
1637 %xx = sext <2 x i32> %x to <2 x i64>
1638 %yy = sext <2 x i32> %y to <2 x i64>
1639 %m = mul <2 x i64> %xx, %yy
1640 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
1641 %r = add i64 %z, %a
1642 ret i64 %r
1643}
1644
; Accumulating multiply-reduce of <8 x i16> operands zero-extended to
; <8 x i32>, i.e. a result twice the width of one q register.
; Expected assembly: the upper four lanes (4-7) and lower four lanes (0-3) of
; each operand are rebuilt lane by lane (vmov.u16 out, vmov.32 in), widened
; with vmovlb.u16 and multiplied with vmul.i32; the two partial product
; vectors are combined with vadd.i32 and reduced with the accumulating
; vaddva.u32 into r0.
1645define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %y, i32 %a) {
1646; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
1647; CHECK: @ %bb.0: @ %entry
1648; CHECK-NEXT: vmov.u16 r1, q1[4]
1649; CHECK-NEXT: vmov.32 q2[0], r1
1650; CHECK-NEXT: vmov.u16 r1, q1[5]
1651; CHECK-NEXT: vmov.32 q2[1], r1
1652; CHECK-NEXT: vmov.u16 r1, q1[6]
1653; CHECK-NEXT: vmov.32 q2[2], r1
1654; CHECK-NEXT: vmov.u16 r1, q1[7]
1655; CHECK-NEXT: vmov.32 q2[3], r1
1656; CHECK-NEXT: vmov.u16 r1, q0[4]
1657; CHECK-NEXT: vmov.32 q3[0], r1
1658; CHECK-NEXT: vmov.u16 r1, q0[5]
1659; CHECK-NEXT: vmov.32 q3[1], r1
1660; CHECK-NEXT: vmov.u16 r1, q0[6]
1661; CHECK-NEXT: vmov.32 q3[2], r1
1662; CHECK-NEXT: vmov.u16 r1, q0[7]
1663; CHECK-NEXT: vmov.32 q3[3], r1
1664; CHECK-NEXT: vmovlb.u16 q2, q2
1665; CHECK-NEXT: vmovlb.u16 q3, q3
1666; CHECK-NEXT: vmov.u16 r1, q1[0]
1667; CHECK-NEXT: vmul.i32 q2, q3, q2
1668; CHECK-NEXT: vmov.32 q3[0], r1
1669; CHECK-NEXT: vmov.u16 r1, q1[1]
1670; CHECK-NEXT: vmov.32 q3[1], r1
1671; CHECK-NEXT: vmov.u16 r1, q1[2]
1672; CHECK-NEXT: vmov.32 q3[2], r1
1673; CHECK-NEXT: vmov.u16 r1, q1[3]
1674; CHECK-NEXT: vmov.32 q3[3], r1
1675; CHECK-NEXT: vmov.u16 r1, q0[0]
1676; CHECK-NEXT: vmovlb.u16 q1, q3
1677; CHECK-NEXT: vmov.32 q3[0], r1
1678; CHECK-NEXT: vmov.u16 r1, q0[1]
1679; CHECK-NEXT: vmov.32 q3[1], r1
1680; CHECK-NEXT: vmov.u16 r1, q0[2]
1681; CHECK-NEXT: vmov.32 q3[2], r1
1682; CHECK-NEXT: vmov.u16 r1, q0[3]
1683; CHECK-NEXT: vmov.32 q3[3], r1
1684; CHECK-NEXT: vmovlb.u16 q0, q3
1685; CHECK-NEXT: vmul.i32 q0, q0, q1
1686; CHECK-NEXT: vadd.i32 q0, q0, q2
1687; CHECK-NEXT: vaddva.u32 r0, q0
1688; CHECK-NEXT: bx lr
1689entry:
1690 %xx = zext <8 x i16> %x to <8 x i32>
1691 %yy = zext <8 x i16> %y to <8 x i32>
1692 %m = mul <8 x i32> %xx, %yy
1693 %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
1694 %r = add i32 %z, %a
1695 ret i32 %r
1696}
1697
; Accumulating multiply-reduce of <8 x i16> operands sign-extended to
; <8 x i32>, i.e. a result twice the width of one q register.
; Expected assembly: the upper four lanes (4-7) and lower four lanes (0-3) of
; each operand are rebuilt lane by lane (vmov.u16 out, vmov.32 in), widened
; with vmovlb.s16 and multiplied with vmul.i32; the two partial product
; vectors are combined with vadd.i32 and reduced with the accumulating
; vaddva.u32 into r0.
1698define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %y, i32 %a) {
1699; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
1700; CHECK: @ %bb.0: @ %entry
1701; CHECK-NEXT: vmov.u16 r1, q1[4]
1702; CHECK-NEXT: vmov.32 q2[0], r1
1703; CHECK-NEXT: vmov.u16 r1, q1[5]
1704; CHECK-NEXT: vmov.32 q2[1], r1
1705; CHECK-NEXT: vmov.u16 r1, q1[6]
1706; CHECK-NEXT: vmov.32 q2[2], r1
1707; CHECK-NEXT: vmov.u16 r1, q1[7]
1708; CHECK-NEXT: vmov.32 q2[3], r1
1709; CHECK-NEXT: vmov.u16 r1, q0[4]
1710; CHECK-NEXT: vmov.32 q3[0], r1
1711; CHECK-NEXT: vmov.u16 r1, q0[5]
1712; CHECK-NEXT: vmov.32 q3[1], r1
1713; CHECK-NEXT: vmov.u16 r1, q0[6]
1714; CHECK-NEXT: vmov.32 q3[2], r1
1715; CHECK-NEXT: vmov.u16 r1, q0[7]
1716; CHECK-NEXT: vmov.32 q3[3], r1
1717; CHECK-NEXT: vmovlb.s16 q2, q2
1718; CHECK-NEXT: vmovlb.s16 q3, q3
1719; CHECK-NEXT: vmov.u16 r1, q1[0]
1720; CHECK-NEXT: vmul.i32 q2, q3, q2
1721; CHECK-NEXT: vmov.32 q3[0], r1
1722; CHECK-NEXT: vmov.u16 r1, q1[1]
1723; CHECK-NEXT: vmov.32 q3[1], r1
1724; CHECK-NEXT: vmov.u16 r1, q1[2]
1725; CHECK-NEXT: vmov.32 q3[2], r1
1726; CHECK-NEXT: vmov.u16 r1, q1[3]
1727; CHECK-NEXT: vmov.32 q3[3], r1
1728; CHECK-NEXT: vmov.u16 r1, q0[0]
1729; CHECK-NEXT: vmovlb.s16 q1, q3
1730; CHECK-NEXT: vmov.32 q3[0], r1
1731; CHECK-NEXT: vmov.u16 r1, q0[1]
1732; CHECK-NEXT: vmov.32 q3[1], r1
1733; CHECK-NEXT: vmov.u16 r1, q0[2]
1734; CHECK-NEXT: vmov.32 q3[2], r1
1735; CHECK-NEXT: vmov.u16 r1, q0[3]
1736; CHECK-NEXT: vmov.32 q3[3], r1
1737; CHECK-NEXT: vmovlb.s16 q0, q3
1738; CHECK-NEXT: vmul.i32 q0, q0, q1
1739; CHECK-NEXT: vadd.i32 q0, q0, q2
1740; CHECK-NEXT: vaddva.u32 r0, q0
1741; CHECK-NEXT: bx lr
1742entry:
1743 %xx = sext <8 x i16> %x to <8 x i32>
1744 %yy = sext <8 x i16> %y to <8 x i32>
1745 %m = mul <8 x i32> %xx, %yy
1746 %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
1747 %r = add i32 %z, %a
1748 ret i32 %r
1749}
1750
; Accumulating multiply-reduce of <4 x i16> operands zero-extended to
; <4 x i32>.
; Expected assembly: vmovlb.u16 on each operand, one vmul.i32, and the
; accumulating reduction vaddva.u32 into r0 (folding in the `add i32 %z, %a`).
1751define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, <4 x i16> %y, i32 %a) {
1752; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
1753; CHECK: @ %bb.0: @ %entry
1754; CHECK-NEXT: vmovlb.u16 q1, q1
1755; CHECK-NEXT: vmovlb.u16 q0, q0
1756; CHECK-NEXT: vmul.i32 q0, q0, q1
1757; CHECK-NEXT: vaddva.u32 r0, q0
1758; CHECK-NEXT: bx lr
1759entry:
1760 %xx = zext <4 x i16> %x to <4 x i32>
1761 %yy = zext <4 x i16> %y to <4 x i32>
1762 %m = mul <4 x i32> %xx, %yy
1763 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
1764 %r = add i32 %z, %a
1765 ret i32 %r
1766}
1767
; Accumulating multiply-reduce of <4 x i16> operands sign-extended to
; <4 x i32>.
; Expected assembly: vmovlb.s16 on each operand, one vmul.i32, and the
; accumulating reduction vaddva.u32 into r0 (folding in the `add i32 %z, %a`).
1768define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, <4 x i16> %y, i32 %a) {
1769; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
1770; CHECK: @ %bb.0: @ %entry
1771; CHECK-NEXT: vmovlb.s16 q1, q1
1772; CHECK-NEXT: vmovlb.s16 q0, q0
1773; CHECK-NEXT: vmul.i32 q0, q0, q1
1774; CHECK-NEXT: vaddva.u32 r0, q0
1775; CHECK-NEXT: bx lr
1776entry:
1777 %xx = sext <4 x i16> %x to <4 x i32>
1778 %yy = sext <4 x i16> %y to <4 x i32>
1779 %m = mul <4 x i32> %xx, %yy
1780 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
1781 %r = add i32 %z, %a
1782 ret i32 %r
1783}
1784
; Accumulating multiply-reduce on <8 x i16> with no extension.
; Expected assembly: vmul.i16, the accumulating reduction vaddva.u16 into r0,
; then uxth because the i16 result is returned zeroext.
1785define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16> %y, i16 %a) {
1786; CHECK-LABEL: add_v8i16_v8i16_acc:
1787; CHECK: @ %bb.0: @ %entry
1788; CHECK-NEXT: vmul.i16 q0, q0, q1
1789; CHECK-NEXT: vaddva.u16 r0, q0
1790; CHECK-NEXT: uxth r0, r0
1791; CHECK-NEXT: bx lr
1792entry:
1793 %m = mul <8 x i16> %x, %y
1794 %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
1795 %r = add i16 %z, %a
1796 ret i16 %r
1797}
1798
; Accumulating multiply-reduce of <8 x i16> operands zero-extended to i64.
; The expected assembly is fully scalarised: lanes are extracted two at a time
; with vmov.u16, placed in the low word of each 64-bit half of a q register,
; masked with the constant-pool vector .LCPI38_0 (0xffff, 0, 0xffff, 0),
; multiplied with umull and folded into a running 64-bit sum with
; adds/adc-style pairs. The last lane pair (6, 7) is accumulated with umlal
; into r2/r3, and the trailing adds/adcs on r0/r1 corresponds to the
; `add i64 %z, %a`. The sequence spills d8-d11 and {r4, lr}.
1799define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
1800; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
1801; CHECK: @ %bb.0: @ %entry
1802; CHECK-NEXT: .save {r4, lr}
1803; CHECK-NEXT: push {r4, lr}
1804; CHECK-NEXT: .vsave {d8, d9, d10, d11}
1805; CHECK-NEXT: vpush {d8, d9, d10, d11}
1806; CHECK-NEXT: vmov.u16 r2, q1[0]
1807; CHECK-NEXT: vmov.u16 r3, q0[0]
1808; CHECK-NEXT: vmov.32 q3[0], r2
1809; CHECK-NEXT: vmov.u16 r2, q1[1]
1810; CHECK-NEXT: vmov.32 q3[2], r2
1811; CHECK-NEXT: adr r2, .LCPI38_0
1812; CHECK-NEXT: vldrw.u32 q2, [r2]
1813; CHECK-NEXT: vmov.32 q4[0], r3
1814; CHECK-NEXT: vmov.u16 r3, q0[1]
1815; CHECK-NEXT: vmov.32 q4[2], r3
1816; CHECK-NEXT: vand q3, q3, q2
1817; CHECK-NEXT: vand q4, q4, q2
1818; CHECK-NEXT: vmov r2, s12
1819; CHECK-NEXT: vmov r3, s16
1820; CHECK-NEXT: umull r2, r3, r3, r2
1821; CHECK-NEXT: vmov.32 q5[0], r2
1822; CHECK-NEXT: vmov r2, s14
1823; CHECK-NEXT: vmov.32 q5[1], r3
1824; CHECK-NEXT: vmov r3, s18
1825; CHECK-NEXT: umull r2, r3, r3, r2
1826; CHECK-NEXT: vmov.32 q5[2], r2
1827; CHECK-NEXT: vmov.32 q5[3], r3
1828; CHECK-NEXT: vmov r12, s22
1829; CHECK-NEXT: vmov r2, s20
1830; CHECK-NEXT: vmov lr, s21
1831; CHECK-NEXT: adds.w r12, r12, r2
1832; CHECK-NEXT: vmov.u16 r2, q1[2]
1833; CHECK-NEXT: adc.w lr, lr, r3
1834; CHECK-NEXT: vmov.u16 r3, q0[2]
1835; CHECK-NEXT: vmov.32 q3[0], r2
1836; CHECK-NEXT: vmov.u16 r2, q1[3]
1837; CHECK-NEXT: vmov.32 q4[0], r3
1838; CHECK-NEXT: vmov.u16 r3, q0[3]
1839; CHECK-NEXT: vmov.32 q3[2], r2
1840; CHECK-NEXT: vmov.32 q4[2], r3
1841; CHECK-NEXT: vand q3, q3, q2
1842; CHECK-NEXT: vand q4, q4, q2
1843; CHECK-NEXT: vmov r2, s12
1844; CHECK-NEXT: vmov r3, s16
1845; CHECK-NEXT: umull r2, r3, r3, r2
1846; CHECK-NEXT: vmov.32 q5[0], r2
1847; CHECK-NEXT: vmov r2, s14
1848; CHECK-NEXT: vmov.32 q5[1], r3
1849; CHECK-NEXT: vmov r3, s18
1850; CHECK-NEXT: umull r2, r3, r3, r2
1851; CHECK-NEXT: vmov.32 q5[2], r2
1852; CHECK-NEXT: vmov.32 q5[3], r3
1853; CHECK-NEXT: vmov r4, s20
1854; CHECK-NEXT: vmov r2, s21
1855; CHECK-NEXT: adds.w r4, r4, r12
1856; CHECK-NEXT: adc.w r12, lr, r2
1857; CHECK-NEXT: vmov r2, s22
1858; CHECK-NEXT: adds.w lr, r4, r2
1859; CHECK-NEXT: vmov.u16 r2, q1[4]
1860; CHECK-NEXT: adc.w r12, r12, r3
1861; CHECK-NEXT: vmov.u16 r3, q0[4]
1862; CHECK-NEXT: vmov.32 q3[0], r2
1863; CHECK-NEXT: vmov.u16 r2, q1[5]
1864; CHECK-NEXT: vmov.32 q4[0], r3
1865; CHECK-NEXT: vmov.u16 r3, q0[5]
1866; CHECK-NEXT: vmov.32 q3[2], r2
1867; CHECK-NEXT: vmov.32 q4[2], r3
1868; CHECK-NEXT: vand q3, q3, q2
1869; CHECK-NEXT: vand q4, q4, q2
1870; CHECK-NEXT: vmov r2, s12
1871; CHECK-NEXT: vmov r3, s16
1872; CHECK-NEXT: umull r2, r3, r3, r2
1873; CHECK-NEXT: vmov.32 q5[0], r2
1874; CHECK-NEXT: vmov r2, s14
1875; CHECK-NEXT: vmov.32 q5[1], r3
1876; CHECK-NEXT: vmov r3, s18
1877; CHECK-NEXT: umull r2, r3, r3, r2
1878; CHECK-NEXT: vmov.32 q5[2], r2
1879; CHECK-NEXT: vmov.32 q5[3], r3
1880; CHECK-NEXT: vmov r4, s20
1881; CHECK-NEXT: vmov r2, s21
1882; CHECK-NEXT: adds.w r4, r4, lr
1883; CHECK-NEXT: adc.w r12, r12, r2
1884; CHECK-NEXT: vmov r2, s22
1885; CHECK-NEXT: adds r2, r2, r4
1886; CHECK-NEXT: vmov.u16 r4, q1[6]
1887; CHECK-NEXT: vmov.32 q3[0], r4
1888; CHECK-NEXT: vmov.u16 r4, q1[7]
1889; CHECK-NEXT: vmov.32 q3[2], r4
1890; CHECK-NEXT: vmov.u16 r4, q0[6]
1891; CHECK-NEXT: vand q1, q3, q2
1892; CHECK-NEXT: vmov.32 q3[0], r4
1893; CHECK-NEXT: vmov.u16 r4, q0[7]
1894; CHECK-NEXT: adc.w r3, r3, r12
1895; CHECK-NEXT: vmov.32 q3[2], r4
1896; CHECK-NEXT: vmov r12, s4
1897; CHECK-NEXT: vand q0, q3, q2
1898; CHECK-NEXT: vmov r4, s0
1899; CHECK-NEXT: umlal r2, r3, r4, r12
1900; CHECK-NEXT: vmov r12, s6
1901; CHECK-NEXT: vmov r4, s2
1902; CHECK-NEXT: umlal r2, r3, r4, r12
1903; CHECK-NEXT: adds r0, r0, r2
1904; CHECK-NEXT: adcs r1, r3
1905; CHECK-NEXT: vpop {d8, d9, d10, d11}
1906; CHECK-NEXT: pop {r4, pc}
1907; CHECK-NEXT: .p2align 4
1908; CHECK-NEXT: @ %bb.1:
1909; CHECK-NEXT: .LCPI38_0:
1910; CHECK-NEXT: .long 65535 @ 0xffff
1911; CHECK-NEXT: .long 0 @ 0x0
1912; CHECK-NEXT: .long 65535 @ 0xffff
1913; CHECK-NEXT: .long 0 @ 0x0
1914entry:
1915 %xx = zext <8 x i16> %x to <8 x i64>
1916 %yy = zext <8 x i16> %y to <8 x i64>
1917 %m = mul <8 x i64> %xx, %yy
1918 %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
1919 %r = add i64 %z, %a
1920 ret i64 %r
1921}
1922
; Test: sign-extend two <8 x i16> vectors to <8 x i64>, multiply them, sum the
; products with vector.reduce.add and add the i64 accumulator %a.
; The expected output below is fully scalarised: each lane is pulled out with
; vmov.u16, sign-extended with sxth and multiplied with smull (smlal for the
; last two lanes); the 64-bit partial sums are chained with adds/adc pairs and
; the final adds r0 / adcs r1 folds the result into r0:r1.
1923define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
1924; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
1925; CHECK: @ %bb.0: @ %entry
1926; CHECK-NEXT: .save {r4, lr}
1927; CHECK-NEXT: push {r4, lr}
1928; CHECK-NEXT: vmov.u16 r2, q1[0]
1929; CHECK-NEXT: vmov.u16 r3, q0[0]
1930; CHECK-NEXT: sxth r2, r2
1931; CHECK-NEXT: sxth r3, r3
1932; CHECK-NEXT: smull r2, r3, r3, r2
1933; CHECK-NEXT: vmov.32 q2[0], r2
1934; CHECK-NEXT: vmov.u16 r2, q1[1]
1935; CHECK-NEXT: vmov.32 q2[1], r3
1936; CHECK-NEXT: vmov.u16 r3, q0[1]
1937; CHECK-NEXT: sxth r2, r2
1938; CHECK-NEXT: sxth r3, r3
1939; CHECK-NEXT: smull r2, r3, r3, r2
1940; CHECK-NEXT: vmov.32 q2[2], r2
1941; CHECK-NEXT: vmov.32 q2[3], r3
1942; CHECK-NEXT: vmov lr, s10
1943; CHECK-NEXT: vmov r2, s8
1944; CHECK-NEXT: vmov r12, s9
1945; CHECK-NEXT: adds.w lr, lr, r2
1946; CHECK-NEXT: vmov.u16 r2, q1[2]
1947; CHECK-NEXT: adc.w r12, r12, r3
1948; CHECK-NEXT: vmov.u16 r3, q0[2]
1949; CHECK-NEXT: sxth r2, r2
1950; CHECK-NEXT: sxth r3, r3
1951; CHECK-NEXT: smull r2, r3, r3, r2
1952; CHECK-NEXT: vmov.32 q2[0], r2
1953; CHECK-NEXT: vmov.u16 r2, q1[3]
1954; CHECK-NEXT: vmov.32 q2[1], r3
1955; CHECK-NEXT: vmov.u16 r3, q0[3]
1956; CHECK-NEXT: sxth r2, r2
1957; CHECK-NEXT: sxth r3, r3
1958; CHECK-NEXT: smull r2, r3, r3, r2
1959; CHECK-NEXT: vmov.32 q2[2], r2
1960; CHECK-NEXT: vmov.32 q2[3], r3
1961; CHECK-NEXT: vmov r4, s8
1962; CHECK-NEXT: vmov r2, s9
1963; CHECK-NEXT: adds.w r4, r4, lr
1964; CHECK-NEXT: adc.w r12, r12, r2
1965; CHECK-NEXT: vmov r2, s10
1966; CHECK-NEXT: adds.w lr, r4, r2
1967; CHECK-NEXT: vmov.u16 r4, q1[4]
1968; CHECK-NEXT: vmov.u16 r2, q0[4]
1969; CHECK-NEXT: sxth r4, r4
1970; CHECK-NEXT: sxth r2, r2
1971; CHECK-NEXT: adc.w r12, r12, r3
1972; CHECK-NEXT: smull r2, r4, r2, r4
1973; CHECK-NEXT: vmov.32 q2[0], r2
1974; CHECK-NEXT: vmov.u16 r2, q1[5]
1975; CHECK-NEXT: vmov.32 q2[1], r4
1976; CHECK-NEXT: vmov.u16 r4, q0[5]
1977; CHECK-NEXT: sxth r2, r2
1978; CHECK-NEXT: sxth r4, r4
1979; CHECK-NEXT: smull r2, r4, r4, r2
1980; CHECK-NEXT: vmov.32 q2[2], r2
1981; CHECK-NEXT: vmov.32 q2[3], r4
1982; CHECK-NEXT: vmov r3, s8
1983; CHECK-NEXT: vmov r2, s9
1984; CHECK-NEXT: adds.w r3, r3, lr
1985; CHECK-NEXT: adc.w r12, r12, r2
1986; CHECK-NEXT: vmov r2, s10
1987; CHECK-NEXT: adds r2, r2, r3
1988; CHECK-NEXT: adc.w r3, r12, r4
1989; CHECK-NEXT: vmov.u16 r4, q1[6]
1990; CHECK-NEXT: sxth.w r12, r4
1991; CHECK-NEXT: vmov.u16 r4, q0[6]
1992; CHECK-NEXT: sxth r4, r4
1993; CHECK-NEXT: smlal r2, r3, r4, r12
1994; CHECK-NEXT: vmov.u16 r4, q1[7]
1995; CHECK-NEXT: sxth.w r12, r4
1996; CHECK-NEXT: vmov.u16 r4, q0[7]
1997; CHECK-NEXT: sxth r4, r4
1998; CHECK-NEXT: smlal r2, r3, r4, r12
1999; CHECK-NEXT: adds r0, r0, r2
2000; CHECK-NEXT: adcs r1, r3
2001; CHECK-NEXT: pop {r4, pc}
2002entry:
2003 %xx = sext <8 x i16> %x to <8 x i64>
2004 %yy = sext <8 x i16> %y to <8 x i64>
2005 %m = mul <8 x i64> %xx, %yy
2006 %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
; Fold the scalar accumulator into the reduced sum.
2007 %r = add i64 %z, %a
2008 ret i64 %r
2009}
2010
; Test: zero-extend two <2 x i16> vectors to <2 x i64>, multiply them, sum the
; two products with vector.reduce.add and add the i64 accumulator %a.
; The expected output masks both inputs with the constant-pool vector
; .LCPI40_0 (0xffff, 0, 0xffff, 0) via vand, multiplies the first lane with
; umull and accumulates the second with umlal, then adds the result into r0:r1.
2011define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, i64 %a) {
2012; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
2013; CHECK: @ %bb.0: @ %entry
2014; CHECK-NEXT: .save {r7, lr}
2015; CHECK-NEXT: push {r7, lr}
2016; CHECK-NEXT: adr r2, .LCPI40_0
2017; CHECK-NEXT: vldrw.u32 q2, [r2]
2018; CHECK-NEXT: vand q1, q1, q2
2019; CHECK-NEXT: vand q0, q0, q2
2020; CHECK-NEXT: vmov r2, s4
2021; CHECK-NEXT: vmov r3, s0
2022; CHECK-NEXT: vmov r12, s6
2023; CHECK-NEXT: umull r2, lr, r3, r2
2024; CHECK-NEXT: vmov r3, s2
2025; CHECK-NEXT: umlal r2, lr, r3, r12
2026; CHECK-NEXT: adds r0, r0, r2
2027; CHECK-NEXT: adc.w r1, r1, lr
2028; CHECK-NEXT: pop {r7, pc}
2029; CHECK-NEXT: .p2align 4
2030; CHECK-NEXT: @ %bb.1:
2031; CHECK-NEXT: .LCPI40_0:
2032; CHECK-NEXT: .long 65535 @ 0xffff
2033; CHECK-NEXT: .long 0 @ 0x0
2034; CHECK-NEXT: .long 65535 @ 0xffff
2035; CHECK-NEXT: .long 0 @ 0x0
2036entry:
2037 %xx = zext <2 x i16> %x to <2 x i64>
2038 %yy = zext <2 x i16> %y to <2 x i64>
2039 %m = mul <2 x i64> %xx, %yy
2040 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
; Fold the scalar accumulator into the reduced sum.
2041 %r = add i64 %z, %a
2042 ret i64 %r
2043}
2044
; Test: sign-extend two <2 x i16> vectors to <2 x i64>, multiply them, sum the
; two products with vector.reduce.add and add the i64 accumulator %a.
; The expected output moves each lane to a core register, sign-extends it with
; sxth, multiplies the first lane with smull and accumulates the second with
; smlal, then adds the result into r0:r1.
2045define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, i64 %a) {
2046; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
2047; CHECK: @ %bb.0: @ %entry
2048; CHECK-NEXT: .save {r7, lr}
2049; CHECK-NEXT: push {r7, lr}
2050; CHECK-NEXT: vmov r2, s4
2051; CHECK-NEXT: vmov r3, s0
2052; CHECK-NEXT: sxth r2, r2
2053; CHECK-NEXT: sxth r3, r3
2054; CHECK-NEXT: smull r2, r12, r3, r2
2055; CHECK-NEXT: vmov r3, s6
2056; CHECK-NEXT: sxth.w lr, r3
2057; CHECK-NEXT: vmov r3, s2
2058; CHECK-NEXT: sxth r3, r3
2059; CHECK-NEXT: smlal r2, r12, r3, lr
2060; CHECK-NEXT: adds r0, r0, r2
2061; CHECK-NEXT: adc.w r1, r1, r12
2062; CHECK-NEXT: pop {r7, pc}
2063entry:
2064 %xx = sext <2 x i16> %x to <2 x i64>
2065 %yy = sext <2 x i16> %y to <2 x i64>
2066 %m = mul <2 x i64> %xx, %yy
2067 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
; Fold the scalar accumulator into the reduced sum.
2068 %r = add i64 %z, %a
2069 ret i64 %r
2070}
2071
; Test: zero-extend two <16 x i8> vectors to <16 x i32>, multiply them, sum the
; products with vector.reduce.add and add the i32 accumulator %a.
; The expected output splits the work into four groups of four lanes: each
; group is rebuilt lane by lane (vmov.u8 extract, vmov.32 insert), masked with
; the #0xff splat via vand and multiplied with vmul.i32. The four products are
; combined with vadd.i32 and a single vaddva.u32 accumulates into r0.
2072define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
2073; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
2074; CHECK: @ %bb.0: @ %entry
2075; CHECK-NEXT: .vsave {d8, d9, d10, d11}
2076; CHECK-NEXT: vpush {d8, d9, d10, d11}
2077; CHECK-NEXT: vmov.u8 r1, q1[12]
2078; CHECK-NEXT: vmov.i32 q2, #0xff
2079; CHECK-NEXT: vmov.32 q3[0], r1
2080; CHECK-NEXT: vmov.u8 r1, q1[13]
2081; CHECK-NEXT: vmov.32 q3[1], r1
2082; CHECK-NEXT: vmov.u8 r1, q1[14]
2083; CHECK-NEXT: vmov.32 q3[2], r1
2084; CHECK-NEXT: vmov.u8 r1, q1[15]
2085; CHECK-NEXT: vmov.32 q3[3], r1
2086; CHECK-NEXT: vmov.u8 r1, q0[12]
2087; CHECK-NEXT: vmov.32 q4[0], r1
2088; CHECK-NEXT: vmov.u8 r1, q0[13]
2089; CHECK-NEXT: vmov.32 q4[1], r1
2090; CHECK-NEXT: vmov.u8 r1, q0[14]
2091; CHECK-NEXT: vmov.32 q4[2], r1
2092; CHECK-NEXT: vmov.u8 r1, q0[15]
2093; CHECK-NEXT: vmov.32 q4[3], r1
2094; CHECK-NEXT: vand q3, q3, q2
2095; CHECK-NEXT: vand q4, q4, q2
2096; CHECK-NEXT: vmov.u8 r1, q1[4]
2097; CHECK-NEXT: vmul.i32 q3, q4, q3
2098; CHECK-NEXT: vmov.32 q4[0], r1
2099; CHECK-NEXT: vmov.u8 r1, q1[5]
2100; CHECK-NEXT: vmov.32 q4[1], r1
2101; CHECK-NEXT: vmov.u8 r1, q1[6]
2102; CHECK-NEXT: vmov.32 q4[2], r1
2103; CHECK-NEXT: vmov.u8 r1, q1[7]
2104; CHECK-NEXT: vmov.32 q4[3], r1
2105; CHECK-NEXT: vmov.u8 r1, q0[4]
2106; CHECK-NEXT: vmov.32 q5[0], r1
2107; CHECK-NEXT: vmov.u8 r1, q0[5]
2108; CHECK-NEXT: vmov.32 q5[1], r1
2109; CHECK-NEXT: vmov.u8 r1, q0[6]
2110; CHECK-NEXT: vmov.32 q5[2], r1
2111; CHECK-NEXT: vmov.u8 r1, q0[7]
2112; CHECK-NEXT: vmov.32 q5[3], r1
2113; CHECK-NEXT: vand q4, q4, q2
2114; CHECK-NEXT: vand q5, q5, q2
2115; CHECK-NEXT: vmov.u8 r1, q1[8]
2116; CHECK-NEXT: vmul.i32 q4, q5, q4
2117; CHECK-NEXT: vadd.i32 q3, q4, q3
2118; CHECK-NEXT: vmov.32 q4[0], r1
2119; CHECK-NEXT: vmov.u8 r1, q1[9]
2120; CHECK-NEXT: vmov.32 q4[1], r1
2121; CHECK-NEXT: vmov.u8 r1, q1[10]
2122; CHECK-NEXT: vmov.32 q4[2], r1
2123; CHECK-NEXT: vmov.u8 r1, q1[11]
2124; CHECK-NEXT: vmov.32 q4[3], r1
2125; CHECK-NEXT: vmov.u8 r1, q0[8]
2126; CHECK-NEXT: vmov.32 q5[0], r1
2127; CHECK-NEXT: vmov.u8 r1, q0[9]
2128; CHECK-NEXT: vmov.32 q5[1], r1
2129; CHECK-NEXT: vmov.u8 r1, q0[10]
2130; CHECK-NEXT: vmov.32 q5[2], r1
2131; CHECK-NEXT: vmov.u8 r1, q0[11]
2132; CHECK-NEXT: vmov.32 q5[3], r1
2133; CHECK-NEXT: vand q4, q4, q2
2134; CHECK-NEXT: vand q5, q5, q2
2135; CHECK-NEXT: vmov.u8 r1, q1[0]
2136; CHECK-NEXT: vmul.i32 q4, q5, q4
2137; CHECK-NEXT: vmov.32 q5[0], r1
2138; CHECK-NEXT: vmov.u8 r1, q1[1]
2139; CHECK-NEXT: vmov.32 q5[1], r1
2140; CHECK-NEXT: vmov.u8 r1, q1[2]
2141; CHECK-NEXT: vmov.32 q5[2], r1
2142; CHECK-NEXT: vmov.u8 r1, q1[3]
2143; CHECK-NEXT: vmov.32 q5[3], r1
2144; CHECK-NEXT: vmov.u8 r1, q0[0]
2145; CHECK-NEXT: vand q1, q5, q2
2146; CHECK-NEXT: vmov.32 q5[0], r1
2147; CHECK-NEXT: vmov.u8 r1, q0[1]
2148; CHECK-NEXT: vmov.32 q5[1], r1
2149; CHECK-NEXT: vmov.u8 r1, q0[2]
2150; CHECK-NEXT: vmov.32 q5[2], r1
2151; CHECK-NEXT: vmov.u8 r1, q0[3]
2152; CHECK-NEXT: vmov.32 q5[3], r1
2153; CHECK-NEXT: vand q0, q5, q2
2154; CHECK-NEXT: vmul.i32 q0, q0, q1
2155; CHECK-NEXT: vadd.i32 q0, q0, q4
2156; CHECK-NEXT: vadd.i32 q0, q0, q3
2157; CHECK-NEXT: vaddva.u32 r0, q0
2158; CHECK-NEXT: vpop {d8, d9, d10, d11}
2159; CHECK-NEXT: bx lr
2160entry:
2161 %xx = zext <16 x i8> %x to <16 x i32>
2162 %yy = zext <16 x i8> %y to <16 x i32>
2163 %m = mul <16 x i32> %xx, %yy
2164 %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
; Fold the scalar accumulator into the reduced sum.
2165 %r = add i32 %z, %a
2166 ret i32 %r
2167}
2168
; Test: sign-extend two <16 x i8> vectors to <16 x i32>, multiply them, sum the
; products with vector.reduce.add and add the i32 accumulator %a.
; The expected output splits the work into four groups of four lanes: each
; group is rebuilt lane by lane (vmov.u8 extract, vmov.32 insert), sign-extended
; with a vmovlb.s8 + vmovlb.s16 pair and multiplied with vmul.i32. The four
; products are combined with vadd.i32 and a single vaddva.u32 accumulates
; into r0.
2169define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
2170; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
2171; CHECK: @ %bb.0: @ %entry
2172; CHECK-NEXT: .vsave {d8, d9}
2173; CHECK-NEXT: vpush {d8, d9}
2174; CHECK-NEXT: vmov.u8 r1, q1[12]
2175; CHECK-NEXT: vmov.32 q2[0], r1
2176; CHECK-NEXT: vmov.u8 r1, q1[13]
2177; CHECK-NEXT: vmov.32 q2[1], r1
2178; CHECK-NEXT: vmov.u8 r1, q1[14]
2179; CHECK-NEXT: vmov.32 q2[2], r1
2180; CHECK-NEXT: vmov.u8 r1, q1[15]
2181; CHECK-NEXT: vmov.32 q2[3], r1
2182; CHECK-NEXT: vmov.u8 r1, q0[12]
2183; CHECK-NEXT: vmov.32 q3[0], r1
2184; CHECK-NEXT: vmov.u8 r1, q0[13]
2185; CHECK-NEXT: vmov.32 q3[1], r1
2186; CHECK-NEXT: vmov.u8 r1, q0[14]
2187; CHECK-NEXT: vmov.32 q3[2], r1
2188; CHECK-NEXT: vmov.u8 r1, q0[15]
2189; CHECK-NEXT: vmov.32 q3[3], r1
2190; CHECK-NEXT: vmovlb.s8 q2, q2
2191; CHECK-NEXT: vmovlb.s8 q3, q3
2192; CHECK-NEXT: vmovlb.s16 q2, q2
2193; CHECK-NEXT: vmovlb.s16 q3, q3
2194; CHECK-NEXT: vmov.u8 r1, q1[4]
2195; CHECK-NEXT: vmul.i32 q2, q3, q2
2196; CHECK-NEXT: vmov.32 q3[0], r1
2197; CHECK-NEXT: vmov.u8 r1, q1[5]
2198; CHECK-NEXT: vmov.32 q3[1], r1
2199; CHECK-NEXT: vmov.u8 r1, q1[6]
2200; CHECK-NEXT: vmov.32 q3[2], r1
2201; CHECK-NEXT: vmov.u8 r1, q1[7]
2202; CHECK-NEXT: vmov.32 q3[3], r1
2203; CHECK-NEXT: vmov.u8 r1, q0[4]
2204; CHECK-NEXT: vmov.32 q4[0], r1
2205; CHECK-NEXT: vmov.u8 r1, q0[5]
2206; CHECK-NEXT: vmov.32 q4[1], r1
2207; CHECK-NEXT: vmov.u8 r1, q0[6]
2208; CHECK-NEXT: vmov.32 q4[2], r1
2209; CHECK-NEXT: vmov.u8 r1, q0[7]
2210; CHECK-NEXT: vmov.32 q4[3], r1
2211; CHECK-NEXT: vmovlb.s8 q3, q3
2212; CHECK-NEXT: vmovlb.s8 q4, q4
2213; CHECK-NEXT: vmovlb.s16 q3, q3
2214; CHECK-NEXT: vmovlb.s16 q4, q4
2215; CHECK-NEXT: vmov.u8 r1, q1[8]
2216; CHECK-NEXT: vmul.i32 q3, q4, q3
2217; CHECK-NEXT: vadd.i32 q2, q3, q2
2218; CHECK-NEXT: vmov.32 q3[0], r1
2219; CHECK-NEXT: vmov.u8 r1, q1[9]
2220; CHECK-NEXT: vmov.32 q3[1], r1
2221; CHECK-NEXT: vmov.u8 r1, q1[10]
2222; CHECK-NEXT: vmov.32 q3[2], r1
2223; CHECK-NEXT: vmov.u8 r1, q1[11]
2224; CHECK-NEXT: vmov.32 q3[3], r1
2225; CHECK-NEXT: vmov.u8 r1, q0[8]
2226; CHECK-NEXT: vmov.32 q4[0], r1
2227; CHECK-NEXT: vmov.u8 r1, q0[9]
2228; CHECK-NEXT: vmov.32 q4[1], r1
2229; CHECK-NEXT: vmov.u8 r1, q0[10]
2230; CHECK-NEXT: vmov.32 q4[2], r1
2231; CHECK-NEXT: vmov.u8 r1, q0[11]
2232; CHECK-NEXT: vmov.32 q4[3], r1
2233; CHECK-NEXT: vmovlb.s8 q3, q3
2234; CHECK-NEXT: vmovlb.s8 q4, q4
2235; CHECK-NEXT: vmovlb.s16 q3, q3
2236; CHECK-NEXT: vmovlb.s16 q4, q4
2237; CHECK-NEXT: vmov.u8 r1, q1[0]
2238; CHECK-NEXT: vmul.i32 q3, q4, q3
2239; CHECK-NEXT: vmov.32 q4[0], r1
2240; CHECK-NEXT: vmov.u8 r1, q1[1]
2241; CHECK-NEXT: vmov.32 q4[1], r1
2242; CHECK-NEXT: vmov.u8 r1, q1[2]
2243; CHECK-NEXT: vmov.32 q4[2], r1
2244; CHECK-NEXT: vmov.u8 r1, q1[3]
2245; CHECK-NEXT: vmov.32 q4[3], r1
2246; CHECK-NEXT: vmov.u8 r1, q0[0]
2247; CHECK-NEXT: vmovlb.s8 q1, q4
2248; CHECK-NEXT: vmov.32 q4[0], r1
2249; CHECK-NEXT: vmov.u8 r1, q0[1]
2250; CHECK-NEXT: vmovlb.s16 q1, q1
2251; CHECK-NEXT: vmov.32 q4[1], r1
2252; CHECK-NEXT: vmov.u8 r1, q0[2]
2253; CHECK-NEXT: vmov.32 q4[2], r1
2254; CHECK-NEXT: vmov.u8 r1, q0[3]
2255; CHECK-NEXT: vmov.32 q4[3], r1
2256; CHECK-NEXT: vmovlb.s8 q0, q4
2257; CHECK-NEXT: vmovlb.s16 q0, q0
2258; CHECK-NEXT: vmul.i32 q0, q0, q1
2259; CHECK-NEXT: vadd.i32 q0, q0, q3
2260; CHECK-NEXT: vadd.i32 q0, q0, q2
2261; CHECK-NEXT: vaddva.u32 r0, q0
2262; CHECK-NEXT: vpop {d8, d9}
2263; CHECK-NEXT: bx lr
2264entry:
2265 %xx = sext <16 x i8> %x to <16 x i32>
2266 %yy = sext <16 x i8> %y to <16 x i32>
2267 %m = mul <16 x i32> %xx, %yy
2268 %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
; Fold the scalar accumulator into the reduced sum.
2269 %r = add i32 %z, %a
2270 ret i32 %r
2271}
2272
; Test: zero-extend two <4 x i8> vectors to <4 x i32>, multiply them, sum the
; products with vector.reduce.add and add the i32 accumulator %a.
; The expected output stays fully vectorised: both inputs are masked with a
; #0xff splat (vand), multiplied with vmul.i32 and reduced into r0 with
; vaddva.u32.
2273define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, <4 x i8> %y, i32 %a) {
2274; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
2275; CHECK: @ %bb.0: @ %entry
2276; CHECK-NEXT: vmov.i32 q2, #0xff
2277; CHECK-NEXT: vand q1, q1, q2
2278; CHECK-NEXT: vand q0, q0, q2
2279; CHECK-NEXT: vmul.i32 q0, q0, q1
2280; CHECK-NEXT: vaddva.u32 r0, q0
2281; CHECK-NEXT: bx lr
2282entry:
2283 %xx = zext <4 x i8> %x to <4 x i32>
2284 %yy = zext <4 x i8> %y to <4 x i32>
2285 %m = mul <4 x i32> %xx, %yy
2286 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
; Fold the scalar accumulator into the reduced sum.
2287 %r = add i32 %z, %a
2288 ret i32 %r
2289}
2290
; Test: sign-extend two <4 x i8> vectors to <4 x i32>, multiply them, sum the
; products with vector.reduce.add and add the i32 accumulator %a.
; The expected output stays fully vectorised: each input is sign-extended in
; place with vmovlb.s8 followed by vmovlb.s16, the results are multiplied with
; vmul.i32 and reduced into r0 with vaddva.u32.
2291define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, <4 x i8> %y, i32 %a) {
2292; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
2293; CHECK: @ %bb.0: @ %entry
2294; CHECK-NEXT: vmovlb.s8 q1, q1
2295; CHECK-NEXT: vmovlb.s8 q0, q0
2296; CHECK-NEXT: vmovlb.s16 q1, q1
2297; CHECK-NEXT: vmovlb.s16 q0, q0
2298; CHECK-NEXT: vmul.i32 q0, q0, q1
2299; CHECK-NEXT: vaddva.u32 r0, q0
2300; CHECK-NEXT: bx lr
2301entry:
2302 %xx = sext <4 x i8> %x to <4 x i32>
2303 %yy = sext <4 x i8> %y to <4 x i32>
2304 %m = mul <4 x i32> %xx, %yy
2305 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
; Fold the scalar accumulator into the reduced sum.
2306 %r = add i32 %z, %a
2307 ret i32 %r
2308}
2309
; Test: zero-extend two <16 x i8> vectors to <16 x i16>, multiply them, sum the
; products with vector.reduce.add and add the i16 accumulator %a.
; The expected output handles the inputs as two halves of eight lanes (lanes
; 8-15 first, then 0-7): each half is rebuilt lane by lane (vmov.u8 extract,
; vmov.16 insert), widened with vmovlb.u8 and multiplied with vmul.i16. The two
; products are summed with vadd.i16, reduced into r0 with vaddva.u16 and
; followed by uxth, consistent with the zeroext i16 return.
2310define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %y, i16 %a) {
2311; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
2312; CHECK: @ %bb.0: @ %entry
2313; CHECK-NEXT: vmov.u8 r1, q1[8]
2314; CHECK-NEXT: vmov.16 q2[0], r1
2315; CHECK-NEXT: vmov.u8 r1, q1[9]
2316; CHECK-NEXT: vmov.16 q2[1], r1
2317; CHECK-NEXT: vmov.u8 r1, q1[10]
2318; CHECK-NEXT: vmov.16 q2[2], r1
2319; CHECK-NEXT: vmov.u8 r1, q1[11]
2320; CHECK-NEXT: vmov.16 q2[3], r1
2321; CHECK-NEXT: vmov.u8 r1, q1[12]
2322; CHECK-NEXT: vmov.16 q2[4], r1
2323; CHECK-NEXT: vmov.u8 r1, q1[13]
2324; CHECK-NEXT: vmov.16 q2[5], r1
2325; CHECK-NEXT: vmov.u8 r1, q1[14]
2326; CHECK-NEXT: vmov.16 q2[6], r1
2327; CHECK-NEXT: vmov.u8 r1, q1[15]
2328; CHECK-NEXT: vmov.16 q2[7], r1
2329; CHECK-NEXT: vmov.u8 r1, q0[8]
2330; CHECK-NEXT: vmov.16 q3[0], r1
2331; CHECK-NEXT: vmov.u8 r1, q0[9]
2332; CHECK-NEXT: vmov.16 q3[1], r1
2333; CHECK-NEXT: vmov.u8 r1, q0[10]
2334; CHECK-NEXT: vmov.16 q3[2], r1
2335; CHECK-NEXT: vmov.u8 r1, q0[11]
2336; CHECK-NEXT: vmov.16 q3[3], r1
2337; CHECK-NEXT: vmov.u8 r1, q0[12]
2338; CHECK-NEXT: vmov.16 q3[4], r1
2339; CHECK-NEXT: vmov.u8 r1, q0[13]
2340; CHECK-NEXT: vmov.16 q3[5], r1
2341; CHECK-NEXT: vmov.u8 r1, q0[14]
2342; CHECK-NEXT: vmov.16 q3[6], r1
2343; CHECK-NEXT: vmov.u8 r1, q0[15]
2344; CHECK-NEXT: vmov.16 q3[7], r1
2345; CHECK-NEXT: vmovlb.u8 q2, q2
2346; CHECK-NEXT: vmovlb.u8 q3, q3
2347; CHECK-NEXT: vmov.u8 r1, q1[0]
2348; CHECK-NEXT: vmul.i16 q2, q3, q2
2349; CHECK-NEXT: vmov.16 q3[0], r1
2350; CHECK-NEXT: vmov.u8 r1, q1[1]
2351; CHECK-NEXT: vmov.16 q3[1], r1
2352; CHECK-NEXT: vmov.u8 r1, q1[2]
2353; CHECK-NEXT: vmov.16 q3[2], r1
2354; CHECK-NEXT: vmov.u8 r1, q1[3]
2355; CHECK-NEXT: vmov.16 q3[3], r1
2356; CHECK-NEXT: vmov.u8 r1, q1[4]
2357; CHECK-NEXT: vmov.16 q3[4], r1
2358; CHECK-NEXT: vmov.u8 r1, q1[5]
2359; CHECK-NEXT: vmov.16 q3[5], r1
2360; CHECK-NEXT: vmov.u8 r1, q1[6]
2361; CHECK-NEXT: vmov.16 q3[6], r1
2362; CHECK-NEXT: vmov.u8 r1, q1[7]
2363; CHECK-NEXT: vmov.16 q3[7], r1
2364; CHECK-NEXT: vmov.u8 r1, q0[0]
2365; CHECK-NEXT: vmovlb.u8 q1, q3
2366; CHECK-NEXT: vmov.16 q3[0], r1
2367; CHECK-NEXT: vmov.u8 r1, q0[1]
2368; CHECK-NEXT: vmov.16 q3[1], r1
2369; CHECK-NEXT: vmov.u8 r1, q0[2]
2370; CHECK-NEXT: vmov.16 q3[2], r1
2371; CHECK-NEXT: vmov.u8 r1, q0[3]
2372; CHECK-NEXT: vmov.16 q3[3], r1
2373; CHECK-NEXT: vmov.u8 r1, q0[4]
2374; CHECK-NEXT: vmov.16 q3[4], r1
2375; CHECK-NEXT: vmov.u8 r1, q0[5]
2376; CHECK-NEXT: vmov.16 q3[5], r1
2377; CHECK-NEXT: vmov.u8 r1, q0[6]
2378; CHECK-NEXT: vmov.16 q3[6], r1
2379; CHECK-NEXT: vmov.u8 r1, q0[7]
2380; CHECK-NEXT: vmov.16 q3[7], r1
2381; CHECK-NEXT: vmovlb.u8 q0, q3
2382; CHECK-NEXT: vmul.i16 q0, q0, q1
2383; CHECK-NEXT: vadd.i16 q0, q0, q2
2384; CHECK-NEXT: vaddva.u16 r0, q0
2385; CHECK-NEXT: uxth r0, r0
2386; CHECK-NEXT: bx lr
2387entry:
2388 %xx = zext <16 x i8> %x to <16 x i16>
2389 %yy = zext <16 x i8> %y to <16 x i16>
2390 %m = mul <16 x i16> %xx, %yy
2391 %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
; Fold the scalar accumulator into the reduced sum.
2392 %r = add i16 %z, %a
2393 ret i16 %r
2394}
2395
; Test: sign-extend two <16 x i8> vectors to <16 x i16>, multiply them, sum the
; products with vector.reduce.add and add the i16 accumulator %a.
; The expected output handles the inputs as two halves of eight lanes (lanes
; 8-15 first, then 0-7): each half is rebuilt lane by lane (vmov.u8 extract,
; vmov.16 insert), widened with vmovlb.s8 and multiplied with vmul.i16. The two
; products are summed with vadd.i16, reduced into r0 with vaddva.u16 and
; followed by sxth, consistent with the signext i16 return.
2396define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %y, i16 %a) {
2397; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
2398; CHECK: @ %bb.0: @ %entry
2399; CHECK-NEXT: vmov.u8 r1, q1[8]
2400; CHECK-NEXT: vmov.16 q2[0], r1
2401; CHECK-NEXT: vmov.u8 r1, q1[9]
2402; CHECK-NEXT: vmov.16 q2[1], r1
2403; CHECK-NEXT: vmov.u8 r1, q1[10]
2404; CHECK-NEXT: vmov.16 q2[2], r1
2405; CHECK-NEXT: vmov.u8 r1, q1[11]
2406; CHECK-NEXT: vmov.16 q2[3], r1
2407; CHECK-NEXT: vmov.u8 r1, q1[12]
2408; CHECK-NEXT: vmov.16 q2[4], r1
2409; CHECK-NEXT: vmov.u8 r1, q1[13]
2410; CHECK-NEXT: vmov.16 q2[5], r1
2411; CHECK-NEXT: vmov.u8 r1, q1[14]
2412; CHECK-NEXT: vmov.16 q2[6], r1
2413; CHECK-NEXT: vmov.u8 r1, q1[15]
2414; CHECK-NEXT: vmov.16 q2[7], r1
2415; CHECK-NEXT: vmov.u8 r1, q0[8]
2416; CHECK-NEXT: vmov.16 q3[0], r1
2417; CHECK-NEXT: vmov.u8 r1, q0[9]
2418; CHECK-NEXT: vmov.16 q3[1], r1
2419; CHECK-NEXT: vmov.u8 r1, q0[10]
2420; CHECK-NEXT: vmov.16 q3[2], r1
2421; CHECK-NEXT: vmov.u8 r1, q0[11]
2422; CHECK-NEXT: vmov.16 q3[3], r1
2423; CHECK-NEXT: vmov.u8 r1, q0[12]
2424; CHECK-NEXT: vmov.16 q3[4], r1
2425; CHECK-NEXT: vmov.u8 r1, q0[13]
2426; CHECK-NEXT: vmov.16 q3[5], r1
2427; CHECK-NEXT: vmov.u8 r1, q0[14]
2428; CHECK-NEXT: vmov.16 q3[6], r1
2429; CHECK-NEXT: vmov.u8 r1, q0[15]
2430; CHECK-NEXT: vmov.16 q3[7], r1
2431; CHECK-NEXT: vmovlb.s8 q2, q2
2432; CHECK-NEXT: vmovlb.s8 q3, q3
2433; CHECK-NEXT: vmov.u8 r1, q1[0]
2434; CHECK-NEXT: vmul.i16 q2, q3, q2
2435; CHECK-NEXT: vmov.16 q3[0], r1
2436; CHECK-NEXT: vmov.u8 r1, q1[1]
2437; CHECK-NEXT: vmov.16 q3[1], r1
2438; CHECK-NEXT: vmov.u8 r1, q1[2]
2439; CHECK-NEXT: vmov.16 q3[2], r1
2440; CHECK-NEXT: vmov.u8 r1, q1[3]
2441; CHECK-NEXT: vmov.16 q3[3], r1
2442; CHECK-NEXT: vmov.u8 r1, q1[4]
2443; CHECK-NEXT: vmov.16 q3[4], r1
2444; CHECK-NEXT: vmov.u8 r1, q1[5]
2445; CHECK-NEXT: vmov.16 q3[5], r1
2446; CHECK-NEXT: vmov.u8 r1, q1[6]
2447; CHECK-NEXT: vmov.16 q3[6], r1
2448; CHECK-NEXT: vmov.u8 r1, q1[7]
2449; CHECK-NEXT: vmov.16 q3[7], r1
2450; CHECK-NEXT: vmov.u8 r1, q0[0]
2451; CHECK-NEXT: vmovlb.s8 q1, q3
2452; CHECK-NEXT: vmov.16 q3[0], r1
2453; CHECK-NEXT: vmov.u8 r1, q0[1]
2454; CHECK-NEXT: vmov.16 q3[1], r1
2455; CHECK-NEXT: vmov.u8 r1, q0[2]
2456; CHECK-NEXT: vmov.16 q3[2], r1
2457; CHECK-NEXT: vmov.u8 r1, q0[3]
2458; CHECK-NEXT: vmov.16 q3[3], r1
2459; CHECK-NEXT: vmov.u8 r1, q0[4]
2460; CHECK-NEXT: vmov.16 q3[4], r1
2461; CHECK-NEXT: vmov.u8 r1, q0[5]
2462; CHECK-NEXT: vmov.16 q3[5], r1
2463; CHECK-NEXT: vmov.u8 r1, q0[6]
2464; CHECK-NEXT: vmov.16 q3[6], r1
2465; CHECK-NEXT: vmov.u8 r1, q0[7]
2466; CHECK-NEXT: vmov.16 q3[7], r1
2467; CHECK-NEXT: vmovlb.s8 q0, q3
2468; CHECK-NEXT: vmul.i16 q0, q0, q1
2469; CHECK-NEXT: vadd.i16 q0, q0, q2
2470; CHECK-NEXT: vaddva.u16 r0, q0
2471; CHECK-NEXT: sxth r0, r0
2472; CHECK-NEXT: bx lr
2473entry:
2474 %xx = sext <16 x i8> %x to <16 x i16>
2475 %yy = sext <16 x i8> %y to <16 x i16>
2476 %m = mul <16 x i16> %xx, %yy
2477 %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
; Fold the scalar accumulator into the reduced sum.
2478 %r = add i16 %z, %a
2479 ret i16 %r
2480}
2481
; Test: zero-extend two <8 x i8> vectors to <8 x i16>, multiply them, sum the
; products with vector.reduce.add and add the i16 accumulator %a.
; The expected output stays fully vectorised: vmovlb.u8 widens each input,
; vmul.i16 multiplies, vaddva.u16 reduces into r0 and uxth follows, consistent
; with the zeroext i16 return.
2482define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
2483; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
2484; CHECK: @ %bb.0: @ %entry
2485; CHECK-NEXT: vmovlb.u8 q1, q1
2486; CHECK-NEXT: vmovlb.u8 q0, q0
2487; CHECK-NEXT: vmul.i16 q0, q0, q1
2488; CHECK-NEXT: vaddva.u16 r0, q0
2489; CHECK-NEXT: uxth r0, r0
2490; CHECK-NEXT: bx lr
2491entry:
2492 %xx = zext <8 x i8> %x to <8 x i16>
2493 %yy = zext <8 x i8> %y to <8 x i16>
2494 %m = mul <8 x i16> %xx, %yy
2495 %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
; Fold the scalar accumulator into the reduced sum.
2496 %r = add i16 %z, %a
2497 ret i16 %r
2498}
2499
; Test: sign-extend two <8 x i8> vectors to <8 x i16>, multiply them, sum the
; products with vector.reduce.add and add the i16 accumulator %a.
; The expected output stays fully vectorised: vmovlb.s8 widens each input,
; vmul.i16 multiplies, vaddva.u16 reduces into r0 and sxth follows, consistent
; with the signext i16 return.
2500define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
2501; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
2502; CHECK: @ %bb.0: @ %entry
2503; CHECK-NEXT: vmovlb.s8 q1, q1
2504; CHECK-NEXT: vmovlb.s8 q0, q0
2505; CHECK-NEXT: vmul.i16 q0, q0, q1
2506; CHECK-NEXT: vaddva.u16 r0, q0
2507; CHECK-NEXT: sxth r0, r0
2508; CHECK-NEXT: bx lr
2509entry:
2510 %xx = sext <8 x i8> %x to <8 x i16>
2511 %yy = sext <8 x i8> %y to <8 x i16>
2512 %m = mul <8 x i16> %xx, %yy
2513 %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
; Fold the scalar accumulator into the reduced sum.
2514 %r = add i16 %z, %a
2515 ret i16 %r
2516}
2517
; Test: multiply two <16 x i8> vectors with no widening, sum the products with
; vector.reduce.add and add the i8 accumulator %a.
; The expected output is a single vmul.i8 followed by vaddva.u8 into r0, then
; uxtb, consistent with the zeroext i8 return.
2518define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %y, i8 %a) {
2519; CHECK-LABEL: add_v16i8_v16i8_acc:
2520; CHECK: @ %bb.0: @ %entry
2521; CHECK-NEXT: vmul.i8 q0, q0, q1
2522; CHECK-NEXT: vaddva.u8 r0, q0
2523; CHECK-NEXT: uxtb r0, r0
2524; CHECK-NEXT: bx lr
2525entry:
2526 %m = mul <16 x i8> %x, %y
2527 %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %m)
; Fold the scalar accumulator into the reduced sum.
2528 %r = add i8 %z, %a
2529 ret i8 %r
2530}
2531
; Test: zero-extend two <16 x i8> vectors to <16 x i64>, multiply them, sum the
; products with vector.reduce.add and add the i64 accumulator %a.
; The expected output processes the lanes two at a time: each pair is pulled
; out with vmov.u8, placed in 32-bit lanes 0 and 2 of a q register and masked
; with the constant-pool vector .LCPI51_0 (0xff, 0, 0xff, 0) via vand. The
; pairs are multiplied with umull (umlal for the final pair, lanes 14 and 15)
; and the 64-bit partial sums are chained with adds/adcs before the result is
; added into r0:r1.
2532define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
2533; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
2534; CHECK: @ %bb.0: @ %entry
2535; CHECK-NEXT: .save {r4, r5, r7, lr}
2536; CHECK-NEXT: push {r4, r5, r7, lr}
2537; CHECK-NEXT: .vsave {d8, d9, d10, d11}
2538; CHECK-NEXT: vpush {d8, d9, d10, d11}
2539; CHECK-NEXT: vmov.u8 r2, q1[0]
2540; CHECK-NEXT: vmov.u8 r3, q0[0]
2541; CHECK-NEXT: vmov.32 q3[0], r2
2542; CHECK-NEXT: vmov.u8 r2, q1[1]
2543; CHECK-NEXT: vmov.32 q3[2], r2
2544; CHECK-NEXT: adr r2, .LCPI51_0
2545; CHECK-NEXT: vldrw.u32 q2, [r2]
2546; CHECK-NEXT: vmov.32 q4[0], r3
2547; CHECK-NEXT: vmov.u8 r3, q0[1]
2548; CHECK-NEXT: vmov.u8 r4, q0[2]
2549; CHECK-NEXT: vmov.32 q4[2], r3
2550; CHECK-NEXT: vand q3, q3, q2
2551; CHECK-NEXT: vand q4, q4, q2
2552; CHECK-NEXT: vmov r2, s14
2553; CHECK-NEXT: vmov r3, s18
2554; CHECK-NEXT: umull r12, lr, r3, r2
2555; CHECK-NEXT: vmov r3, s16
2556; CHECK-NEXT: vmov r2, s12
2557; CHECK-NEXT: vmov.32 q4[0], r4
2558; CHECK-NEXT: vmov.u8 r4, q0[3]
2559; CHECK-NEXT: vmov.32 q4[2], r4
2560; CHECK-NEXT: vand q4, q4, q2
2561; CHECK-NEXT: vmov r4, s16
2562; CHECK-NEXT: umull r2, r3, r3, r2
2563; CHECK-NEXT: orr.w lr, lr, r3
2564; CHECK-NEXT: vmov.u8 r3, q1[2]
2565; CHECK-NEXT: vmov.32 q3[0], r3
2566; CHECK-NEXT: vmov.u8 r3, q1[3]
2567; CHECK-NEXT: vmov.32 q3[2], r3
2568; CHECK-NEXT: add r2, r12
2569; CHECK-NEXT: vand q3, q3, q2
2570; CHECK-NEXT: vmov r3, s12
2571; CHECK-NEXT: umull r3, r4, r4, r3
2572; CHECK-NEXT: vmov.32 q5[0], r3
2573; CHECK-NEXT: vmov r3, s14
2574; CHECK-NEXT: vmov.32 q5[1], r4
2575; CHECK-NEXT: vmov r4, s18
2576; CHECK-NEXT: umull r3, r4, r4, r3
2577; CHECK-NEXT: vmov.32 q5[2], r3
2578; CHECK-NEXT: vmov.32 q5[3], r4
2579; CHECK-NEXT: vmov r3, s20
2580; CHECK-NEXT: vmov r5, s21
2581; CHECK-NEXT: adds r2, r2, r3
2582; CHECK-NEXT: adc.w r3, lr, r5
2583; CHECK-NEXT: vmov r5, s22
2584; CHECK-NEXT: adds.w r12, r2, r5
2585; CHECK-NEXT: vmov.u8 r5, q1[4]
2586; CHECK-NEXT: adcs r3, r4
2587; CHECK-NEXT: vmov.u8 r4, q0[4]
2588; CHECK-NEXT: vmov.32 q3[0], r5
2589; CHECK-NEXT: vmov.u8 r5, q1[5]
2590; CHECK-NEXT: vmov.32 q4[0], r4
2591; CHECK-NEXT: vmov.u8 r4, q0[5]
2592; CHECK-NEXT: vmov.32 q3[2], r5
2593; CHECK-NEXT: vmov.32 q4[2], r4
2594; CHECK-NEXT: vand q3, q3, q2
2595; CHECK-NEXT: vand q4, q4, q2
2596; CHECK-NEXT: vmov r5, s12
2597; CHECK-NEXT: vmov r4, s16
2598; CHECK-NEXT: umull r5, r4, r4, r5
2599; CHECK-NEXT: vmov.32 q5[0], r5
2600; CHECK-NEXT: vmov r5, s14
2601; CHECK-NEXT: vmov.32 q5[1], r4
2602; CHECK-NEXT: vmov r4, s18
2603; CHECK-NEXT: umull r5, r4, r4, r5
2604; CHECK-NEXT: vmov.32 q5[2], r5
2605; CHECK-NEXT: vmov.32 q5[3], r4
2606; CHECK-NEXT: vmov r2, s20
2607; CHECK-NEXT: vmov r5, s21
2608; CHECK-NEXT: adds.w r2, r2, r12
2609; CHECK-NEXT: adcs r3, r5
2610; CHECK-NEXT: vmov r5, s22
2611; CHECK-NEXT: adds.w r12, r2, r5
2612; CHECK-NEXT: vmov.u8 r5, q1[6]
2613; CHECK-NEXT: adcs r3, r4
2614; CHECK-NEXT: vmov.u8 r4, q0[6]
2615; CHECK-NEXT: vmov.32 q3[0], r5
2616; CHECK-NEXT: vmov.u8 r5, q1[7]
2617; CHECK-NEXT: vmov.32 q4[0], r4
2618; CHECK-NEXT: vmov.u8 r4, q0[7]
2619; CHECK-NEXT: vmov.32 q3[2], r5
2620; CHECK-NEXT: vmov.32 q4[2], r4
2621; CHECK-NEXT: vand q3, q3, q2
2622; CHECK-NEXT: vand q4, q4, q2
2623; CHECK-NEXT: vmov r5, s12
2624; CHECK-NEXT: vmov r4, s16
2625; CHECK-NEXT: umull r5, r4, r4, r5
2626; CHECK-NEXT: vmov.32 q5[0], r5
2627; CHECK-NEXT: vmov r5, s14
2628; CHECK-NEXT: vmov.32 q5[1], r4
2629; CHECK-NEXT: vmov r4, s18
2630; CHECK-NEXT: umull r5, r4, r4, r5
2631; CHECK-NEXT: vmov.32 q5[2], r5
2632; CHECK-NEXT: vmov.32 q5[3], r4
2633; CHECK-NEXT: vmov r2, s20
2634; CHECK-NEXT: vmov r5, s21
2635; CHECK-NEXT: adds.w r2, r2, r12
2636; CHECK-NEXT: adcs r3, r5
2637; CHECK-NEXT: vmov r5, s22
2638; CHECK-NEXT: adds.w r12, r2, r5
2639; CHECK-NEXT: vmov.u8 r5, q1[8]
2640; CHECK-NEXT: adcs r3, r4
2641; CHECK-NEXT: vmov.u8 r4, q0[8]
2642; CHECK-NEXT: vmov.32 q3[0], r5
2643; CHECK-NEXT: vmov.u8 r5, q1[9]
2644; CHECK-NEXT: vmov.32 q4[0], r4
2645; CHECK-NEXT: vmov.u8 r4, q0[9]
2646; CHECK-NEXT: vmov.32 q3[2], r5
2647; CHECK-NEXT: vmov.32 q4[2], r4
2648; CHECK-NEXT: vand q3, q3, q2
2649; CHECK-NEXT: vand q4, q4, q2
2650; CHECK-NEXT: vmov r5, s12
2651; CHECK-NEXT: vmov r4, s16
2652; CHECK-NEXT: umull r5, r4, r4, r5
2653; CHECK-NEXT: vmov.32 q5[0], r5
2654; CHECK-NEXT: vmov r5, s14
2655; CHECK-NEXT: vmov.32 q5[1], r4
2656; CHECK-NEXT: vmov r4, s18
2657; CHECK-NEXT: umull r5, r4, r4, r5
2658; CHECK-NEXT: vmov.32 q5[2], r5
2659; CHECK-NEXT: vmov.32 q5[3], r4
2660; CHECK-NEXT: vmov r2, s20
2661; CHECK-NEXT: vmov r5, s21
2662; CHECK-NEXT: adds.w r2, r2, r12
2663; CHECK-NEXT: adcs r3, r5
2664; CHECK-NEXT: vmov r5, s22
2665; CHECK-NEXT: adds.w r12, r2, r5
2666; CHECK-NEXT: vmov.u8 r5, q1[10]
2667; CHECK-NEXT: adcs r3, r4
2668; CHECK-NEXT: vmov.u8 r4, q0[10]
2669; CHECK-NEXT: vmov.32 q3[0], r5
2670; CHECK-NEXT: vmov.u8 r5, q1[11]
2671; CHECK-NEXT: vmov.32 q4[0], r4
2672; CHECK-NEXT: vmov.u8 r4, q0[11]
2673; CHECK-NEXT: vmov.32 q3[2], r5
2674; CHECK-NEXT: vmov.32 q4[2], r4
2675; CHECK-NEXT: vand q3, q3, q2
2676; CHECK-NEXT: vand q4, q4, q2
2677; CHECK-NEXT: vmov r5, s12
2678; CHECK-NEXT: vmov r4, s16
2679; CHECK-NEXT: umull r5, r4, r4, r5
2680; CHECK-NEXT: vmov.32 q5[0], r5
2681; CHECK-NEXT: vmov r5, s14
2682; CHECK-NEXT: vmov.32 q5[1], r4
2683; CHECK-NEXT: vmov r4, s18
2684; CHECK-NEXT: umull r5, r4, r4, r5
2685; CHECK-NEXT: vmov.32 q5[2], r5
2686; CHECK-NEXT: vmov.32 q5[3], r4
2687; CHECK-NEXT: vmov r2, s20
2688; CHECK-NEXT: vmov r5, s21
2689; CHECK-NEXT: adds.w r2, r2, r12
2690; CHECK-NEXT: adcs r3, r5
2691; CHECK-NEXT: vmov r5, s22
2692; CHECK-NEXT: adds.w r12, r2, r5
2693; CHECK-NEXT: vmov.u8 r5, q1[12]
2694; CHECK-NEXT: adcs r3, r4
2695; CHECK-NEXT: vmov.u8 r4, q0[12]
2696; CHECK-NEXT: vmov.32 q3[0], r5
2697; CHECK-NEXT: vmov.u8 r5, q1[13]
2698; CHECK-NEXT: vmov.32 q4[0], r4
2699; CHECK-NEXT: vmov.u8 r4, q0[13]
2700; CHECK-NEXT: vmov.32 q3[2], r5
2701; CHECK-NEXT: vmov.32 q4[2], r4
2702; CHECK-NEXT: vand q3, q3, q2
2703; CHECK-NEXT: vand q4, q4, q2
2704; CHECK-NEXT: vmov r5, s12
2705; CHECK-NEXT: vmov r4, s16
2706; CHECK-NEXT: umull r5, r4, r4, r5
2707; CHECK-NEXT: vmov.32 q5[0], r5
2708; CHECK-NEXT: vmov r5, s14
2709; CHECK-NEXT: vmov.32 q5[1], r4
2710; CHECK-NEXT: vmov r4, s18
2711; CHECK-NEXT: umull r5, r4, r4, r5
2712; CHECK-NEXT: vmov.32 q5[2], r5
2713; CHECK-NEXT: vmov.32 q5[3], r4
2714; CHECK-NEXT: vmov r2, s20
2715; CHECK-NEXT: vmov r5, s21
2716; CHECK-NEXT: adds.w r2, r2, r12
2717; CHECK-NEXT: adcs r3, r5
2718; CHECK-NEXT: vmov r5, s22
2719; CHECK-NEXT: adds r2, r2, r5
2720; CHECK-NEXT: vmov.u8 r5, q1[14]
2721; CHECK-NEXT: vmov.32 q3[0], r5
2722; CHECK-NEXT: vmov.u8 r5, q1[15]
2723; CHECK-NEXT: adcs r3, r4
2724; CHECK-NEXT: vmov.32 q3[2], r5
2725; CHECK-NEXT: vmov.u8 r4, q0[14]
2726; CHECK-NEXT: vand q1, q3, q2
2727; CHECK-NEXT: vmov.32 q3[0], r4
2728; CHECK-NEXT: vmov.u8 r4, q0[15]
2729; CHECK-NEXT: vmov.32 q3[2], r4
2730; CHECK-NEXT: vmov r5, s4
2731; CHECK-NEXT: vand q0, q3, q2
2732; CHECK-NEXT: vmov r4, s0
2733; CHECK-NEXT: umlal r2, r3, r4, r5
2734; CHECK-NEXT: vmov r5, s6
2735; CHECK-NEXT: vmov r4, s2
2736; CHECK-NEXT: umlal r2, r3, r4, r5
2737; CHECK-NEXT: adds r0, r0, r2
2738; CHECK-NEXT: adcs r1, r3
2739; CHECK-NEXT: vpop {d8, d9, d10, d11}
2740; CHECK-NEXT: pop {r4, r5, r7, pc}
2741; CHECK-NEXT: .p2align 4
2742; CHECK-NEXT: @ %bb.1:
2743; CHECK-NEXT: .LCPI51_0:
2744; CHECK-NEXT: .long 255 @ 0xff
2745; CHECK-NEXT: .long 0 @ 0x0
2746; CHECK-NEXT: .long 255 @ 0xff
2747; CHECK-NEXT: .long 0 @ 0x0
2748entry:
2749 %xx = zext <16 x i8> %x to <16 x i64>
2750 %yy = zext <16 x i8> %y to <16 x i64>
2751 %m = mul <16 x i64> %xx, %yy
2752 %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
; Fold the scalar accumulator into the reduced sum.
2753 %r = add i64 %z, %a
2754 ret i64 %r
2755}
2756
; add_v16i8_v16i64_acc_sext: sign-extends both <16 x i8> operands to
; <16 x i64>, multiplies them lane-wise, sums all lanes with the
; vector.reduce.add intrinsic and adds the i64 accumulator %a.
; The FileCheck assertions below are autogenerated (see the NOTE at the top of
; the file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand. They record a fully scalarised lowering: every byte
; lane is extracted with vmov.u8, sign-extended with sxtb and multiplied with
; smull (smlal for the last two lanes), while the running 64-bit sum is carried
; in lr/r12 and finally folded into r0/r1 with adds/adcs.
2757define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
2758; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
2759; CHECK: @ %bb.0: @ %entry
2760; CHECK-NEXT: .save {r4, lr}
2761; CHECK-NEXT: push {r4, lr}
2762; CHECK-NEXT: vmov.u8 r2, q1[0]
2763; CHECK-NEXT: vmov.u8 r3, q0[0]
2764; CHECK-NEXT: sxtb r2, r2
2765; CHECK-NEXT: sxtb r3, r3
2766; CHECK-NEXT: smull r2, r3, r3, r2
2767; CHECK-NEXT: vmov.32 q2[0], r2
2768; CHECK-NEXT: vmov.u8 r2, q1[1]
2769; CHECK-NEXT: vmov.32 q2[1], r3
2770; CHECK-NEXT: vmov.u8 r3, q0[1]
2771; CHECK-NEXT: sxtb r2, r2
2772; CHECK-NEXT: sxtb r3, r3
2773; CHECK-NEXT: smull r2, r3, r3, r2
2774; CHECK-NEXT: vmov.32 q2[2], r2
2775; CHECK-NEXT: vmov.32 q2[3], r3
2776; CHECK-NEXT: vmov lr, s10
2777; CHECK-NEXT: vmov r2, s8
2778; CHECK-NEXT: vmov r12, s9
2779; CHECK-NEXT: adds.w lr, lr, r2
2780; CHECK-NEXT: vmov.u8 r2, q1[2]
2781; CHECK-NEXT: adc.w r12, r12, r3
2782; CHECK-NEXT: vmov.u8 r3, q0[2]
2783; CHECK-NEXT: sxtb r2, r2
2784; CHECK-NEXT: sxtb r3, r3
2785; CHECK-NEXT: smull r2, r3, r3, r2
2786; CHECK-NEXT: vmov.32 q2[0], r2
2787; CHECK-NEXT: vmov.u8 r2, q1[3]
2788; CHECK-NEXT: vmov.32 q2[1], r3
2789; CHECK-NEXT: vmov.u8 r3, q0[3]
2790; CHECK-NEXT: sxtb r2, r2
2791; CHECK-NEXT: sxtb r3, r3
2792; CHECK-NEXT: smull r2, r3, r3, r2
2793; CHECK-NEXT: vmov.32 q2[2], r2
2794; CHECK-NEXT: vmov.32 q2[3], r3
2795; CHECK-NEXT: vmov r4, s8
2796; CHECK-NEXT: vmov r2, s9
2797; CHECK-NEXT: adds.w r4, r4, lr
2798; CHECK-NEXT: adc.w r12, r12, r2
2799; CHECK-NEXT: vmov r2, s10
2800; CHECK-NEXT: adds.w lr, r4, r2
2801; CHECK-NEXT: vmov.u8 r4, q1[4]
2802; CHECK-NEXT: vmov.u8 r2, q0[4]
2803; CHECK-NEXT: sxtb r4, r4
2804; CHECK-NEXT: sxtb r2, r2
2805; CHECK-NEXT: adc.w r12, r12, r3
2806; CHECK-NEXT: smull r2, r4, r2, r4
2807; CHECK-NEXT: vmov.32 q2[0], r2
2808; CHECK-NEXT: vmov.u8 r2, q1[5]
2809; CHECK-NEXT: vmov.32 q2[1], r4
2810; CHECK-NEXT: vmov.u8 r4, q0[5]
2811; CHECK-NEXT: sxtb r2, r2
2812; CHECK-NEXT: sxtb r4, r4
2813; CHECK-NEXT: smull r2, r4, r4, r2
2814; CHECK-NEXT: vmov.32 q2[2], r2
2815; CHECK-NEXT: vmov.32 q2[3], r4
2816; CHECK-NEXT: vmov r3, s8
2817; CHECK-NEXT: vmov r2, s9
2818; CHECK-NEXT: adds.w r3, r3, lr
2819; CHECK-NEXT: adc.w r12, r12, r2
2820; CHECK-NEXT: vmov r2, s10
2821; CHECK-NEXT: adds.w lr, r3, r2
2822; CHECK-NEXT: vmov.u8 r2, q0[6]
2823; CHECK-NEXT: adc.w r12, r12, r4
2824; CHECK-NEXT: vmov.u8 r4, q1[6]
2825; CHECK-NEXT: sxtb r4, r4
2826; CHECK-NEXT: sxtb r2, r2
2827; CHECK-NEXT: smull r2, r4, r2, r4
2828; CHECK-NEXT: vmov.32 q2[0], r2
2829; CHECK-NEXT: vmov.u8 r2, q1[7]
2830; CHECK-NEXT: vmov.32 q2[1], r4
2831; CHECK-NEXT: vmov.u8 r4, q0[7]
2832; CHECK-NEXT: sxtb r2, r2
2833; CHECK-NEXT: sxtb r4, r4
2834; CHECK-NEXT: smull r2, r4, r4, r2
2835; CHECK-NEXT: vmov.32 q2[2], r2
2836; CHECK-NEXT: vmov.32 q2[3], r4
2837; CHECK-NEXT: vmov r3, s8
2838; CHECK-NEXT: vmov r2, s9
2839; CHECK-NEXT: adds.w r3, r3, lr
2840; CHECK-NEXT: adc.w r12, r12, r2
2841; CHECK-NEXT: vmov r2, s10
2842; CHECK-NEXT: adds.w lr, r3, r2
2843; CHECK-NEXT: vmov.u8 r2, q0[8]
2844; CHECK-NEXT: adc.w r12, r12, r4
2845; CHECK-NEXT: vmov.u8 r4, q1[8]
2846; CHECK-NEXT: sxtb r4, r4
2847; CHECK-NEXT: sxtb r2, r2
2848; CHECK-NEXT: smull r2, r4, r2, r4
2849; CHECK-NEXT: vmov.32 q2[0], r2
2850; CHECK-NEXT: vmov.u8 r2, q1[9]
2851; CHECK-NEXT: vmov.32 q2[1], r4
2852; CHECK-NEXT: vmov.u8 r4, q0[9]
2853; CHECK-NEXT: sxtb r2, r2
2854; CHECK-NEXT: sxtb r4, r4
2855; CHECK-NEXT: smull r2, r4, r4, r2
2856; CHECK-NEXT: vmov.32 q2[2], r2
2857; CHECK-NEXT: vmov.32 q2[3], r4
2858; CHECK-NEXT: vmov r3, s8
2859; CHECK-NEXT: vmov r2, s9
2860; CHECK-NEXT: adds.w r3, r3, lr
2861; CHECK-NEXT: adc.w r12, r12, r2
2862; CHECK-NEXT: vmov r2, s10
2863; CHECK-NEXT: adds.w lr, r3, r2
2864; CHECK-NEXT: vmov.u8 r2, q0[10]
2865; CHECK-NEXT: adc.w r12, r12, r4
2866; CHECK-NEXT: vmov.u8 r4, q1[10]
2867; CHECK-NEXT: sxtb r4, r4
2868; CHECK-NEXT: sxtb r2, r2
2869; CHECK-NEXT: smull r2, r4, r2, r4
2870; CHECK-NEXT: vmov.32 q2[0], r2
2871; CHECK-NEXT: vmov.u8 r2, q1[11]
2872; CHECK-NEXT: vmov.32 q2[1], r4
2873; CHECK-NEXT: vmov.u8 r4, q0[11]
2874; CHECK-NEXT: sxtb r2, r2
2875; CHECK-NEXT: sxtb r4, r4
2876; CHECK-NEXT: smull r2, r4, r4, r2
2877; CHECK-NEXT: vmov.32 q2[2], r2
2878; CHECK-NEXT: vmov.32 q2[3], r4
2879; CHECK-NEXT: vmov r3, s8
2880; CHECK-NEXT: vmov r2, s9
2881; CHECK-NEXT: adds.w r3, r3, lr
2882; CHECK-NEXT: adc.w r12, r12, r2
2883; CHECK-NEXT: vmov r2, s10
2884; CHECK-NEXT: adds.w lr, r3, r2
2885; CHECK-NEXT: vmov.u8 r2, q0[12]
2886; CHECK-NEXT: adc.w r12, r12, r4
2887; CHECK-NEXT: vmov.u8 r4, q1[12]
2888; CHECK-NEXT: sxtb r4, r4
2889; CHECK-NEXT: sxtb r2, r2
2890; CHECK-NEXT: smull r2, r4, r2, r4
2891; CHECK-NEXT: vmov.32 q2[0], r2
2892; CHECK-NEXT: vmov.u8 r2, q1[13]
2893; CHECK-NEXT: vmov.32 q2[1], r4
2894; CHECK-NEXT: vmov.u8 r4, q0[13]
2895; CHECK-NEXT: sxtb r2, r2
2896; CHECK-NEXT: sxtb r4, r4
2897; CHECK-NEXT: smull r2, r4, r4, r2
2898; CHECK-NEXT: vmov.32 q2[2], r2
2899; CHECK-NEXT: vmov.32 q2[3], r4
2900; CHECK-NEXT: vmov r3, s8
2901; CHECK-NEXT: vmov r2, s9
2902; CHECK-NEXT: adds.w r3, r3, lr
2903; CHECK-NEXT: adc.w r12, r12, r2
2904; CHECK-NEXT: vmov r2, s10
2905; CHECK-NEXT: adds r2, r2, r3
2906; CHECK-NEXT: adc.w r3, r12, r4
2907; CHECK-NEXT: vmov.u8 r4, q1[14]
2908; CHECK-NEXT: sxtb.w r12, r4
2909; CHECK-NEXT: vmov.u8 r4, q0[14]
2910; CHECK-NEXT: sxtb r4, r4
2911; CHECK-NEXT: smlal r2, r3, r4, r12
2912; CHECK-NEXT: vmov.u8 r4, q1[15]
2913; CHECK-NEXT: sxtb.w r12, r4
2914; CHECK-NEXT: vmov.u8 r4, q0[15]
2915; CHECK-NEXT: sxtb r4, r4
2916; CHECK-NEXT: smlal r2, r3, r4, r12
2917; CHECK-NEXT: adds r0, r0, r2
2918; CHECK-NEXT: adcs r1, r3
2919; CHECK-NEXT: pop {r4, pc}
2920entry:
2921 %xx = sext <16 x i8> %x to <16 x i64>
2922 %yy = sext <16 x i8> %y to <16 x i64>
2923 %m = mul <16 x i64> %xx, %yy
2924 %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
2925 %r = add i64 %z, %a
2926 ret i64 %r
2927}
2928
; add_v2i8_v2i64_acc_zext: zero-extends both <2 x i8> operands to <2 x i64>,
; multiplies them lane-wise, sums the two lanes with the vector.reduce.add
; intrinsic and adds the i64 accumulator %a.
; The autogenerated FileCheck assertions below expect both inputs to be masked
; with the constant-pool vector {0xff, 0, 0xff, 0} (.LCPI53_0), one umull per
; lane, and a final adds/adcs into r0/r1.
; NOTE(review): the two products are merged with a plain `add` on the low
; words and `orr.w` on the high words rather than adds/adc — presumably valid
; because a product of two 8-bit values cannot reach the high word; confirm
; before treating this as a pattern to preserve.
2929define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, i64 %a) {
2930; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
2931; CHECK: @ %bb.0: @ %entry
2932; CHECK-NEXT: .save {r7, lr}
2933; CHECK-NEXT: push {r7, lr}
2934; CHECK-NEXT: adr r2, .LCPI53_0
2935; CHECK-NEXT: vldrw.u32 q2, [r2]
2936; CHECK-NEXT: vand q1, q1, q2
2937; CHECK-NEXT: vand q0, q0, q2
2938; CHECK-NEXT: vmov r2, s6
2939; CHECK-NEXT: vmov r3, s2
2940; CHECK-NEXT: umull r12, lr, r3, r2
2941; CHECK-NEXT: vmov r2, s4
2942; CHECK-NEXT: vmov r3, s0
2943; CHECK-NEXT: umull r2, r3, r3, r2
2944; CHECK-NEXT: add r2, r12
2945; CHECK-NEXT: orr.w r3, r3, lr
2946; CHECK-NEXT: adds r0, r0, r2
2947; CHECK-NEXT: adcs r1, r3
2948; CHECK-NEXT: pop {r7, pc}
2949; CHECK-NEXT: .p2align 4
2950; CHECK-NEXT: @ %bb.1:
2951; CHECK-NEXT: .LCPI53_0:
2952; CHECK-NEXT: .long 255 @ 0xff
2953; CHECK-NEXT: .long 0 @ 0x0
2954; CHECK-NEXT: .long 255 @ 0xff
2955; CHECK-NEXT: .long 0 @ 0x0
2956entry:
2957 %xx = zext <2 x i8> %x to <2 x i64>
2958 %yy = zext <2 x i8> %y to <2 x i64>
2959 %m = mul <2 x i64> %xx, %yy
2960 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
2961 %r = add i64 %z, %a
2962 ret i64 %r
2963}
2964
; add_v2i8_v2i64_acc_sext: sign-extends both <2 x i8> operands to <2 x i64>,
; multiplies them lane-wise, sums the two lanes with the vector.reduce.add
; intrinsic and adds the i64 accumulator %a.
; The autogenerated FileCheck assertions below expect the lanes to be read
; from s4/s0 and s6/s2, sign-extended with sxtb, multiplied with smull and
; then accumulated with smlal, before adds/adc.w folds the sum into r0/r1.
2965define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, i64 %a) {
2966; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
2967; CHECK: @ %bb.0: @ %entry
2968; CHECK-NEXT: .save {r7, lr}
2969; CHECK-NEXT: push {r7, lr}
2970; CHECK-NEXT: vmov r2, s4
2971; CHECK-NEXT: vmov r3, s0
2972; CHECK-NEXT: sxtb r2, r2
2973; CHECK-NEXT: sxtb r3, r3
2974; CHECK-NEXT: smull r2, r12, r3, r2
2975; CHECK-NEXT: vmov r3, s6
2976; CHECK-NEXT: sxtb.w lr, r3
2977; CHECK-NEXT: vmov r3, s2
2978; CHECK-NEXT: sxtb r3, r3
2979; CHECK-NEXT: smlal r2, r12, r3, lr
2980; CHECK-NEXT: adds r0, r0, r2
2981; CHECK-NEXT: adc.w r1, r1, r12
2982; CHECK-NEXT: pop {r7, pc}
2983entry:
2984 %xx = sext <2 x i8> %x to <2 x i64>
2985 %yy = sext <2 x i8> %y to <2 x i64>
2986 %m = mul <2 x i64> %xx, %yy
2987 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
2988 %r = add i64 %z, %a
2989 ret i64 %r
2990}
2991
; add_v2i64_v2i64_acc: multiplies two <2 x i64> vectors lane-wise (no
; extension step), sums the two lanes with the vector.reduce.add intrinsic and
; adds the i64 accumulator %a.
; The autogenerated FileCheck assertions below expect each 64-bit product to
; be assembled from one umull plus two mla instructions on the 32-bit halves,
; the two products to be summed with adds.w/adcs, and the result folded into
; r0/r1 with a final adds/adcs.
2992define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, i64 %a) {
2993; CHECK-LABEL: add_v2i64_v2i64_acc:
2994; CHECK: @ %bb.0: @ %entry
2995; CHECK-NEXT: .save {r4, r5, r6, lr}
2996; CHECK-NEXT: push {r4, r5, r6, lr}
2997; CHECK-NEXT: vmov r2, s4
2998; CHECK-NEXT: vmov r3, s0
2999; CHECK-NEXT: vmov r4, s5
3000; CHECK-NEXT: vmov r6, s7
3001; CHECK-NEXT: umull r12, lr, r3, r2
3002; CHECK-NEXT: mla r3, r3, r4, lr
3003; CHECK-NEXT: vmov r4, s1
3004; CHECK-NEXT: vmov.32 q2[0], r12
3005; CHECK-NEXT: mla r2, r4, r2, r3
3006; CHECK-NEXT: vmov r4, s6
3007; CHECK-NEXT: vmov r3, s2
3008; CHECK-NEXT: vmov.32 q2[1], r2
3009; CHECK-NEXT: vmov r12, s8
3010; CHECK-NEXT: umull lr, r5, r3, r4
3011; CHECK-NEXT: mla r3, r3, r6, r5
3012; CHECK-NEXT: vmov r5, s3
3013; CHECK-NEXT: adds.w r6, r12, lr
3014; CHECK-NEXT: mla r3, r5, r4, r3
3015; CHECK-NEXT: adcs r2, r3
3016; CHECK-NEXT: adds r0, r0, r6
3017; CHECK-NEXT: adcs r1, r2
3018; CHECK-NEXT: pop {r4, r5, r6, pc}
3019entry:
3020 %m = mul <2 x i64> %x, %y
3021 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
3022 %r = add i64 %z, %a
3023 ret i64 %r
3024}
3025
; Declarations of the llvm.experimental.vector.reduce.add intrinsic overloads,
; one per vector type (the suffix names the operand type; the result is the
; element type). The tests above call at least the v4i32, v2i64 and v16i64
; forms; the remaining overloads are presumably used by tests elsewhere in
; this file — verify before removing any of them.
3026declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
3027declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
3028declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
3029declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
3030declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
3031declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
3032declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
3033declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
3034declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
3035declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)