1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
3
4define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
5; CHECK-LABEL: add_v4i32_v4i32:
6; CHECK: @ %bb.0: @ %entry
7; CHECK-NEXT: vmlav.u32 r0, q0, q1
8; CHECK-NEXT: bx lr
9entry:
10 %m = mul <4 x i32> %x, %y
11 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
12 ret i32 %z
13}
14
15define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
16; CHECK-LABEL: add_v4i32_v4i64_zext:
17; CHECK: @ %bb.0: @ %entry
18; CHECK-NEXT: vmlalv.u32 r0, r1, q0, q1
19; CHECK-NEXT: bx lr
20entry:
21 %xx = zext <4 x i32> %x to <4 x i64>
22 %yy = zext <4 x i32> %y to <4 x i64>
23 %m = mul <4 x i64> %xx, %yy
24 %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
25 ret i64 %z
26}
27
28define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
29; CHECK-LABEL: add_v4i32_v4i64_sext:
30; CHECK: @ %bb.0: @ %entry
31; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1
32; CHECK-NEXT: bx lr
33entry:
34 %xx = sext <4 x i32> %x to <4 x i64>
35 %yy = sext <4 x i32> %y to <4 x i64>
36 %m = mul <4 x i64> %xx, %yy
37 %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
38 ret i64 %z
39}
40
41define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
42; CHECK-LABEL: add_v2i32_v2i64_zext:
43; CHECK: @ %bb.0: @ %entry
44; CHECK-NEXT: vmov r0, s4
45; CHECK-NEXT: vmov r1, s0
46; CHECK-NEXT: vmov r2, s6
47; CHECK-NEXT: vmov r3, s2
48; CHECK-NEXT: umull r0, r1, r1, r0
49; CHECK-NEXT: umlal r0, r1, r3, r2
50; CHECK-NEXT: bx lr
51entry:
52 %xx = zext <2 x i32> %x to <2 x i64>
53 %yy = zext <2 x i32> %y to <2 x i64>
54 %m = mul <2 x i64> %xx, %yy
55 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
56 ret i64 %z
57}
58
59define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
60; CHECK-LABEL: add_v2i32_v2i64_sext:
61; CHECK: @ %bb.0: @ %entry
62; CHECK-NEXT: vmov r0, s4
63; CHECK-NEXT: vmov r1, s0
64; CHECK-NEXT: vmov r2, s6
65; CHECK-NEXT: vmov r3, s2
66; CHECK-NEXT: smull r0, r1, r1, r0
67; CHECK-NEXT: smlal r0, r1, r3, r2
68; CHECK-NEXT: bx lr
69entry:
70 %xx = sext <2 x i32> %x to <2 x i64>
71 %yy = sext <2 x i32> %y to <2 x i64>
72 %m = mul <2 x i64> %xx, %yy
73 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
74 ret i64 %z
75}
76
77define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
78; CHECK-LABEL: add_v8i16_v8i32_zext:
79; CHECK: @ %bb.0: @ %entry
80; CHECK-NEXT: vmlav.u16 r0, q0, q1
81; CHECK-NEXT: bx lr
82entry:
83 %xx = zext <8 x i16> %x to <8 x i32>
84 %yy = zext <8 x i16> %y to <8 x i32>
85 %m = mul <8 x i32> %xx, %yy
86 %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
87 ret i32 %z
88}
89
90define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
91; CHECK-LABEL: add_v8i16_v8i32_sext:
92; CHECK: @ %bb.0: @ %entry
93; CHECK-NEXT: vmlav.s16 r0, q0, q1
94; CHECK-NEXT: bx lr
95entry:
96 %xx = sext <8 x i16> %x to <8 x i32>
97 %yy = sext <8 x i16> %y to <8 x i32>
98 %m = mul <8 x i32> %xx, %yy
99 %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
100 ret i32 %z
101}
102
103define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
104; CHECK-LABEL: add_v4i16_v4i32_zext:
105; CHECK: @ %bb.0: @ %entry
106; CHECK-NEXT: vmovlb.u16 q1, q1
107; CHECK-NEXT: vmovlb.u16 q0, q0
108; CHECK-NEXT: vmlav.u32 r0, q0, q1
109; CHECK-NEXT: bx lr
110entry:
111 %xx = zext <4 x i16> %x to <4 x i32>
112 %yy = zext <4 x i16> %y to <4 x i32>
113 %m = mul <4 x i32> %xx, %yy
114 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
115 ret i32 %z
116}
117
118define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
119; CHECK-LABEL: add_v4i16_v4i32_sext:
120; CHECK: @ %bb.0: @ %entry
121; CHECK-NEXT: vmovlb.s16 q1, q1
122; CHECK-NEXT: vmovlb.s16 q0, q0
123; CHECK-NEXT: vmlav.u32 r0, q0, q1
124; CHECK-NEXT: bx lr
125entry:
126 %xx = sext <4 x i16> %x to <4 x i32>
127 %yy = sext <4 x i16> %y to <4 x i32>
128 %m = mul <4 x i32> %xx, %yy
129 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
130 ret i32 %z
131}
132
133define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
134; CHECK-LABEL: add_v8i16_v8i16:
135; CHECK: @ %bb.0: @ %entry
136; CHECK-NEXT: vmul.i16 q0, q0, q1
137; CHECK-NEXT: vaddv.u16 r0, q0
138; CHECK-NEXT: uxth r0, r0
139; CHECK-NEXT: bx lr
140entry:
141 %m = mul <8 x i16> %x, %y
142 %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
143 ret i16 %z
144}
145
146define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
147; CHECK-LABEL: add_v8i16_v8i64_zext:
148; CHECK: @ %bb.0: @ %entry
149; CHECK-NEXT: vmlalv.u16 r0, r1, q0, q1
150; CHECK-NEXT: bx lr
151entry:
152 %xx = zext <8 x i16> %x to <8 x i64>
153 %yy = zext <8 x i16> %y to <8 x i64>
154 %m = mul <8 x i64> %xx, %yy
155 %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
156 ret i64 %z
157}
158
159define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
160; CHECK-LABEL: add_v8i16_v8i64_sext:
161; CHECK: @ %bb.0: @ %entry
162; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q1
163; CHECK-NEXT: bx lr
164entry:
165 %xx = sext <8 x i16> %x to <8 x i64>
166 %yy = sext <8 x i16> %y to <8 x i64>
167 %m = mul <8 x i64> %xx, %yy
168 %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
169 ret i64 %z
170}
171
172define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
173; CHECK-LABEL: add_v2i16_v2i64_zext:
174; CHECK: @ %bb.0: @ %entry
175; CHECK-NEXT: adr r0, .LCPI12_0
176; CHECK-NEXT: vldrw.u32 q2, [r0]
177; CHECK-NEXT: vand q1, q1, q2
178; CHECK-NEXT: vand q0, q0, q2
179; CHECK-NEXT: vmov r0, s4
180; CHECK-NEXT: vmov r1, s0
181; CHECK-NEXT: vmov r2, s6
182; CHECK-NEXT: vmov r3, s2
183; CHECK-NEXT: umull r0, r1, r1, r0
184; CHECK-NEXT: umlal r0, r1, r3, r2
185; CHECK-NEXT: bx lr
186; CHECK-NEXT: .p2align 4
187; CHECK-NEXT: @ %bb.1:
188; CHECK-NEXT: .LCPI12_0:
189; CHECK-NEXT: .long 65535 @ 0xffff
190; CHECK-NEXT: .long 0 @ 0x0
191; CHECK-NEXT: .long 65535 @ 0xffff
192; CHECK-NEXT: .long 0 @ 0x0
193entry:
194 %xx = zext <2 x i16> %x to <2 x i64>
195 %yy = zext <2 x i16> %y to <2 x i64>
196 %m = mul <2 x i64> %xx, %yy
197 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
198 ret i64 %z
199}
200
201define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
202; CHECK-LABEL: add_v2i16_v2i64_sext:
203; CHECK: @ %bb.0: @ %entry
204; CHECK-NEXT: vmov r0, s4
205; CHECK-NEXT: vmov r1, s0
206; CHECK-NEXT: vmov r2, s6
207; CHECK-NEXT: vmov r3, s2
208; CHECK-NEXT: sxth r0, r0
209; CHECK-NEXT: sxth r1, r1
210; CHECK-NEXT: smull r0, r1, r1, r0
211; CHECK-NEXT: sxth r2, r2
212; CHECK-NEXT: sxth r3, r3
213; CHECK-NEXT: smlal r0, r1, r3, r2
214; CHECK-NEXT: bx lr
215entry:
216 %xx = sext <2 x i16> %x to <2 x i64>
217 %yy = sext <2 x i16> %y to <2 x i64>
218 %m = mul <2 x i64> %xx, %yy
219 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
220 ret i64 %z
221}
222
223define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
224; CHECK-LABEL: add_v16i8_v16i32_zext:
225; CHECK: @ %bb.0: @ %entry
226; CHECK-NEXT: vmlav.u8 r0, q0, q1
227; CHECK-NEXT: bx lr
228entry:
229 %xx = zext <16 x i8> %x to <16 x i32>
230 %yy = zext <16 x i8> %y to <16 x i32>
231 %m = mul <16 x i32> %xx, %yy
232 %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
233 ret i32 %z
234}
235
236define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
237; CHECK-LABEL: add_v16i8_v16i32_sext:
238; CHECK: @ %bb.0: @ %entry
239; CHECK-NEXT: vmlav.s8 r0, q0, q1
240; CHECK-NEXT: bx lr
241entry:
242 %xx = sext <16 x i8> %x to <16 x i32>
243 %yy = sext <16 x i8> %y to <16 x i32>
244 %m = mul <16 x i32> %xx, %yy
245 %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
246 ret i32 %z
247}
248
249define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
250; CHECK-LABEL: add_v4i8_v4i32_zext:
251; CHECK: @ %bb.0: @ %entry
252; CHECK-NEXT: vmov.i32 q2, #0xff
253; CHECK-NEXT: vand q1, q1, q2
254; CHECK-NEXT: vand q0, q0, q2
255; CHECK-NEXT: vmlav.u32 r0, q0, q1
256; CHECK-NEXT: bx lr
257entry:
258 %xx = zext <4 x i8> %x to <4 x i32>
259 %yy = zext <4 x i8> %y to <4 x i32>
260 %m = mul <4 x i32> %xx, %yy
261 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
262 ret i32 %z
263}
264
265define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
266; CHECK-LABEL: add_v4i8_v4i32_sext:
267; CHECK: @ %bb.0: @ %entry
268; CHECK-NEXT: vmovlb.s8 q1, q1
269; CHECK-NEXT: vmovlb.s8 q0, q0
270; CHECK-NEXT: vmovlb.s16 q1, q1
271; CHECK-NEXT: vmovlb.s16 q0, q0
272; CHECK-NEXT: vmlav.u32 r0, q0, q1
273; CHECK-NEXT: bx lr
274entry:
275 %xx = sext <4 x i8> %x to <4 x i32>
276 %yy = sext <4 x i8> %y to <4 x i32>
277 %m = mul <4 x i32> %xx, %yy
278 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
279 ret i32 %z
280}
281
282define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
283; CHECK-LABEL: add_v16i8_v16i16_zext:
284; CHECK: @ %bb.0: @ %entry
285; CHECK-NEXT: vmov.u8 r0, q1[8]
286; CHECK-NEXT: vmov.16 q2[0], r0
287; CHECK-NEXT: vmov.u8 r0, q1[9]
288; CHECK-NEXT: vmov.16 q2[1], r0
289; CHECK-NEXT: vmov.u8 r0, q1[10]
290; CHECK-NEXT: vmov.16 q2[2], r0
291; CHECK-NEXT: vmov.u8 r0, q1[11]
292; CHECK-NEXT: vmov.16 q2[3], r0
293; CHECK-NEXT: vmov.u8 r0, q1[12]
294; CHECK-NEXT: vmov.16 q2[4], r0
295; CHECK-NEXT: vmov.u8 r0, q1[13]
296; CHECK-NEXT: vmov.16 q2[5], r0
297; CHECK-NEXT: vmov.u8 r0, q1[14]
298; CHECK-NEXT: vmov.16 q2[6], r0
299; CHECK-NEXT: vmov.u8 r0, q1[15]
300; CHECK-NEXT: vmov.16 q2[7], r0
301; CHECK-NEXT: vmov.u8 r0, q0[8]
302; CHECK-NEXT: vmov.16 q3[0], r0
303; CHECK-NEXT: vmov.u8 r0, q0[9]
304; CHECK-NEXT: vmov.16 q3[1], r0
305; CHECK-NEXT: vmov.u8 r0, q0[10]
306; CHECK-NEXT: vmov.16 q3[2], r0
307; CHECK-NEXT: vmov.u8 r0, q0[11]
308; CHECK-NEXT: vmov.16 q3[3], r0
309; CHECK-NEXT: vmov.u8 r0, q0[12]
310; CHECK-NEXT: vmov.16 q3[4], r0
311; CHECK-NEXT: vmov.u8 r0, q0[13]
312; CHECK-NEXT: vmov.16 q3[5], r0
313; CHECK-NEXT: vmov.u8 r0, q0[14]
314; CHECK-NEXT: vmov.16 q3[6], r0
315; CHECK-NEXT: vmov.u8 r0, q0[15]
316; CHECK-NEXT: vmov.16 q3[7], r0
317; CHECK-NEXT: vmovlb.u8 q2, q2
318; CHECK-NEXT: vmovlb.u8 q3, q3
319; CHECK-NEXT: vmov.u8 r0, q1[0]
320; CHECK-NEXT: vmul.i16 q2, q3, q2
321; CHECK-NEXT: vmov.16 q3[0], r0
322; CHECK-NEXT: vmov.u8 r0, q1[1]
323; CHECK-NEXT: vmov.16 q3[1], r0
324; CHECK-NEXT: vmov.u8 r0, q1[2]
325; CHECK-NEXT: vmov.16 q3[2], r0
326; CHECK-NEXT: vmov.u8 r0, q1[3]
327; CHECK-NEXT: vmov.16 q3[3], r0
328; CHECK-NEXT: vmov.u8 r0, q1[4]
329; CHECK-NEXT: vmov.16 q3[4], r0
330; CHECK-NEXT: vmov.u8 r0, q1[5]
331; CHECK-NEXT: vmov.16 q3[5], r0
332; CHECK-NEXT: vmov.u8 r0, q1[6]
333; CHECK-NEXT: vmov.16 q3[6], r0
334; CHECK-NEXT: vmov.u8 r0, q1[7]
335; CHECK-NEXT: vmov.16 q3[7], r0
336; CHECK-NEXT: vmov.u8 r0, q0[0]
337; CHECK-NEXT: vmovlb.u8 q1, q3
338; CHECK-NEXT: vmov.16 q3[0], r0
339; CHECK-NEXT: vmov.u8 r0, q0[1]
340; CHECK-NEXT: vmov.16 q3[1], r0
341; CHECK-NEXT: vmov.u8 r0, q0[2]
342; CHECK-NEXT: vmov.16 q3[2], r0
343; CHECK-NEXT: vmov.u8 r0, q0[3]
344; CHECK-NEXT: vmov.16 q3[3], r0
345; CHECK-NEXT: vmov.u8 r0, q0[4]
346; CHECK-NEXT: vmov.16 q3[4], r0
347; CHECK-NEXT: vmov.u8 r0, q0[5]
348; CHECK-NEXT: vmov.16 q3[5], r0
349; CHECK-NEXT: vmov.u8 r0, q0[6]
350; CHECK-NEXT: vmov.16 q3[6], r0
351; CHECK-NEXT: vmov.u8 r0, q0[7]
352; CHECK-NEXT: vmov.16 q3[7], r0
353; CHECK-NEXT: vmovlb.u8 q0, q3
354; CHECK-NEXT: vmul.i16 q0, q0, q1
355; CHECK-NEXT: vadd.i16 q0, q0, q2
356; CHECK-NEXT: vaddv.u16 r0, q0
357; CHECK-NEXT: uxth r0, r0
358; CHECK-NEXT: bx lr
359entry:
360 %xx = zext <16 x i8> %x to <16 x i16>
361 %yy = zext <16 x i8> %y to <16 x i16>
362 %m = mul <16 x i16> %xx, %yy
363 %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
364 ret i16 %z
365}
366
367define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
368; CHECK-LABEL: add_v16i8_v16i16_sext:
369; CHECK: @ %bb.0: @ %entry
370; CHECK-NEXT: vmov.u8 r0, q1[8]
371; CHECK-NEXT: vmov.16 q2[0], r0
372; CHECK-NEXT: vmov.u8 r0, q1[9]
373; CHECK-NEXT: vmov.16 q2[1], r0
374; CHECK-NEXT: vmov.u8 r0, q1[10]
375; CHECK-NEXT: vmov.16 q2[2], r0
376; CHECK-NEXT: vmov.u8 r0, q1[11]
377; CHECK-NEXT: vmov.16 q2[3], r0
378; CHECK-NEXT: vmov.u8 r0, q1[12]
379; CHECK-NEXT: vmov.16 q2[4], r0
380; CHECK-NEXT: vmov.u8 r0, q1[13]
381; CHECK-NEXT: vmov.16 q2[5], r0
382; CHECK-NEXT: vmov.u8 r0, q1[14]
383; CHECK-NEXT: vmov.16 q2[6], r0
384; CHECK-NEXT: vmov.u8 r0, q1[15]
385; CHECK-NEXT: vmov.16 q2[7], r0
386; CHECK-NEXT: vmov.u8 r0, q0[8]
387; CHECK-NEXT: vmov.16 q3[0], r0
388; CHECK-NEXT: vmov.u8 r0, q0[9]
389; CHECK-NEXT: vmov.16 q3[1], r0
390; CHECK-NEXT: vmov.u8 r0, q0[10]
391; CHECK-NEXT: vmov.16 q3[2], r0
392; CHECK-NEXT: vmov.u8 r0, q0[11]
393; CHECK-NEXT: vmov.16 q3[3], r0
394; CHECK-NEXT: vmov.u8 r0, q0[12]
395; CHECK-NEXT: vmov.16 q3[4], r0
396; CHECK-NEXT: vmov.u8 r0, q0[13]
397; CHECK-NEXT: vmov.16 q3[5], r0
398; CHECK-NEXT: vmov.u8 r0, q0[14]
399; CHECK-NEXT: vmov.16 q3[6], r0
400; CHECK-NEXT: vmov.u8 r0, q0[15]
401; CHECK-NEXT: vmov.16 q3[7], r0
402; CHECK-NEXT: vmovlb.s8 q2, q2
403; CHECK-NEXT: vmovlb.s8 q3, q3
404; CHECK-NEXT: vmov.u8 r0, q1[0]
405; CHECK-NEXT: vmul.i16 q2, q3, q2
406; CHECK-NEXT: vmov.16 q3[0], r0
407; CHECK-NEXT: vmov.u8 r0, q1[1]
408; CHECK-NEXT: vmov.16 q3[1], r0
409; CHECK-NEXT: vmov.u8 r0, q1[2]
410; CHECK-NEXT: vmov.16 q3[2], r0
411; CHECK-NEXT: vmov.u8 r0, q1[3]
412; CHECK-NEXT: vmov.16 q3[3], r0
413; CHECK-NEXT: vmov.u8 r0, q1[4]
414; CHECK-NEXT: vmov.16 q3[4], r0
415; CHECK-NEXT: vmov.u8 r0, q1[5]
416; CHECK-NEXT: vmov.16 q3[5], r0
417; CHECK-NEXT: vmov.u8 r0, q1[6]
418; CHECK-NEXT: vmov.16 q3[6], r0
419; CHECK-NEXT: vmov.u8 r0, q1[7]
420; CHECK-NEXT: vmov.16 q3[7], r0
421; CHECK-NEXT: vmov.u8 r0, q0[0]
422; CHECK-NEXT: vmovlb.s8 q1, q3
423; CHECK-NEXT: vmov.16 q3[0], r0
424; CHECK-NEXT: vmov.u8 r0, q0[1]
425; CHECK-NEXT: vmov.16 q3[1], r0
426; CHECK-NEXT: vmov.u8 r0, q0[2]
427; CHECK-NEXT: vmov.16 q3[2], r0
428; CHECK-NEXT: vmov.u8 r0, q0[3]
429; CHECK-NEXT: vmov.16 q3[3], r0
430; CHECK-NEXT: vmov.u8 r0, q0[4]
431; CHECK-NEXT: vmov.16 q3[4], r0
432; CHECK-NEXT: vmov.u8 r0, q0[5]
433; CHECK-NEXT: vmov.16 q3[5], r0
434; CHECK-NEXT: vmov.u8 r0, q0[6]
435; CHECK-NEXT: vmov.16 q3[6], r0
436; CHECK-NEXT: vmov.u8 r0, q0[7]
437; CHECK-NEXT: vmov.16 q3[7], r0
438; CHECK-NEXT: vmovlb.s8 q0, q3
439; CHECK-NEXT: vmul.i16 q0, q0, q1
440; CHECK-NEXT: vadd.i16 q0, q0, q2
441; CHECK-NEXT: vaddv.u16 r0, q0
442; CHECK-NEXT: sxth r0, r0
443; CHECK-NEXT: bx lr
444entry:
445 %xx = sext <16 x i8> %x to <16 x i16>
446 %yy = sext <16 x i8> %y to <16 x i16>
447 %m = mul <16 x i16> %xx, %yy
448 %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
449 ret i16 %z
450}
451
452define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
453; CHECK-LABEL: add_v8i8_v8i16_zext:
454; CHECK: @ %bb.0: @ %entry
455; CHECK-NEXT: vmovlb.u8 q1, q1
456; CHECK-NEXT: vmovlb.u8 q0, q0
457; CHECK-NEXT: vmul.i16 q0, q0, q1
458; CHECK-NEXT: vaddv.u16 r0, q0
459; CHECK-NEXT: uxth r0, r0
460; CHECK-NEXT: bx lr
461entry:
462 %xx = zext <8 x i8> %x to <8 x i16>
463 %yy = zext <8 x i8> %y to <8 x i16>
464 %m = mul <8 x i16> %xx, %yy
465 %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
466 ret i16 %z
467}
468
469define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
470; CHECK-LABEL: add_v8i8_v8i16_sext:
471; CHECK: @ %bb.0: @ %entry
472; CHECK-NEXT: vmovlb.s8 q1, q1
473; CHECK-NEXT: vmovlb.s8 q0, q0
474; CHECK-NEXT: vmul.i16 q0, q0, q1
475; CHECK-NEXT: vaddv.u16 r0, q0
476; CHECK-NEXT: sxth r0, r0
477; CHECK-NEXT: bx lr
478entry:
479 %xx = sext <8 x i8> %x to <8 x i16>
480 %yy = sext <8 x i8> %y to <8 x i16>
481 %m = mul <8 x i16> %xx, %yy
482 %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
483 ret i16 %z
484}
485
486define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
487; CHECK-LABEL: add_v16i8_v16i8:
488; CHECK: @ %bb.0: @ %entry
489; CHECK-NEXT: vmul.i8 q0, q0, q1
490; CHECK-NEXT: vaddv.u8 r0, q0
491; CHECK-NEXT: uxtb r0, r0
492; CHECK-NEXT: bx lr
493entry:
494 %m = mul <16 x i8> %x, %y
495 %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %m)
496 ret i8 %z
497}
498
499define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
500; CHECK-LABEL: add_v16i8_v16i64_zext:
501; CHECK: @ %bb.0: @ %entry
502; CHECK-NEXT: .save {r7, lr}
503; CHECK-NEXT: push {r7, lr}
504; CHECK-NEXT: .vsave {d8, d9, d10, d11}
505; CHECK-NEXT: vpush {d8, d9, d10, d11}
506; CHECK-NEXT: vmov.u8 r0, q1[0]
507; CHECK-NEXT: vmov.u8 r1, q0[0]
508; CHECK-NEXT: vmov.32 q3[0], r0
509; CHECK-NEXT: vmov.u8 r0, q1[1]
510; CHECK-NEXT: vmov.32 q3[2], r0
511; CHECK-NEXT: adr r0, .LCPI23_0
512; CHECK-NEXT: vldrw.u32 q2, [r0]
513; CHECK-NEXT: vmov.32 q4[0], r1
514; CHECK-NEXT: vmov.u8 r1, q0[1]
515; CHECK-NEXT: vmov.32 q4[2], r1
516; CHECK-NEXT: vand q3, q3, q2
517; CHECK-NEXT: vand q4, q4, q2
518; CHECK-NEXT: vmov r0, s14
519; CHECK-NEXT: vmov r1, s18
520; CHECK-NEXT: vmov r2, s12
521; CHECK-NEXT: vmov r3, s16
522; CHECK-NEXT: umull r12, r1, r1, r0
523; CHECK-NEXT: vmov.u8 r0, q0[2]
524; CHECK-NEXT: vmov.32 q4[0], r0
525; CHECK-NEXT: vmov.u8 r0, q0[3]
526; CHECK-NEXT: vmov.32 q4[2], r0
527; CHECK-NEXT: umull r2, r3, r3, r2
528; CHECK-NEXT: vand q4, q4, q2
529; CHECK-NEXT: vmov r0, s16
530; CHECK-NEXT: orr.w lr, r3, r1
531; CHECK-NEXT: vmov.u8 r3, q1[2]
532; CHECK-NEXT: vmov.32 q3[0], r3
533; CHECK-NEXT: vmov.u8 r3, q1[3]
534; CHECK-NEXT: vmov.32 q3[2], r3
535; CHECK-NEXT: add r2, r12
536; CHECK-NEXT: vand q3, q3, q2
537; CHECK-NEXT: vmov r3, s12
538; CHECK-NEXT: umull r0, r3, r0, r3
539; CHECK-NEXT: vmov.32 q5[0], r0
540; CHECK-NEXT: vmov r0, s14
541; CHECK-NEXT: vmov.32 q5[1], r3
542; CHECK-NEXT: vmov r3, s18
543; CHECK-NEXT: umull r0, r3, r3, r0
544; CHECK-NEXT: vmov.32 q5[2], r0
545; CHECK-NEXT: vmov.32 q5[3], r3
546; CHECK-NEXT: vmov r1, s20
547; CHECK-NEXT: vmov r0, s21
548; CHECK-NEXT: adds r1, r1, r2
549; CHECK-NEXT: adc.w r2, lr, r0
550; CHECK-NEXT: vmov r0, s22
551; CHECK-NEXT: adds.w r12, r1, r0
552; CHECK-NEXT: adc.w r1, r2, r3
553; CHECK-NEXT: vmov.u8 r2, q1[4]
554; CHECK-NEXT: vmov.u8 r3, q0[4]
555; CHECK-NEXT: vmov.32 q3[0], r2
556; CHECK-NEXT: vmov.u8 r2, q1[5]
557; CHECK-NEXT: vmov.32 q4[0], r3
558; CHECK-NEXT: vmov.u8 r3, q0[5]
559; CHECK-NEXT: vmov.32 q3[2], r2
560; CHECK-NEXT: vmov.32 q4[2], r3
561; CHECK-NEXT: vand q3, q3, q2
562; CHECK-NEXT: vand q4, q4, q2
563; CHECK-NEXT: vmov r2, s12
564; CHECK-NEXT: vmov r3, s16
565; CHECK-NEXT: umull r2, r3, r3, r2
566; CHECK-NEXT: vmov.32 q5[0], r2
567; CHECK-NEXT: vmov r2, s14
568; CHECK-NEXT: vmov.32 q5[1], r3
569; CHECK-NEXT: vmov r3, s18
570; CHECK-NEXT: umull r2, r3, r3, r2
571; CHECK-NEXT: vmov.32 q5[2], r2
572; CHECK-NEXT: vmov.32 q5[3], r3
573; CHECK-NEXT: vmov r0, s20
574; CHECK-NEXT: vmov r2, s21
575; CHECK-NEXT: adds.w r0, r0, r12
576; CHECK-NEXT: adcs r1, r2
577; CHECK-NEXT: vmov r2, s22
578; CHECK-NEXT: adds.w r12, r0, r2
579; CHECK-NEXT: vmov.u8 r2, q1[6]
580; CHECK-NEXT: adcs r1, r3
581; CHECK-NEXT: vmov.u8 r3, q0[6]
582; CHECK-NEXT: vmov.32 q3[0], r2
583; CHECK-NEXT: vmov.u8 r2, q1[7]
584; CHECK-NEXT: vmov.32 q4[0], r3
585; CHECK-NEXT: vmov.u8 r3, q0[7]
586; CHECK-NEXT: vmov.32 q3[2], r2
587; CHECK-NEXT: vmov.32 q4[2], r3
588; CHECK-NEXT: vand q3, q3, q2
589; CHECK-NEXT: vand q4, q4, q2
590; CHECK-NEXT: vmov r2, s12
591; CHECK-NEXT: vmov r3, s16
592; CHECK-NEXT: umull r2, r3, r3, r2
593; CHECK-NEXT: vmov.32 q5[0], r2
594; CHECK-NEXT: vmov r2, s14
595; CHECK-NEXT: vmov.32 q5[1], r3
596; CHECK-NEXT: vmov r3, s18
597; CHECK-NEXT: umull r2, r3, r3, r2
598; CHECK-NEXT: vmov.32 q5[2], r2
599; CHECK-NEXT: vmov.32 q5[3], r3
600; CHECK-NEXT: vmov r0, s20
601; CHECK-NEXT: vmov r2, s21
602; CHECK-NEXT: adds.w r0, r0, r12
603; CHECK-NEXT: adcs r1, r2
604; CHECK-NEXT: vmov r2, s22
605; CHECK-NEXT: adds.w r12, r0, r2
606; CHECK-NEXT: vmov.u8 r2, q1[8]
607; CHECK-NEXT: adcs r1, r3
608; CHECK-NEXT: vmov.u8 r3, q0[8]
609; CHECK-NEXT: vmov.32 q3[0], r2
610; CHECK-NEXT: vmov.u8 r2, q1[9]
611; CHECK-NEXT: vmov.32 q4[0], r3
612; CHECK-NEXT: vmov.u8 r3, q0[9]
613; CHECK-NEXT: vmov.32 q3[2], r2
614; CHECK-NEXT: vmov.32 q4[2], r3
615; CHECK-NEXT: vand q3, q3, q2
616; CHECK-NEXT: vand q4, q4, q2
617; CHECK-NEXT: vmov r2, s12
618; CHECK-NEXT: vmov r3, s16
619; CHECK-NEXT: umull r2, r3, r3, r2
620; CHECK-NEXT: vmov.32 q5[0], r2
621; CHECK-NEXT: vmov r2, s14
622; CHECK-NEXT: vmov.32 q5[1], r3
623; CHECK-NEXT: vmov r3, s18
624; CHECK-NEXT: umull r2, r3, r3, r2
625; CHECK-NEXT: vmov.32 q5[2], r2
626; CHECK-NEXT: vmov.32 q5[3], r3
627; CHECK-NEXT: vmov r0, s20
628; CHECK-NEXT: vmov r2, s21
629; CHECK-NEXT: adds.w r0, r0, r12
630; CHECK-NEXT: adcs r1, r2
631; CHECK-NEXT: vmov r2, s22
632; CHECK-NEXT: adds.w r12, r0, r2
633; CHECK-NEXT: vmov.u8 r2, q1[10]
634; CHECK-NEXT: adcs r1, r3
635; CHECK-NEXT: vmov.u8 r3, q0[10]
636; CHECK-NEXT: vmov.32 q3[0], r2
637; CHECK-NEXT: vmov.u8 r2, q1[11]
638; CHECK-NEXT: vmov.32 q4[0], r3
639; CHECK-NEXT: vmov.u8 r3, q0[11]
640; CHECK-NEXT: vmov.32 q3[2], r2
641; CHECK-NEXT: vmov.32 q4[2], r3
642; CHECK-NEXT: vand q3, q3, q2
643; CHECK-NEXT: vand q4, q4, q2
644; CHECK-NEXT: vmov r2, s12
645; CHECK-NEXT: vmov r3, s16
646; CHECK-NEXT: umull r2, r3, r3, r2
647; CHECK-NEXT: vmov.32 q5[0], r2
648; CHECK-NEXT: vmov r2, s14
649; CHECK-NEXT: vmov.32 q5[1], r3
650; CHECK-NEXT: vmov r3, s18
651; CHECK-NEXT: umull r2, r3, r3, r2
652; CHECK-NEXT: vmov.32 q5[2], r2
653; CHECK-NEXT: vmov.32 q5[3], r3
654; CHECK-NEXT: vmov r0, s20
655; CHECK-NEXT: vmov r2, s21
656; CHECK-NEXT: adds.w r0, r0, r12
657; CHECK-NEXT: adcs r1, r2
658; CHECK-NEXT: vmov r2, s22
659; CHECK-NEXT: adds.w r12, r0, r2
660; CHECK-NEXT: vmov.u8 r2, q1[12]
661; CHECK-NEXT: adcs r1, r3
662; CHECK-NEXT: vmov.u8 r3, q0[12]
663; CHECK-NEXT: vmov.32 q3[0], r2
664; CHECK-NEXT: vmov.u8 r2, q1[13]
665; CHECK-NEXT: vmov.32 q4[0], r3
666; CHECK-NEXT: vmov.u8 r3, q0[13]
667; CHECK-NEXT: vmov.32 q3[2], r2
668; CHECK-NEXT: vmov.32 q4[2], r3
669; CHECK-NEXT: vand q3, q3, q2
670; CHECK-NEXT: vand q4, q4, q2
671; CHECK-NEXT: vmov r2, s12
672; CHECK-NEXT: vmov r3, s16
673; CHECK-NEXT: umull r2, r3, r3, r2
674; CHECK-NEXT: vmov.32 q5[0], r2
675; CHECK-NEXT: vmov r2, s14
676; CHECK-NEXT: vmov.32 q5[1], r3
677; CHECK-NEXT: vmov r3, s18
678; CHECK-NEXT: umull r2, r3, r3, r2
679; CHECK-NEXT: vmov.32 q5[2], r2
680; CHECK-NEXT: vmov.32 q5[3], r3
681; CHECK-NEXT: vmov r0, s20
682; CHECK-NEXT: vmov r2, s21
683; CHECK-NEXT: adds.w r0, r0, r12
684; CHECK-NEXT: adcs r1, r2
685; CHECK-NEXT: vmov r2, s22
686; CHECK-NEXT: adds r0, r0, r2
687; CHECK-NEXT: vmov.u8 r2, q1[14]
688; CHECK-NEXT: vmov.32 q3[0], r2
689; CHECK-NEXT: vmov.u8 r2, q1[15]
690; CHECK-NEXT: adcs r1, r3
691; CHECK-NEXT: vmov.32 q3[2], r2
692; CHECK-NEXT: vmov.u8 r3, q0[14]
693; CHECK-NEXT: vand q1, q3, q2
694; CHECK-NEXT: vmov.32 q3[0], r3
695; CHECK-NEXT: vmov.u8 r3, q0[15]
696; CHECK-NEXT: vmov.32 q3[2], r3
697; CHECK-NEXT: vmov r2, s4
698; CHECK-NEXT: vand q0, q3, q2
699; CHECK-NEXT: vmov r3, s0
700; CHECK-NEXT: umlal r0, r1, r3, r2
701; CHECK-NEXT: vmov r2, s6
702; CHECK-NEXT: vmov r3, s2
703; CHECK-NEXT: umlal r0, r1, r3, r2
704; CHECK-NEXT: vpop {d8, d9, d10, d11}
705; CHECK-NEXT: pop {r7, pc}
706; CHECK-NEXT: .p2align 4
707; CHECK-NEXT: @ %bb.1:
708; CHECK-NEXT: .LCPI23_0:
709; CHECK-NEXT: .long 255 @ 0xff
710; CHECK-NEXT: .long 0 @ 0x0
711; CHECK-NEXT: .long 255 @ 0xff
712; CHECK-NEXT: .long 0 @ 0x0
713entry:
714 %xx = zext <16 x i8> %x to <16 x i64>
715 %yy = zext <16 x i8> %y to <16 x i64>
716 %m = mul <16 x i64> %xx, %yy
717 %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
718 ret i64 %z
719}
720
721define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
722; CHECK-LABEL: add_v16i8_v16i64_sext:
723; CHECK: @ %bb.0: @ %entry
724; CHECK-NEXT: vmov.u8 r0, q1[0]
725; CHECK-NEXT: vmov.u8 r1, q0[0]
726; CHECK-NEXT: sxtb r0, r0
727; CHECK-NEXT: sxtb r1, r1
728; CHECK-NEXT: smull r0, r1, r1, r0
729; CHECK-NEXT: vmov.32 q2[0], r0
730; CHECK-NEXT: vmov.u8 r0, q1[1]
731; CHECK-NEXT: vmov.32 q2[1], r1
732; CHECK-NEXT: vmov.u8 r1, q0[1]
733; CHECK-NEXT: sxtb r0, r0
734; CHECK-NEXT: sxtb r1, r1
735; CHECK-NEXT: smull r0, r1, r1, r0
736; CHECK-NEXT: vmov.32 q2[2], r0
737; CHECK-NEXT: vmov.32 q2[3], r1
738; CHECK-NEXT: vmov r2, s10
739; CHECK-NEXT: vmov r3, s8
740; CHECK-NEXT: vmov r0, s9
741; CHECK-NEXT: adds r2, r2, r3
742; CHECK-NEXT: vmov.u8 r3, q0[2]
743; CHECK-NEXT: adc.w r12, r0, r1
744; CHECK-NEXT: vmov.u8 r1, q1[2]
745; CHECK-NEXT: sxtb r1, r1
746; CHECK-NEXT: sxtb r3, r3
747; CHECK-NEXT: smull r1, r3, r3, r1
748; CHECK-NEXT: vmov.32 q2[0], r1
749; CHECK-NEXT: vmov.u8 r1, q1[3]
750; CHECK-NEXT: vmov.32 q2[1], r3
751; CHECK-NEXT: vmov.u8 r3, q0[3]
752; CHECK-NEXT: sxtb r1, r1
753; CHECK-NEXT: sxtb r3, r3
754; CHECK-NEXT: smull r1, r3, r3, r1
755; CHECK-NEXT: vmov.32 q2[2], r1
756; CHECK-NEXT: vmov.32 q2[3], r3
757; CHECK-NEXT: vmov r0, s8
758; CHECK-NEXT: vmov r1, s9
759; CHECK-NEXT: adds r0, r0, r2
760; CHECK-NEXT: vmov r2, s10
761; CHECK-NEXT: adc.w r1, r1, r12
762; CHECK-NEXT: adds.w r12, r0, r2
763; CHECK-NEXT: vmov.u8 r2, q1[4]
764; CHECK-NEXT: adcs r1, r3
765; CHECK-NEXT: vmov.u8 r3, q0[4]
766; CHECK-NEXT: sxtb r2, r2
767; CHECK-NEXT: sxtb r3, r3
768; CHECK-NEXT: smull r2, r3, r3, r2
769; CHECK-NEXT: vmov.32 q2[0], r2
770; CHECK-NEXT: vmov.u8 r2, q1[5]
771; CHECK-NEXT: vmov.32 q2[1], r3
772; CHECK-NEXT: vmov.u8 r3, q0[5]
773; CHECK-NEXT: sxtb r2, r2
774; CHECK-NEXT: sxtb r3, r3
775; CHECK-NEXT: smull r2, r3, r3, r2
776; CHECK-NEXT: vmov.32 q2[2], r2
777; CHECK-NEXT: vmov.32 q2[3], r3
778; CHECK-NEXT: vmov r0, s8
779; CHECK-NEXT: vmov r2, s9
780; CHECK-NEXT: adds.w r0, r0, r12
781; CHECK-NEXT: adcs r1, r2
782; CHECK-NEXT: vmov r2, s10
783; CHECK-NEXT: adds.w r12, r0, r2
784; CHECK-NEXT: vmov.u8 r2, q1[6]
785; CHECK-NEXT: adcs r1, r3
786; CHECK-NEXT: vmov.u8 r3, q0[6]
787; CHECK-NEXT: sxtb r2, r2
788; CHECK-NEXT: sxtb r3, r3
789; CHECK-NEXT: smull r2, r3, r3, r2
790; CHECK-NEXT: vmov.32 q2[0], r2
791; CHECK-NEXT: vmov.u8 r2, q1[7]
792; CHECK-NEXT: vmov.32 q2[1], r3
793; CHECK-NEXT: vmov.u8 r3, q0[7]
794; CHECK-NEXT: sxtb r2, r2
795; CHECK-NEXT: sxtb r3, r3
796; CHECK-NEXT: smull r2, r3, r3, r2
797; CHECK-NEXT: vmov.32 q2[2], r2
798; CHECK-NEXT: vmov.32 q2[3], r3
799; CHECK-NEXT: vmov r0, s8
800; CHECK-NEXT: vmov r2, s9
801; CHECK-NEXT: adds.w r0, r0, r12
802; CHECK-NEXT: adcs r1, r2
803; CHECK-NEXT: vmov r2, s10
804; CHECK-NEXT: adds.w r12, r0, r2
805; CHECK-NEXT: vmov.u8 r2, q1[8]
806; CHECK-NEXT: adcs r1, r3
807; CHECK-NEXT: vmov.u8 r3, q0[8]
808; CHECK-NEXT: sxtb r2, r2
809; CHECK-NEXT: sxtb r3, r3
810; CHECK-NEXT: smull r2, r3, r3, r2
811; CHECK-NEXT: vmov.32 q2[0], r2
812; CHECK-NEXT: vmov.u8 r2, q1[9]
813; CHECK-NEXT: vmov.32 q2[1], r3
814; CHECK-NEXT: vmov.u8 r3, q0[9]
815; CHECK-NEXT: sxtb r2, r2
816; CHECK-NEXT: sxtb r3, r3
817; CHECK-NEXT: smull r2, r3, r3, r2
818; CHECK-NEXT: vmov.32 q2[2], r2
819; CHECK-NEXT: vmov.32 q2[3], r3
820; CHECK-NEXT: vmov r0, s8
821; CHECK-NEXT: vmov r2, s9
822; CHECK-NEXT: adds.w r0, r0, r12
823; CHECK-NEXT: adcs r1, r2
824; CHECK-NEXT: vmov r2, s10
825; CHECK-NEXT: adds.w r12, r0, r2
826; CHECK-NEXT: vmov.u8 r2, q1[10]
827; CHECK-NEXT: adcs r1, r3
828; CHECK-NEXT: vmov.u8 r3, q0[10]
829; CHECK-NEXT: sxtb r2, r2
830; CHECK-NEXT: sxtb r3, r3
831; CHECK-NEXT: smull r2, r3, r3, r2
832; CHECK-NEXT: vmov.32 q2[0], r2
833; CHECK-NEXT: vmov.u8 r2, q1[11]
834; CHECK-NEXT: vmov.32 q2[1], r3
835; CHECK-NEXT: vmov.u8 r3, q0[11]
836; CHECK-NEXT: sxtb r2, r2
837; CHECK-NEXT: sxtb r3, r3
838; CHECK-NEXT: smull r2, r3, r3, r2
839; CHECK-NEXT: vmov.32 q2[2], r2
840; CHECK-NEXT: vmov.32 q2[3], r3
841; CHECK-NEXT: vmov r0, s8
842; CHECK-NEXT: vmov r2, s9
843; CHECK-NEXT: adds.w r0, r0, r12
844; CHECK-NEXT: adcs r1, r2
845; CHECK-NEXT: vmov r2, s10
846; CHECK-NEXT: adds.w r12, r0, r2
847; CHECK-NEXT: vmov.u8 r2, q1[12]
848; CHECK-NEXT: adcs r1, r3
849; CHECK-NEXT: vmov.u8 r3, q0[12]
850; CHECK-NEXT: sxtb r2, r2
851; CHECK-NEXT: sxtb r3, r3
852; CHECK-NEXT: smull r2, r3, r3, r2
853; CHECK-NEXT: vmov.32 q2[0], r2
854; CHECK-NEXT: vmov.u8 r2, q1[13]
855; CHECK-NEXT: vmov.32 q2[1], r3
856; CHECK-NEXT: vmov.u8 r3, q0[13]
857; CHECK-NEXT: sxtb r2, r2
858; CHECK-NEXT: sxtb r3, r3
859; CHECK-NEXT: smull r2, r3, r3, r2
860; CHECK-NEXT: vmov.32 q2[2], r2
861; CHECK-NEXT: vmov.32 q2[3], r3
862; CHECK-NEXT: vmov r0, s8
863; CHECK-NEXT: vmov r2, s9
864; CHECK-NEXT: adds.w r0, r0, r12
865; CHECK-NEXT: adcs r1, r2
866; CHECK-NEXT: vmov r2, s10
867; CHECK-NEXT: adds r0, r0, r2
868; CHECK-NEXT: vmov.u8 r2, q1[14]
869; CHECK-NEXT: adcs r1, r3
870; CHECK-NEXT: vmov.u8 r3, q0[14]
871; CHECK-NEXT: sxtb r2, r2
872; CHECK-NEXT: sxtb r3, r3
873; CHECK-NEXT: smlal r0, r1, r3, r2
874; CHECK-NEXT: vmov.u8 r2, q1[15]
875; CHECK-NEXT: vmov.u8 r3, q0[15]
876; CHECK-NEXT: sxtb r2, r2
877; CHECK-NEXT: sxtb r3, r3
878; CHECK-NEXT: smlal r0, r1, r3, r2
879; CHECK-NEXT: bx lr
880entry:
881 %xx = sext <16 x i8> %x to <16 x i64>
882 %yy = sext <16 x i8> %y to <16 x i64>
883 %m = mul <16 x i64> %xx, %yy
884 %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
885 ret i64 %z
886}
887
888define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
889; CHECK-LABEL: add_v2i8_v2i64_zext:
890; CHECK: @ %bb.0: @ %entry
891; CHECK-NEXT: adr r0, .LCPI25_0
892; CHECK-NEXT: vldrw.u32 q2, [r0]
893; CHECK-NEXT: vand q1, q1, q2
894; CHECK-NEXT: vand q0, q0, q2
895; CHECK-NEXT: vmov r0, s6
896; CHECK-NEXT: vmov r1, s2
897; CHECK-NEXT: vmov r2, s4
898; CHECK-NEXT: vmov r3, s0
899; CHECK-NEXT: umull r0, r1, r1, r0
900; CHECK-NEXT: umull r2, r3, r3, r2
901; CHECK-NEXT: add r0, r2
902; CHECK-NEXT: orrs r1, r3
903; CHECK-NEXT: bx lr
904; CHECK-NEXT: .p2align 4
905; CHECK-NEXT: @ %bb.1:
906; CHECK-NEXT: .LCPI25_0:
907; CHECK-NEXT: .long 255 @ 0xff
908; CHECK-NEXT: .long 0 @ 0x0
909; CHECK-NEXT: .long 255 @ 0xff
910; CHECK-NEXT: .long 0 @ 0x0
911entry:
912 %xx = zext <2 x i8> %x to <2 x i64>
913 %yy = zext <2 x i8> %y to <2 x i64>
914 %m = mul <2 x i64> %xx, %yy
915 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
916 ret i64 %z
917}
918
919define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
920; CHECK-LABEL: add_v2i8_v2i64_sext:
921; CHECK: @ %bb.0: @ %entry
922; CHECK-NEXT: vmov r0, s4
923; CHECK-NEXT: vmov r1, s0
924; CHECK-NEXT: vmov r2, s6
925; CHECK-NEXT: vmov r3, s2
926; CHECK-NEXT: sxtb r0, r0
927; CHECK-NEXT: sxtb r1, r1
928; CHECK-NEXT: smull r0, r1, r1, r0
929; CHECK-NEXT: sxtb r2, r2
930; CHECK-NEXT: sxtb r3, r3
931; CHECK-NEXT: smlal r0, r1, r3, r2
932; CHECK-NEXT: bx lr
933entry:
934 %xx = sext <2 x i8> %x to <2 x i64>
935 %yy = sext <2 x i8> %y to <2 x i64>
936 %m = mul <2 x i64> %xx, %yy
937 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
938 ret i64 %z
939}
940
941define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
942; CHECK-LABEL: add_v2i64_v2i64:
943; CHECK: @ %bb.0: @ %entry
944; CHECK-NEXT: .save {r4, lr}
945; CHECK-NEXT: push {r4, lr}
946; CHECK-NEXT: vmov r0, s4
947; CHECK-NEXT: vmov r1, s0
948; CHECK-NEXT: vmov r2, s5
949; CHECK-NEXT: vmov r4, s7
950; CHECK-NEXT: umull r12, r3, r1, r0
951; CHECK-NEXT: mla r1, r1, r2, r3
952; CHECK-NEXT: vmov r2, s1
953; CHECK-NEXT: vmov r3, s2
954; CHECK-NEXT: vmov.32 q2[0], r12
955; CHECK-NEXT: mla r1, r2, r0, r1
956; CHECK-NEXT: vmov r2, s6
957; CHECK-NEXT: vmov.32 q2[1], r1
958; CHECK-NEXT: vmov r12, s8
959; CHECK-NEXT: umull lr, r0, r3, r2
960; CHECK-NEXT: mla r0, r3, r4, r0
961; CHECK-NEXT: vmov r3, s3
962; CHECK-NEXT: mla r2, r3, r2, r0
963; CHECK-NEXT: adds.w r0, r12, lr
964; CHECK-NEXT: adcs r1, r2
965; CHECK-NEXT: pop {r4, pc}
966entry:
967 %m = mul <2 x i64> %x, %y
968 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
969 ret i64 %z
970}
971
972define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, <4 x i32> %y, i32 %a) {
973; CHECK-LABEL: add_v4i32_v4i32_acc:
974; CHECK: @ %bb.0: @ %entry
975; CHECK-NEXT: vmlava.u32 r0, q0, q1
976; CHECK-NEXT: bx lr
977entry:
978 %m = mul <4 x i32> %x, %y
979 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
980 %r = add i32 %z, %a
981 ret i32 %r
982}
983
984define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, <4 x i32> %y, i64 %a) {
985; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
986; CHECK: @ %bb.0: @ %entry
987; CHECK-NEXT: vmlalva.u32 r0, r1, q0, q1
988; CHECK-NEXT: bx lr
989entry:
990 %xx = zext <4 x i32> %x to <4 x i64>
991 %yy = zext <4 x i32> %y to <4 x i64>
992 %m = mul <4 x i64> %xx, %yy
993 %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
994 %r = add i64 %z, %a
995 ret i64 %r
996}
997
998define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, <4 x i32> %y, i64 %a) {
999; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
1000; CHECK: @ %bb.0: @ %entry
1001; CHECK-NEXT: vmlalva.s32 r0, r1, q0, q1
1002; CHECK-NEXT: bx lr
1003entry:
1004 %xx = sext <4 x i32> %x to <4 x i64>
1005 %yy = sext <4 x i32> %y to <4 x i64>
1006 %m = mul <4 x i64> %xx, %yy
1007 %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
1008 %r = add i64 %z, %a
1009 ret i64 %r
1010}
1011
1012define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, i64 %a) {
1013; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
1014; CHECK: @ %bb.0: @ %entry
1015; CHECK-NEXT: .save {r7, lr}
1016; CHECK-NEXT: push {r7, lr}
1017; CHECK-NEXT: vmov r2, s4
1018; CHECK-NEXT: vmov r3, s0
1019; CHECK-NEXT: vmov r12, s6
1020; CHECK-NEXT: umull r2, lr, r3, r2
1021; CHECK-NEXT: vmov r3, s2
1022; CHECK-NEXT: umlal r2, lr, r3, r12
1023; CHECK-NEXT: adds r0, r0, r2
1024; CHECK-NEXT: adc.w r1, r1, lr
1025; CHECK-NEXT: pop {r7, pc}
1026entry:
1027 %xx = zext <2 x i32> %x to <2 x i64>
1028 %yy = zext <2 x i32> %y to <2 x i64>
1029 %m = mul <2 x i64> %xx, %yy
1030 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
1031 %r = add i64 %z, %a
1032 ret i64 %r
1033}
1034
1035define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, i64 %a) {
1036; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
1037; CHECK: @ %bb.0: @ %entry
1038; CHECK-NEXT: .save {r7, lr}
1039; CHECK-NEXT: push {r7, lr}
1040; CHECK-NEXT: vmov r2, s4
1041; CHECK-NEXT: vmov r3, s0
1042; CHECK-NEXT: vmov r12, s6
1043; CHECK-NEXT: smull r2, lr, r3, r2
1044; CHECK-NEXT: vmov r3, s2
1045; CHECK-NEXT: smlal r2, lr, r3, r12
1046; CHECK-NEXT: adds r0, r0, r2
1047; CHECK-NEXT: adc.w r1, r1, lr
1048; CHECK-NEXT: pop {r7, pc}
1049entry:
1050 %xx = sext <2 x i32> %x to <2 x i64>
1051 %yy = sext <2 x i32> %y to <2 x i64>
1052 %m = mul <2 x i64> %xx, %yy
1053 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
1054 %r = add i64 %z, %a
1055 ret i64 %r
1056}
1057
1058define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %y, i32 %a) {
1059; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
1060; CHECK: @ %bb.0: @ %entry
1061; CHECK-NEXT: vmlava.u16 r0, q0, q1
1062; CHECK-NEXT: bx lr
1063entry:
1064 %xx = zext <8 x i16> %x to <8 x i32>
1065 %yy = zext <8 x i16> %y to <8 x i32>
1066 %m = mul <8 x i32> %xx, %yy
1067 %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
1068 %r = add i32 %z, %a
1069 ret i32 %r
1070}
1071
1072define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %y, i32 %a) {
1073; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
1074; CHECK: @ %bb.0: @ %entry
1075; CHECK-NEXT: vmlava.s16 r0, q0, q1
1076; CHECK-NEXT: bx lr
1077entry:
1078 %xx = sext <8 x i16> %x to <8 x i32>
1079 %yy = sext <8 x i16> %y to <8 x i32>
1080 %m = mul <8 x i32> %xx, %yy
1081 %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
1082 %r = add i32 %z, %a
1083 ret i32 %r
1084}
1085
1086define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, <4 x i16> %y, i32 %a) {
1087; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
1088; CHECK: @ %bb.0: @ %entry
1089; CHECK-NEXT: vmovlb.u16 q1, q1
1090; CHECK-NEXT: vmovlb.u16 q0, q0
1091; CHECK-NEXT: vmlava.u32 r0, q0, q1
1092; CHECK-NEXT: bx lr
1093entry:
1094 %xx = zext <4 x i16> %x to <4 x i32>
1095 %yy = zext <4 x i16> %y to <4 x i32>
1096 %m = mul <4 x i32> %xx, %yy
1097 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
1098 %r = add i32 %z, %a
1099 ret i32 %r
1100}
1101
1102define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, <4 x i16> %y, i32 %a) {
1103; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
1104; CHECK: @ %bb.0: @ %entry
1105; CHECK-NEXT: vmovlb.s16 q1, q1
1106; CHECK-NEXT: vmovlb.s16 q0, q0
1107; CHECK-NEXT: vmlava.u32 r0, q0, q1
1108; CHECK-NEXT: bx lr
1109entry:
1110 %xx = sext <4 x i16> %x to <4 x i32>
1111 %yy = sext <4 x i16> %y to <4 x i32>
1112 %m = mul <4 x i32> %xx, %yy
1113 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
1114 %r = add i32 %z, %a
1115 ret i32 %r
1116}
1117
1118define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16> %y, i16 %a) {
1119; CHECK-LABEL: add_v8i16_v8i16_acc:
1120; CHECK: @ %bb.0: @ %entry
1121; CHECK-NEXT: vmul.i16 q0, q0, q1
1122; CHECK-NEXT: vaddva.u16 r0, q0
1123; CHECK-NEXT: uxth r0, r0
1124; CHECK-NEXT: bx lr
1125entry:
1126 %m = mul <8 x i16> %x, %y
1127 %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
1128 %r = add i16 %z, %a
1129 ret i16 %r
1130}
1131
1132define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
1133; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
1134; CHECK: @ %bb.0: @ %entry
1135; CHECK-NEXT: vmlalva.u16 r0, r1, q0, q1
1136; CHECK-NEXT: bx lr
1137entry:
1138 %xx = zext <8 x i16> %x to <8 x i64>
1139 %yy = zext <8 x i16> %y to <8 x i64>
1140 %m = mul <8 x i64> %xx, %yy
1141 %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
1142 %r = add i64 %z, %a
1143 ret i64 %r
1144}
1145
1146define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
1147; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
1148; CHECK: @ %bb.0: @ %entry
1149; CHECK-NEXT: vmlalva.s16 r0, r1, q0, q1
1150; CHECK-NEXT: bx lr
1151entry:
1152 %xx = sext <8 x i16> %x to <8 x i64>
1153 %yy = sext <8 x i16> %y to <8 x i64>
1154 %m = mul <8 x i64> %xx, %yy
1155 %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
1156 %r = add i64 %z, %a
1157 ret i64 %r
1158}
1159
1160define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, i64 %a) {
1161; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
1162; CHECK: @ %bb.0: @ %entry
1163; CHECK-NEXT: .save {r7, lr}
1164; CHECK-NEXT: push {r7, lr}
1165; CHECK-NEXT: adr r2, .LCPI40_0
1166; CHECK-NEXT: vldrw.u32 q2, [r2]
1167; CHECK-NEXT: vand q1, q1, q2
1168; CHECK-NEXT: vand q0, q0, q2
1169; CHECK-NEXT: vmov r2, s4
1170; CHECK-NEXT: vmov r3, s0
1171; CHECK-NEXT: vmov r12, s6
1172; CHECK-NEXT: umull r2, lr, r3, r2
1173; CHECK-NEXT: vmov r3, s2
1174; CHECK-NEXT: umlal r2, lr, r3, r12
1175; CHECK-NEXT: adds r0, r0, r2
1176; CHECK-NEXT: adc.w r1, r1, lr
1177; CHECK-NEXT: pop {r7, pc}
1178; CHECK-NEXT: .p2align 4
1179; CHECK-NEXT: @ %bb.1:
1180; CHECK-NEXT: .LCPI40_0:
1181; CHECK-NEXT: .long 65535 @ 0xffff
1182; CHECK-NEXT: .long 0 @ 0x0
1183; CHECK-NEXT: .long 65535 @ 0xffff
1184; CHECK-NEXT: .long 0 @ 0x0
1185entry:
1186 %xx = zext <2 x i16> %x to <2 x i64>
1187 %yy = zext <2 x i16> %y to <2 x i64>
1188 %m = mul <2 x i64> %xx, %yy
1189 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
1190 %r = add i64 %z, %a
1191 ret i64 %r
1192}
1193
1194define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, i64 %a) {
1195; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
1196; CHECK: @ %bb.0: @ %entry
1197; CHECK-NEXT: .save {r7, lr}
1198; CHECK-NEXT: push {r7, lr}
1199; CHECK-NEXT: vmov r2, s4
1200; CHECK-NEXT: vmov r3, s0
1201; CHECK-NEXT: sxth r2, r2
1202; CHECK-NEXT: sxth r3, r3
1203; CHECK-NEXT: smull r2, r12, r3, r2
1204; CHECK-NEXT: vmov r3, s6
1205; CHECK-NEXT: sxth.w lr, r3
1206; CHECK-NEXT: vmov r3, s2
1207; CHECK-NEXT: sxth r3, r3
1208; CHECK-NEXT: smlal r2, r12, r3, lr
1209; CHECK-NEXT: adds r0, r0, r2
1210; CHECK-NEXT: adc.w r1, r1, r12
1211; CHECK-NEXT: pop {r7, pc}
1212entry:
1213 %xx = sext <2 x i16> %x to <2 x i64>
1214 %yy = sext <2 x i16> %y to <2 x i64>
1215 %m = mul <2 x i64> %xx, %yy
1216 %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
1217 %r = add i64 %z, %a
1218 ret i64 %r
1219}
1220
1221define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
1222; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
1223; CHECK: @ %bb.0: @ %entry
1224; CHECK-NEXT: vmlava.u8 r0, q0, q1
1225; CHECK-NEXT: bx lr
1226entry:
1227 %xx = zext <16 x i8> %x to <16 x i32>
1228 %yy = zext <16 x i8> %y to <16 x i32>
1229 %m = mul <16 x i32> %xx, %yy
1230 %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
1231 %r = add i32 %z, %a
1232 ret i32 %r
1233}
1234
1235define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
1236; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
1237; CHECK: @ %bb.0: @ %entry
1238; CHECK-NEXT: vmlava.s8 r0, q0, q1
1239; CHECK-NEXT: bx lr
1240entry:
1241 %xx = sext <16 x i8> %x to <16 x i32>
1242 %yy = sext <16 x i8> %y to <16 x i32>
1243 %m = mul <16 x i32> %xx, %yy
1244 %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
1245 %r = add i32 %z, %a
1246 ret i32 %r
1247}
1248
1249define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, <4 x i8> %y, i32 %a) {
1250; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
1251; CHECK: @ %bb.0: @ %entry
1252; CHECK-NEXT: vmov.i32 q2, #0xff
1253; CHECK-NEXT: vand q1, q1, q2
1254; CHECK-NEXT: vand q0, q0, q2
1255; CHECK-NEXT: vmlava.u32 r0, q0, q1
1256; CHECK-NEXT: bx lr
1257entry:
1258 %xx = zext <4 x i8> %x to <4 x i32>
1259 %yy = zext <4 x i8> %y to <4 x i32>
1260 %m = mul <4 x i32> %xx, %yy
1261 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
1262 %r = add i32 %z, %a
1263 ret i32 %r
1264}
1265
1266define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, <4 x i8> %y, i32 %a) {
1267; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
1268; CHECK: @ %bb.0: @ %entry
1269; CHECK-NEXT: vmovlb.s8 q1, q1
1270; CHECK-NEXT: vmovlb.s8 q0, q0
1271; CHECK-NEXT: vmovlb.s16 q1, q1
1272; CHECK-NEXT: vmovlb.s16 q0, q0
1273; CHECK-NEXT: vmlava.u32 r0, q0, q1
1274; CHECK-NEXT: bx lr
1275entry:
1276 %xx = sext <4 x i8> %x to <4 x i32>
1277 %yy = sext <4 x i8> %y to <4 x i32>
1278 %m = mul <4 x i32> %xx, %yy
1279 %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
1280 %r = add i32 %z, %a
1281 ret i32 %r
1282}
1283
1284define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %y, i16 %a) {
1285; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
1286; CHECK: @ %bb.0: @ %entry
1287; CHECK-NEXT: vmov.u8 r1, q1[8]
1288; CHECK-NEXT: vmov.16 q2[0], r1
1289; CHECK-NEXT: vmov.u8 r1, q1[9]
1290; CHECK-NEXT: vmov.16 q2[1], r1
1291; CHECK-NEXT: vmov.u8 r1, q1[10]
1292; CHECK-NEXT: vmov.16 q2[2], r1
1293; CHECK-NEXT: vmov.u8 r1, q1[11]
1294; CHECK-NEXT: vmov.16 q2[3], r1
1295; CHECK-NEXT: vmov.u8 r1, q1[12]
1296; CHECK-NEXT: vmov.16 q2[4], r1
1297; CHECK-NEXT: vmov.u8 r1, q1[13]
1298; CHECK-NEXT: vmov.16 q2[5], r1
1299; CHECK-NEXT: vmov.u8 r1, q1[14]
1300; CHECK-NEXT: vmov.16 q2[6], r1
1301; CHECK-NEXT: vmov.u8 r1, q1[15]
1302; CHECK-NEXT: vmov.16 q2[7], r1
1303; CHECK-NEXT: vmov.u8 r1, q0[8]
1304; CHECK-NEXT: vmov.16 q3[0], r1
1305; CHECK-NEXT: vmov.u8 r1, q0[9]
1306; CHECK-NEXT: vmov.16 q3[1], r1
1307; CHECK-NEXT: vmov.u8 r1, q0[10]
1308; CHECK-NEXT: vmov.16 q3[2], r1
1309; CHECK-NEXT: vmov.u8 r1, q0[11]
1310; CHECK-NEXT: vmov.16 q3[3], r1
1311; CHECK-NEXT: vmov.u8 r1, q0[12]
1312; CHECK-NEXT: vmov.16 q3[4], r1
1313; CHECK-NEXT: vmov.u8 r1, q0[13]
1314; CHECK-NEXT: vmov.16 q3[5], r1
1315; CHECK-NEXT: vmov.u8 r1, q0[14]
1316; CHECK-NEXT: vmov.16 q3[6], r1
1317; CHECK-NEXT: vmov.u8 r1, q0[15]
1318; CHECK-NEXT: vmov.16 q3[7], r1
1319; CHECK-NEXT: vmovlb.u8 q2, q2
1320; CHECK-NEXT: vmovlb.u8 q3, q3
1321; CHECK-NEXT: vmov.u8 r1, q1[0]
1322; CHECK-NEXT: vmul.i16 q2, q3, q2
1323; CHECK-NEXT: vmov.16 q3[0], r1
1324; CHECK-NEXT: vmov.u8 r1, q1[1]
1325; CHECK-NEXT: vmov.16 q3[1], r1
1326; CHECK-NEXT: vmov.u8 r1, q1[2]
1327; CHECK-NEXT: vmov.16 q3[2], r1
1328; CHECK-NEXT: vmov.u8 r1, q1[3]
1329; CHECK-NEXT: vmov.16 q3[3], r1
1330; CHECK-NEXT: vmov.u8 r1, q1[4]
1331; CHECK-NEXT: vmov.16 q3[4], r1
1332; CHECK-NEXT: vmov.u8 r1, q1[5]
1333; CHECK-NEXT: vmov.16 q3[5], r1
1334; CHECK-NEXT: vmov.u8 r1, q1[6]
1335; CHECK-NEXT: vmov.16 q3[6], r1
1336; CHECK-NEXT: vmov.u8 r1, q1[7]
1337; CHECK-NEXT: vmov.16 q3[7], r1
1338; CHECK-NEXT: vmov.u8 r1, q0[0]
1339; CHECK-NEXT: vmovlb.u8 q1, q3
1340; CHECK-NEXT: vmov.16 q3[0], r1
1341; CHECK-NEXT: vmov.u8 r1, q0[1]
1342; CHECK-NEXT: vmov.16 q3[1], r1
1343; CHECK-NEXT: vmov.u8 r1, q0[2]
1344; CHECK-NEXT: vmov.16 q3[2], r1
1345; CHECK-NEXT: vmov.u8 r1, q0[3]
1346; CHECK-NEXT: vmov.16 q3[3], r1
1347; CHECK-NEXT: vmov.u8 r1, q0[4]
1348; CHECK-NEXT: vmov.16 q3[4], r1
1349; CHECK-NEXT: vmov.u8 r1, q0[5]
1350; CHECK-NEXT: vmov.16 q3[5], r1
1351; CHECK-NEXT: vmov.u8 r1, q0[6]
1352; CHECK-NEXT: vmov.16 q3[6], r1
1353; CHECK-NEXT: vmov.u8 r1, q0[7]
1354; CHECK-NEXT: vmov.16 q3[7], r1
1355; CHECK-NEXT: vmovlb.u8 q0, q3
1356; CHECK-NEXT: vmul.i16 q0, q0, q1
1357; CHECK-NEXT: vadd.i16 q0, q0, q2
1358; CHECK-NEXT: vaddva.u16 r0, q0
1359; CHECK-NEXT: uxth r0, r0
1360; CHECK-NEXT: bx lr
1361entry:
1362 %xx = zext <16 x i8> %x to <16 x i16>
1363 %yy = zext <16 x i8> %y to <16 x i16>
1364 %m = mul <16 x i16> %xx, %yy
1365 %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
1366 %r = add i16 %z, %a
1367 ret i16 %r
1368}
1369
1370define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %y, i16 %a) {
1371; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
1372; CHECK: @ %bb.0: @ %entry
1373; CHECK-NEXT: vmov.u8 r1, q1[8]
1374; CHECK-NEXT: vmov.16 q2[0], r1
1375; CHECK-NEXT: vmov.u8 r1, q1[9]
1376; CHECK-NEXT: vmov.16 q2[1], r1
1377; CHECK-NEXT: vmov.u8 r1, q1[10]
1378; CHECK-NEXT: vmov.16 q2[2], r1
1379; CHECK-NEXT: vmov.u8 r1, q1[11]
1380; CHECK-NEXT: vmov.16 q2[3], r1
1381; CHECK-NEXT: vmov.u8 r1, q1[12]
1382; CHECK-NEXT: vmov.16 q2[4], r1
1383; CHECK-NEXT: vmov.u8 r1, q1[13]
1384; CHECK-NEXT: vmov.16 q2[5], r1
1385; CHECK-NEXT: vmov.u8 r1, q1[14]
1386; CHECK-NEXT: vmov.16 q2[6], r1
1387; CHECK-NEXT: vmov.u8 r1, q1[15]
1388; CHECK-NEXT: vmov.16 q2[7], r1
1389; CHECK-NEXT: vmov.u8 r1, q0[8]
1390; CHECK-NEXT: vmov.16 q3[0], r1
1391; CHECK-NEXT: vmov.u8 r1, q0[9]
1392; CHECK-NEXT: vmov.16 q3[1], r1
1393; CHECK-NEXT: vmov.u8 r1, q0[10]
1394; CHECK-NEXT: vmov.16 q3[2], r1
1395; CHECK-NEXT: vmov.u8 r1, q0[11]
1396; CHECK-NEXT: vmov.16 q3[3], r1
1397; CHECK-NEXT: vmov.u8 r1, q0[12]
1398; CHECK-NEXT: vmov.16 q3[4], r1
1399; CHECK-NEXT: vmov.u8 r1, q0[13]
1400; CHECK-NEXT: vmov.16 q3[5], r1
1401; CHECK-NEXT: vmov.u8 r1, q0[14]
1402; CHECK-NEXT: vmov.16 q3[6], r1
1403; CHECK-NEXT: vmov.u8 r1, q0[15]
1404; CHECK-NEXT: vmov.16 q3[7], r1
1405; CHECK-NEXT: vmovlb.s8 q2, q2
1406; CHECK-NEXT: vmovlb.s8 q3, q3
1407; CHECK-NEXT: vmov.u8 r1, q1[0]
1408; CHECK-NEXT: vmul.i16 q2, q3, q2
1409; CHECK-NEXT: vmov.16 q3[0], r1
1410; CHECK-NEXT: vmov.u8 r1, q1[1]
1411; CHECK-NEXT: vmov.16 q3[1], r1
1412; CHECK-NEXT: vmov.u8 r1, q1[2]
1413; CHECK-NEXT: vmov.16 q3[2], r1
1414; CHECK-NEXT: vmov.u8 r1, q1[3]
1415; CHECK-NEXT: vmov.16 q3[3], r1
1416; CHECK-NEXT: vmov.u8 r1, q1[4]
1417; CHECK-NEXT: vmov.16 q3[4], r1
1418; CHECK-NEXT: vmov.u8 r1, q1[5]
1419; CHECK-NEXT: vmov.16 q3[5], r1
1420; CHECK-NEXT: vmov.u8 r1, q1[6]
1421; CHECK-NEXT: vmov.16 q3[6], r1
1422; CHECK-NEXT: vmov.u8 r1, q1[7]
1423; CHECK-NEXT: vmov.16 q3[7], r1
1424; CHECK-NEXT: vmov.u8 r1, q0[0]
1425; CHECK-NEXT: vmovlb.s8 q1, q3
1426; CHECK-NEXT: vmov.16 q3[0], r1
1427; CHECK-NEXT: vmov.u8 r1, q0[1]
1428; CHECK-NEXT: vmov.16 q3[1], r1
1429; CHECK-NEXT: vmov.u8 r1, q0[2]
1430; CHECK-NEXT: vmov.16 q3[2], r1
1431; CHECK-NEXT: vmov.u8 r1, q0[3]
1432; CHECK-NEXT: vmov.16 q3[3], r1
1433; CHECK-NEXT: vmov.u8 r1, q0[4]
1434; CHECK-NEXT: vmov.16 q3[4], r1
1435; CHECK-NEXT: vmov.u8 r1, q0[5]
1436; CHECK-NEXT: vmov.16 q3[5], r1
1437; CHECK-NEXT: vmov.u8 r1, q0[6]
1438; CHECK-NEXT: vmov.16 q3[6], r1
1439; CHECK-NEXT: vmov.u8 r1, q0[7]
1440; CHECK-NEXT: vmov.16 q3[7], r1
1441; CHECK-NEXT: vmovlb.s8 q0, q3
1442; CHECK-NEXT: vmul.i16 q0, q0, q1
1443; CHECK-NEXT: vadd.i16 q0, q0, q2
1444; CHECK-NEXT: vaddva.u16 r0, q0
1445; CHECK-NEXT: sxth r0, r0
1446; CHECK-NEXT: bx lr
1447entry:
1448 %xx = sext <16 x i8> %x to <16 x i16>
1449 %yy = sext <16 x i8> %y to <16 x i16>
1450 %m = mul <16 x i16> %xx, %yy
1451 %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
1452 %r = add i16 %z, %a
1453 ret i16 %r
1454}
1455
1456define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
1457; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
1458; CHECK: @ %bb.0: @ %entry
1459; CHECK-NEXT: vmovlb.u8 q1, q1
1460; CHECK-NEXT: vmovlb.u8 q0, q0
1461; CHECK-NEXT: vmul.i16 q0, q0, q1
1462; CHECK-NEXT: vaddva.u16 r0, q0
1463; CHECK-NEXT: uxth r0, r0
1464; CHECK-NEXT: bx lr
1465entry:
1466 %xx = zext <8 x i8> %x to <8 x i16>
1467 %yy = zext <8 x i8> %y to <8 x i16>
1468 %m = mul <8 x i16> %xx, %yy
1469 %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
1470 %r = add i16 %z, %a
1471 ret i16 %r
1472}
1473
1474define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
1475; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
1476; CHECK: @ %bb.0: @ %entry
1477; CHECK-NEXT: vmovlb.s8 q1, q1
1478; CHECK-NEXT: vmovlb.s8 q0, q0
1479; CHECK-NEXT: vmul.i16 q0, q0, q1
1480; CHECK-NEXT: vaddva.u16 r0, q0
1481; CHECK-NEXT: sxth r0, r0
1482; CHECK-NEXT: bx lr
1483entry:
1484 %xx = sext <8 x i8> %x to <8 x i16>
1485 %yy = sext <8 x i8> %y to <8 x i16>
1486 %m = mul <8 x i16> %xx, %yy
1487 %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
1488 %r = add i16 %z, %a
1489 ret i16 %r
1490}
1491
1492define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %y, i8 %a) {
1493; CHECK-LABEL: add_v16i8_v16i8_acc:
1494; CHECK: @ %bb.0: @ %entry
1495; CHECK-NEXT: vmul.i8 q0, q0, q1
1496; CHECK-NEXT: vaddva.u8 r0, q0
1497; CHECK-NEXT: uxtb r0, r0
1498; CHECK-NEXT: bx lr
1499entry:
1500 %m = mul <16 x i8> %x, %y
1501 %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %m)
1502 %r = add i8 %z, %a
1503 ret i8 %r
1504}
1505
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vmov.u8 r2, q1[0]
; CHECK-NEXT: vmov.u8 r3, q0[0]
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[1]
; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: adr r2, .LCPI51_0
; CHECK-NEXT: vldrw.u32 q2, [r2]
; CHECK-NEXT: vmov.32 q4[0], r3
; CHECK-NEXT: vmov.u8 r3, q0[1]
; CHECK-NEXT: vmov.u8 r4, q0[2]
; CHECK-NEXT: vmov.32 q4[2], r3
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: umull r12, lr, r3, r2
; CHECK-NEXT: vmov r3, s16
; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: vmov.32 q4[0], r4
; CHECK-NEXT: vmov.u8 r4, q0[3]
; CHECK-NEXT: vmov.32 q4[2], r4
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: orr.w lr, lr, r3
; CHECK-NEXT: vmov.u8 r3, q1[2]
; CHECK-NEXT: vmov.32 q3[0], r3
; CHECK-NEXT: vmov.u8 r3, q1[3]
; CHECK-NEXT: vmov.32 q3[2], r3
; CHECK-NEXT: add r2, r12
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vmov r3, s12
; CHECK-NEXT: umull r3, r4, r4, r3
; CHECK-NEXT: vmov.32 q5[0], r3
; CHECK-NEXT: vmov r3, s14
; CHECK-NEXT: vmov.32 q5[1], r4
; CHECK-NEXT: vmov r4, s18
; CHECK-NEXT: umull r3, r4, r4, r3
; CHECK-NEXT: vmov.32 q5[2], r3
; CHECK-NEXT: vmov.32 q5[3], r4
; CHECK-NEXT: vmov r3, s20
; CHECK-NEXT: vmov r5, s21
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, lr, r5
; CHECK-NEXT: vmov r5, s22
; CHECK-NEXT: adds.w r12, r2, r5
; CHECK-NEXT: vmov.u8 r5, q1[4]
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: vmov.u8 r4, q0[4]
; CHECK-NEXT: vmov.32 q3[0], r5
; CHECK-NEXT: vmov.u8 r5, q1[5]
; CHECK-NEXT: vmov.32 q4[0], r4
; CHECK-NEXT: vmov.u8 r4, q0[5]
; CHECK-NEXT: vmov.32 q3[2], r5
; CHECK-NEXT: vmov.32 q4[2], r4
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r5, s12
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[0], r5
; CHECK-NEXT: vmov r5, s14
; CHECK-NEXT: vmov.32 q5[1], r4
; CHECK-NEXT: vmov r4, s18
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[2], r5
; CHECK-NEXT: vmov.32 q5[3], r4
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov r5, s21
; CHECK-NEXT: adds.w r2, r2, r12
; CHECK-NEXT: adcs r3, r5
; CHECK-NEXT: vmov r5, s22
; CHECK-NEXT: adds.w r12, r2, r5
; CHECK-NEXT: vmov.u8 r5, q1[6]
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: vmov.u8 r4, q0[6]
; CHECK-NEXT: vmov.32 q3[0], r5
; CHECK-NEXT: vmov.u8 r5, q1[7]
; CHECK-NEXT: vmov.32 q4[0], r4
; CHECK-NEXT: vmov.u8 r4, q0[7]
; CHECK-NEXT: vmov.32 q3[2], r5
; CHECK-NEXT: vmov.32 q4[2], r4
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r5, s12
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[0], r5
; CHECK-NEXT: vmov r5, s14
; CHECK-NEXT: vmov.32 q5[1], r4
; CHECK-NEXT: vmov r4, s18
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[2], r5
; CHECK-NEXT: vmov.32 q5[3], r4
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov r5, s21
; CHECK-NEXT: adds.w r2, r2, r12
; CHECK-NEXT: adcs r3, r5
; CHECK-NEXT: vmov r5, s22
; CHECK-NEXT: adds.w r12, r2, r5
; CHECK-NEXT: vmov.u8 r5, q1[8]
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: vmov.u8 r4, q0[8]
; CHECK-NEXT: vmov.32 q3[0], r5
; CHECK-NEXT: vmov.u8 r5, q1[9]
; CHECK-NEXT: vmov.32 q4[0], r4
; CHECK-NEXT: vmov.u8 r4, q0[9]
; CHECK-NEXT: vmov.32 q3[2], r5
; CHECK-NEXT: vmov.32 q4[2], r4
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r5, s12
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[0], r5
; CHECK-NEXT: vmov r5, s14
; CHECK-NEXT: vmov.32 q5[1], r4
; CHECK-NEXT: vmov r4, s18
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[2], r5
; CHECK-NEXT: vmov.32 q5[3], r4
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov r5, s21
; CHECK-NEXT: adds.w r2, r2, r12
; CHECK-NEXT: adcs r3, r5
; CHECK-NEXT: vmov r5, s22
; CHECK-NEXT: adds.w r12, r2, r5
; CHECK-NEXT: vmov.u8 r5, q1[10]
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: vmov.u8 r4, q0[10]
; CHECK-NEXT: vmov.32 q3[0], r5
; CHECK-NEXT: vmov.u8 r5, q1[11]
; CHECK-NEXT: vmov.32 q4[0], r4
; CHECK-NEXT: vmov.u8 r4, q0[11]
; CHECK-NEXT: vmov.32 q3[2], r5
; CHECK-NEXT: vmov.32 q4[2], r4
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r5, s12
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[0], r5
; CHECK-NEXT: vmov r5, s14
; CHECK-NEXT: vmov.32 q5[1], r4
; CHECK-NEXT: vmov r4, s18
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[2], r5
; CHECK-NEXT: vmov.32 q5[3], r4
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov r5, s21
; CHECK-NEXT: adds.w r2, r2, r12
; CHECK-NEXT: adcs r3, r5
; CHECK-NEXT: vmov r5, s22
; CHECK-NEXT: adds.w r12, r2, r5
; CHECK-NEXT: vmov.u8 r5, q1[12]
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: vmov.u8 r4, q0[12]
; CHECK-NEXT: vmov.32 q3[0], r5
; CHECK-NEXT: vmov.u8 r5, q1[13]
; CHECK-NEXT: vmov.32 q4[0], r4
; CHECK-NEXT: vmov.u8 r4, q0[13]
; CHECK-NEXT: vmov.32 q3[2], r5
; CHECK-NEXT: vmov.32 q4[2], r4
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r5, s12
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[0], r5
; CHECK-NEXT: vmov r5, s14
; CHECK-NEXT: vmov.32 q5[1], r4
; CHECK-NEXT: vmov r4, s18
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[2], r5
; CHECK-NEXT: vmov.32 q5[3], r4
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov r5, s21
; CHECK-NEXT: adds.w r2, r2, r12
; CHECK-NEXT: adcs r3, r5
; CHECK-NEXT: vmov r5, s22
; CHECK-NEXT: adds r2, r2, r5
; CHECK-NEXT: vmov.u8 r5, q1[14]
; CHECK-NEXT: vmov.32 q3[0], r5
; CHECK-NEXT: vmov.u8 r5, q1[15]
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: vmov.32 q3[2], r5
; CHECK-NEXT: vmov.u8 r4, q0[14]
; CHECK-NEXT: vand q1, q3, q2
; CHECK-NEXT: vmov.32 q3[0], r4
; CHECK-NEXT: vmov.u8 r4, q0[15]
; CHECK-NEXT: vmov.32 q3[2], r4
; CHECK-NEXT: vmov r5, s4
; CHECK-NEXT: vand q0, q3, q2
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: umlal r2, r3, r4, r5
; CHECK-NEXT: vmov r5, s6
; CHECK-NEXT: vmov r4, s2
; CHECK-NEXT: umlal r2, r3, r4, r5
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI51_0:
; CHECK-NEXT: .long 255 @ 0xff
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 255 @ 0xff
; CHECK-NEXT: .long 0 @ 0x0
entry:
  %xx = zext <16 x i8> %x to <16 x i64>
  %yy = zext <16 x i8> %y to <16 x i64>
  %m = mul <16 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

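; Sign-extended variant of the test above: lanes are sign-extended with sxtb and
; the i64 products are accumulated with scalar smull/smlal.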
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: vmov.u8 r2, q1[0]
; CHECK-NEXT: vmov.u8 r3, q0[0]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[1]
; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: vmov.u8 r3, q0[1]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r3
; CHECK-NEXT: vmov lr, s10
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: vmov r12, s9
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov.u8 r2, q1[2]
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov.u8 r3, q0[2]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[3]
; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: vmov.u8 r3, q0[3]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r3
; CHECK-NEXT: vmov r4, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r4, r4, lr
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds.w lr, r4, r2
; CHECK-NEXT: vmov.u8 r4, q1[4]
; CHECK-NEXT: vmov.u8 r2, q0[4]
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: smull r2, r4, r2, r4
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[5]
; CHECK-NEXT: vmov.32 q2[1], r4
; CHECK-NEXT: vmov.u8 r4, q0[5]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: smull r2, r4, r4, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r4
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds.w lr, r3, r2
; CHECK-NEXT: vmov.u8 r2, q0[6]
; CHECK-NEXT: adc.w r12, r12, r4
; CHECK-NEXT: vmov.u8 r4, q1[6]
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: smull r2, r4, r2, r4
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[7]
; CHECK-NEXT: vmov.32 q2[1], r4
; CHECK-NEXT: vmov.u8 r4, q0[7]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: smull r2, r4, r4, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r4
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds.w lr, r3, r2
; CHECK-NEXT: vmov.u8 r2, q0[8]
; CHECK-NEXT: adc.w r12, r12, r4
; CHECK-NEXT: vmov.u8 r4, q1[8]
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: smull r2, r4, r2, r4
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[9]
; CHECK-NEXT: vmov.32 q2[1], r4
; CHECK-NEXT: vmov.u8 r4, q0[9]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: smull r2, r4, r4, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r4
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds.w lr, r3, r2
; CHECK-NEXT: vmov.u8 r2, q0[10]
; CHECK-NEXT: adc.w r12, r12, r4
; CHECK-NEXT: vmov.u8 r4, q1[10]
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: smull r2, r4, r2, r4
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[11]
; CHECK-NEXT: vmov.32 q2[1], r4
; CHECK-NEXT: vmov.u8 r4, q0[11]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: smull r2, r4, r4, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r4
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds.w lr, r3, r2
; CHECK-NEXT: vmov.u8 r2, q0[12]
; CHECK-NEXT: adc.w r12, r12, r4
; CHECK-NEXT: vmov.u8 r4, q1[12]
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: smull r2, r4, r2, r4
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[13]
; CHECK-NEXT: vmov.32 q2[1], r4
; CHECK-NEXT: vmov.u8 r4, q0[13]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: smull r2, r4, r4, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r4
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, r12, r4
; CHECK-NEXT: vmov.u8 r4, q1[14]
; CHECK-NEXT: sxtb.w r12, r4
; CHECK-NEXT: vmov.u8 r4, q0[14]
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: smlal r2, r3, r4, r12
; CHECK-NEXT: vmov.u8 r4, q1[15]
; CHECK-NEXT: sxtb.w r12, r4
; CHECK-NEXT: vmov.u8 r4, q0[15]
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: smlal r2, r3, r4, r12
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r4, pc}
entry:
  %xx = sext <16 x i8> %x to <16 x i64>
  %yy = sext <16 x i8> %y to <16 x i64>
  %m = mul <16 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

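; With only two i8 lanes, the zero extension is a vand with the 0xff mask loaded
; from the constant pool, followed by one scalar umull per lane.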
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: adr r2, .LCPI53_0
; CHECK-NEXT: vldrw.u32 q2, [r2]
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: umull r12, lr, r3, r2
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: add r2, r12
; CHECK-NEXT: orr.w r3, r3, lr
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI53_0:
; CHECK-NEXT: .long 255 @ 0xff
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 255 @ 0xff
; CHECK-NEXT: .long 0 @ 0x0
entry:
  %xx = zext <2 x i8> %x to <2 x i64>
  %yy = zext <2 x i8> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r12, r3, r2
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: sxtb.w lr, r3
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smlal r2, r12, r3, lr
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r12
; CHECK-NEXT: pop {r7, pc}
entry:
  %xx = sext <2 x i8> %x to <2 x i64>
  %yy = sext <2 x i8> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

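; Plain 64-bit lanes: each i64 x i64 product is built from a umull plus two mla
; instructions, and the final sum is added into the accumulator in r0:r1.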
define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, i64 %a) {
; CHECK-LABEL: add_v2i64_v2i64_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r4, s5
; CHECK-NEXT: vmov r6, s7
; CHECK-NEXT: umull r12, lr, r3, r2
; CHECK-NEXT: mla r3, r3, r4, lr
; CHECK-NEXT: vmov r4, s1
; CHECK-NEXT: vmov.32 q2[0], r12
; CHECK-NEXT: mla r2, r4, r2, r3
; CHECK-NEXT: vmov r4, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: vmov.32 q2[1], r2
; CHECK-NEXT: vmov r12, s8
; CHECK-NEXT: umull lr, r5, r3, r4
; CHECK-NEXT: mla r3, r3, r6, r5
; CHECK-NEXT: vmov r5, s3
; CHECK-NEXT: adds.w r6, r12, lr
; CHECK-NEXT: mla r3, r5, r4, r3
; CHECK-NEXT: adcs r2, r3
; CHECK-NEXT: adds r0, r0, r6
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
  %m = mul <2 x i64> %x, %y
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)