; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
3
; Reduce-add of an element-wise <4 x i32> multiply; a single vmlav.u32 is expected.
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %m = mul <4 x i32> %x, %y
  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
  ret i32 %z
}
14
; Reduce-add of a <4 x i64> multiply whose operands are zero-extended from
; <4 x i32>; a single long multiply-accumulate (vmlalv.u32) is expected.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalv.u32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i32> %x to <4 x i64>
  %yy = zext <4 x i32> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
  ret i64 %z
}
27
; Reduce-add of a <4 x i64> multiply whose operands are sign-extended from
; <4 x i32>; a single long multiply-accumulate (vmlalv.s32) is expected.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_v4i32_v4i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalv.s32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i32> %x to <4 x i64>
  %yy = sext <4 x i32> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
  ret i64 %z
}
40
; Reduce-add of a <2 x i64> multiply whose operands are zero-extended from
; <2 x i32>; the expected output is a scalar umull/umlal pair.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    umull r0, r1, r1, r0
; CHECK-NEXT:    umlal r0, r1, r3, r2
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <2 x i32> %x to <2 x i64>
  %yy = zext <2 x i32> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  ret i64 %z
}
58
; Reduce-add of a <2 x i64> multiply whose operands are sign-extended from
; <2 x i32>; the expected output is a scalar smull/smlal pair.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    smull r0, r1, r1, r0
; CHECK-NEXT:    smlal r0, r1, r3, r2
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i32> %x to <2 x i64>
  %yy = sext <2 x i32> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  ret i64 %z
}
76
; Reduce-add of an <8 x i32> multiply whose operands are zero-extended from
; <8 x i16>; a single vmlav.u16 is expected.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.u16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i32>
  %yy = zext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
  ret i32 %z
}
89
; Reduce-add of an <8 x i32> multiply whose operands are sign-extended from
; <8 x i16>; a single vmlav.s16 is expected.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.s16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i32>
  %yy = sext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
  ret i32 %z
}
102
; Reduce-add of a <4 x i32> multiply whose operands are zero-extended from
; <4 x i16>; both inputs are widened with vmovlb.u16 before a vmlav.u32.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i16_v4i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vmlav.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i16> %x to <4 x i32>
  %yy = zext <4 x i16> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
  ret i32 %z
}
117
; Reduce-add of a <4 x i32> multiply whose operands are sign-extended from
; <4 x i16>; both inputs are widened with vmovlb.s16 before a vmlav.u32.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i16_v4i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmlav.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i16> %x to <4 x i32>
  %yy = sext <4 x i16> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
  ret i32 %z
}
132
; Reduce-add of an element-wise <8 x i16> multiply with a zeroext i16 result;
; the expected output is a separate vmul + vaddv followed by uxth.
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.i16 q0, q0, q1
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %m = mul <8 x i16> %x, %y
  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
  ret i16 %z
}
145
; Reduce-add of an <8 x i64> multiply whose operands are zero-extended from
; <8 x i16>; a single vmlalv.u16 is expected.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalv.u16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i64>
  %yy = zext <8 x i16> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
  ret i64 %z
}
158
; Reduce-add of an <8 x i64> multiply whose operands are sign-extended from
; <8 x i16>; a single vmlalv.s16 is expected.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalv.s16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i64>
  %yy = sext <8 x i16> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
  ret i64 %z
}
171
; Reduce-add of a <2 x i64> multiply whose operands are zero-extended from
; <2 x i16>; inputs are masked with 0xffff, then a scalar umull/umlal pair.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-LABEL: add_v2i16_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i64 q2, #0xffff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    umull r0, r1, r1, r0
; CHECK-NEXT:    umlal r0, r1, r3, r2
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <2 x i16> %x to <2 x i64>
  %yy = zext <2 x i16> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  ret i64 %z
}
192
; Reduce-add of a <2 x i64> multiply whose operands are sign-extended from
; <2 x i16>; lanes are sign-extended with sxth, then a scalar smull/smlal pair.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    sxth r1, r1
; CHECK-NEXT:    smull r0, r1, r1, r0
; CHECK-NEXT:    sxth r2, r2
; CHECK-NEXT:    sxth r3, r3
; CHECK-NEXT:    smlal r0, r1, r3, r2
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i16> %x to <2 x i64>
  %yy = sext <2 x i16> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  ret i64 %z
}
214
; Reduce-add of a <16 x i32> multiply whose operands are zero-extended from
; <16 x i8>; a single vmlav.u8 is expected.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.u8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i32>
  %yy = zext <16 x i8> %y to <16 x i32>
  %m = mul <16 x i32> %xx, %yy
  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
  ret i32 %z
}
227
; Reduce-add of a <16 x i32> multiply whose operands are sign-extended from
; <16 x i8>; a single vmlav.s8 is expected.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.s8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i32>
  %yy = sext <16 x i8> %y to <16 x i32>
  %m = mul <16 x i32> %xx, %yy
  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
  ret i32 %z
}
240
; Reduce-add of a <4 x i32> multiply whose operands are zero-extended from
; <4 x i8>; inputs are masked with 0xff before a vmlav.u32.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: add_v4i8_v4i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q2, #0xff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmlav.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i8> %x to <4 x i32>
  %yy = zext <4 x i8> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
  ret i32 %z
}
256
; Reduce-add of a <4 x i32> multiply whose operands are sign-extended from
; <4 x i8>; inputs are widened via vmovlb.s8 then vmovlb.s16 before a vmlav.u32.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: add_v4i8_v4i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmlav.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i8> %x to <4 x i32>
  %yy = sext <4 x i8> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
  ret i32 %z
}
273
; Reduce-add of a <16 x i16> multiply whose operands are zero-extended from
; <16 x i8>, with a zeroext i16 result. The expected output rebuilds each half
; lane by lane, widens with vmovlb.u8, multiplies, adds the halves, then vaddv.
define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r0, q1[8]
; CHECK-NEXT:    vmov.16 q2[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[9]
; CHECK-NEXT:    vmov.16 q2[1], r0
; CHECK-NEXT:    vmov.u8 r0, q1[10]
; CHECK-NEXT:    vmov.16 q2[2], r0
; CHECK-NEXT:    vmov.u8 r0, q1[11]
; CHECK-NEXT:    vmov.16 q2[3], r0
; CHECK-NEXT:    vmov.u8 r0, q1[12]
; CHECK-NEXT:    vmov.16 q2[4], r0
; CHECK-NEXT:    vmov.u8 r0, q1[13]
; CHECK-NEXT:    vmov.16 q2[5], r0
; CHECK-NEXT:    vmov.u8 r0, q1[14]
; CHECK-NEXT:    vmov.16 q2[6], r0
; CHECK-NEXT:    vmov.u8 r0, q1[15]
; CHECK-NEXT:    vmov.16 q2[7], r0
; CHECK-NEXT:    vmov.u8 r0, q0[8]
; CHECK-NEXT:    vmov.16 q3[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[9]
; CHECK-NEXT:    vmov.16 q3[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[10]
; CHECK-NEXT:    vmov.16 q3[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[11]
; CHECK-NEXT:    vmov.16 q3[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[12]
; CHECK-NEXT:    vmov.16 q3[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[13]
; CHECK-NEXT:    vmov.16 q3[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[14]
; CHECK-NEXT:    vmov.16 q3[6], r0
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    vmov.16 q3[7], r0
; CHECK-NEXT:    vmovlb.u8 q2, q2
; CHECK-NEXT:    vmovlb.u8 q3, q3
; CHECK-NEXT:    vmov.u8 r0, q1[0]
; CHECK-NEXT:    vmul.i16 q2, q3, q2
; CHECK-NEXT:    vmov.16 q3[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[1]
; CHECK-NEXT:    vmov.16 q3[1], r0
; CHECK-NEXT:    vmov.u8 r0, q1[2]
; CHECK-NEXT:    vmov.16 q3[2], r0
; CHECK-NEXT:    vmov.u8 r0, q1[3]
; CHECK-NEXT:    vmov.16 q3[3], r0
; CHECK-NEXT:    vmov.u8 r0, q1[4]
; CHECK-NEXT:    vmov.16 q3[4], r0
; CHECK-NEXT:    vmov.u8 r0, q1[5]
; CHECK-NEXT:    vmov.16 q3[5], r0
; CHECK-NEXT:    vmov.u8 r0, q1[6]
; CHECK-NEXT:    vmov.16 q3[6], r0
; CHECK-NEXT:    vmov.u8 r0, q1[7]
; CHECK-NEXT:    vmov.16 q3[7], r0
; CHECK-NEXT:    vmov.u8 r0, q0[0]
; CHECK-NEXT:    vmovlb.u8 q1, q3
; CHECK-NEXT:    vmov.16 q3[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[1]
; CHECK-NEXT:    vmov.16 q3[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[2]
; CHECK-NEXT:    vmov.16 q3[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[3]
; CHECK-NEXT:    vmov.16 q3[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[4]
; CHECK-NEXT:    vmov.16 q3[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[5]
; CHECK-NEXT:    vmov.16 q3[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[6]
; CHECK-NEXT:    vmov.16 q3[6], r0
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    vmov.16 q3[7], r0
; CHECK-NEXT:    vmovlb.u8 q0, q3
; CHECK-NEXT:    vmul.i16 q0, q0, q1
; CHECK-NEXT:    vadd.i16 q0, q0, q2
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i16>
  %yy = zext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
  ret i16 %z
}
358
; Reduce-add of a <16 x i16> multiply whose operands are sign-extended from
; <16 x i8>, with a signext i16 result. The expected output rebuilds each half
; lane by lane, widens with vmovlb.s8, multiplies, adds the halves, then vaddv.
define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r0, q1[8]
; CHECK-NEXT:    vmov.16 q2[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[9]
; CHECK-NEXT:    vmov.16 q2[1], r0
; CHECK-NEXT:    vmov.u8 r0, q1[10]
; CHECK-NEXT:    vmov.16 q2[2], r0
; CHECK-NEXT:    vmov.u8 r0, q1[11]
; CHECK-NEXT:    vmov.16 q2[3], r0
; CHECK-NEXT:    vmov.u8 r0, q1[12]
; CHECK-NEXT:    vmov.16 q2[4], r0
; CHECK-NEXT:    vmov.u8 r0, q1[13]
; CHECK-NEXT:    vmov.16 q2[5], r0
; CHECK-NEXT:    vmov.u8 r0, q1[14]
; CHECK-NEXT:    vmov.16 q2[6], r0
; CHECK-NEXT:    vmov.u8 r0, q1[15]
; CHECK-NEXT:    vmov.16 q2[7], r0
; CHECK-NEXT:    vmov.u8 r0, q0[8]
; CHECK-NEXT:    vmov.16 q3[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[9]
; CHECK-NEXT:    vmov.16 q3[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[10]
; CHECK-NEXT:    vmov.16 q3[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[11]
; CHECK-NEXT:    vmov.16 q3[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[12]
; CHECK-NEXT:    vmov.16 q3[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[13]
; CHECK-NEXT:    vmov.16 q3[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[14]
; CHECK-NEXT:    vmov.16 q3[6], r0
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    vmov.16 q3[7], r0
; CHECK-NEXT:    vmovlb.s8 q2, q2
; CHECK-NEXT:    vmovlb.s8 q3, q3
; CHECK-NEXT:    vmov.u8 r0, q1[0]
; CHECK-NEXT:    vmul.i16 q2, q3, q2
; CHECK-NEXT:    vmov.16 q3[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[1]
; CHECK-NEXT:    vmov.16 q3[1], r0
; CHECK-NEXT:    vmov.u8 r0, q1[2]
; CHECK-NEXT:    vmov.16 q3[2], r0
; CHECK-NEXT:    vmov.u8 r0, q1[3]
; CHECK-NEXT:    vmov.16 q3[3], r0
; CHECK-NEXT:    vmov.u8 r0, q1[4]
; CHECK-NEXT:    vmov.16 q3[4], r0
; CHECK-NEXT:    vmov.u8 r0, q1[5]
; CHECK-NEXT:    vmov.16 q3[5], r0
; CHECK-NEXT:    vmov.u8 r0, q1[6]
; CHECK-NEXT:    vmov.16 q3[6], r0
; CHECK-NEXT:    vmov.u8 r0, q1[7]
; CHECK-NEXT:    vmov.16 q3[7], r0
; CHECK-NEXT:    vmov.u8 r0, q0[0]
; CHECK-NEXT:    vmovlb.s8 q1, q3
; CHECK-NEXT:    vmov.16 q3[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[1]
; CHECK-NEXT:    vmov.16 q3[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[2]
; CHECK-NEXT:    vmov.16 q3[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[3]
; CHECK-NEXT:    vmov.16 q3[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[4]
; CHECK-NEXT:    vmov.16 q3[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[5]
; CHECK-NEXT:    vmov.16 q3[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[6]
; CHECK-NEXT:    vmov.16 q3[6], r0
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    vmov.16 q3[7], r0
; CHECK-NEXT:    vmovlb.s8 q0, q3
; CHECK-NEXT:    vmul.i16 q0, q0, q1
; CHECK-NEXT:    vadd.i16 q0, q0, q2
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i16>
  %yy = sext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
  ret i16 %z
}
443
; Reduce-add of an <8 x i16> multiply whose operands are zero-extended from
; <8 x i8>, with a zeroext i16 result; vmovlb.u8 widening, vmul, vaddv, uxth.
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vmul.i16 q0, q0, q1
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i16>
  %yy = zext <8 x i8> %y to <8 x i16>
  %m = mul <8 x i16> %xx, %yy
  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
  ret i16 %z
}
460
; Reduce-add of an <8 x i16> multiply whose operands are sign-extended from
; <8 x i8>, with a signext i16 result; vmovlb.s8 widening, vmul, vaddv, sxth.
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmul.i16 q0, q0, q1
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i16>
  %yy = sext <8 x i8> %y to <8 x i16>
  %m = mul <8 x i16> %xx, %yy
  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
  ret i16 %z
}
477
; Reduce-add of an element-wise <16 x i8> multiply with a zeroext i8 result;
; the expected output is a separate vmul + vaddv followed by uxtb.
define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.i8 q0, q0, q1
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %m = mul <16 x i8> %x, %y
  %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %m)
  ret i8 %z
}
490
; Reduce-add of a <16 x i64> multiply whose operands are zero-extended from
; <16 x i8>. The expected output is a scalarised sequence: lanes are extracted
; two at a time, masked with 0xff, multiplied with umull/umlal and accumulated
; into r0:r1 with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vmov.u8 r0, q1[0]
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    vmov.32 q3[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[1]
; CHECK-NEXT:    vmov.32 q4[0], r1
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    vmov.32 q3[2], r0
; CHECK-NEXT:    vmov.i64 q2, #0xff
; CHECK-NEXT:    vmov.32 q4[2], r1
; CHECK-NEXT:    vand q3, q3, q2
; CHECK-NEXT:    vand q4, q4, q2
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov r1, s18
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov r3, s16
; CHECK-NEXT:    umull r12, r1, r1, r0
; CHECK-NEXT:    vmov.u8 r0, q0[2]
; CHECK-NEXT:    vmov.32 q4[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[3]
; CHECK-NEXT:    vmov.32 q4[2], r0
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    vand q4, q4, q2
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    orr.w lr, r3, r1
; CHECK-NEXT:    vmov.u8 r3, q1[2]
; CHECK-NEXT:    vmov.32 q3[0], r3
; CHECK-NEXT:    vmov.u8 r3, q1[3]
; CHECK-NEXT:    vmov.32 q3[2], r3
; CHECK-NEXT:    add r2, r12
; CHECK-NEXT:    vand q3, q3, q2
; CHECK-NEXT:    vmov r3, s12
; CHECK-NEXT:    umull r0, r3, r0, r3
; CHECK-NEXT:    vmov.32 q5[0], r0
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.32 q5[1], r3
; CHECK-NEXT:    vmov r3, s18
; CHECK-NEXT:    umull r0, r3, r3, r0
; CHECK-NEXT:    vmov.32 q5[2], r0
; CHECK-NEXT:    vmov.32 q5[3], r3
; CHECK-NEXT:    vmov r1, s20
; CHECK-NEXT:    vmov r0, s21
; CHECK-NEXT:    adds r1, r1, r2
; CHECK-NEXT:    adc.w r2, lr, r0
; CHECK-NEXT:    vmov r0, s22
; CHECK-NEXT:    adds.w r12, r1, r0
; CHECK-NEXT:    adc.w r1, r2, r3
; CHECK-NEXT:    vmov.u8 r2, q1[4]
; CHECK-NEXT:    vmov.u8 r3, q0[4]
; CHECK-NEXT:    vmov.32 q3[0], r2
; CHECK-NEXT:    vmov.u8 r2, q1[5]
; CHECK-NEXT:    vmov.32 q4[0], r3
; CHECK-NEXT:    vmov.u8 r3, q0[5]
; CHECK-NEXT:    vmov.32 q3[2], r2
; CHECK-NEXT:    vmov.32 q4[2], r3
; CHECK-NEXT:    vand q3, q3, q2
; CHECK-NEXT:    vand q4, q4, q2
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov r3, s16
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q5[0], r2
; CHECK-NEXT:    vmov r2, s14
; CHECK-NEXT:    vmov.32 q5[1], r3
; CHECK-NEXT:    vmov r3, s18
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q5[2], r2
; CHECK-NEXT:    vmov.32 q5[3], r3
; CHECK-NEXT:    vmov r0, s20
; CHECK-NEXT:    vmov r2, s21
; CHECK-NEXT:    adds.w r0, r0, r12
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    vmov r2, s22
; CHECK-NEXT:    adds.w r12, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q1[6]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[6]
; CHECK-NEXT:    vmov.32 q3[0], r2
; CHECK-NEXT:    vmov.u8 r2, q1[7]
; CHECK-NEXT:    vmov.32 q4[0], r3
; CHECK-NEXT:    vmov.u8 r3, q0[7]
; CHECK-NEXT:    vmov.32 q3[2], r2
; CHECK-NEXT:    vmov.32 q4[2], r3
; CHECK-NEXT:    vand q3, q3, q2
; CHECK-NEXT:    vand q4, q4, q2
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov r3, s16
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q5[0], r2
; CHECK-NEXT:    vmov r2, s14
; CHECK-NEXT:    vmov.32 q5[1], r3
; CHECK-NEXT:    vmov r3, s18
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q5[2], r2
; CHECK-NEXT:    vmov.32 q5[3], r3
; CHECK-NEXT:    vmov r0, s20
; CHECK-NEXT:    vmov r2, s21
; CHECK-NEXT:    adds.w r0, r0, r12
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    vmov r2, s22
; CHECK-NEXT:    adds.w r12, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q1[8]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[8]
; CHECK-NEXT:    vmov.32 q3[0], r2
; CHECK-NEXT:    vmov.u8 r2, q1[9]
; CHECK-NEXT:    vmov.32 q4[0], r3
; CHECK-NEXT:    vmov.u8 r3, q0[9]
; CHECK-NEXT:    vmov.32 q3[2], r2
; CHECK-NEXT:    vmov.32 q4[2], r3
; CHECK-NEXT:    vand q3, q3, q2
; CHECK-NEXT:    vand q4, q4, q2
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov r3, s16
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q5[0], r2
; CHECK-NEXT:    vmov r2, s14
; CHECK-NEXT:    vmov.32 q5[1], r3
; CHECK-NEXT:    vmov r3, s18
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q5[2], r2
; CHECK-NEXT:    vmov.32 q5[3], r3
; CHECK-NEXT:    vmov r0, s20
; CHECK-NEXT:    vmov r2, s21
; CHECK-NEXT:    adds.w r0, r0, r12
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    vmov r2, s22
; CHECK-NEXT:    adds.w r12, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q1[10]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[10]
; CHECK-NEXT:    vmov.32 q3[0], r2
; CHECK-NEXT:    vmov.u8 r2, q1[11]
; CHECK-NEXT:    vmov.32 q4[0], r3
; CHECK-NEXT:    vmov.u8 r3, q0[11]
; CHECK-NEXT:    vmov.32 q3[2], r2
; CHECK-NEXT:    vmov.32 q4[2], r3
; CHECK-NEXT:    vand q3, q3, q2
; CHECK-NEXT:    vand q4, q4, q2
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov r3, s16
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q5[0], r2
; CHECK-NEXT:    vmov r2, s14
; CHECK-NEXT:    vmov.32 q5[1], r3
; CHECK-NEXT:    vmov r3, s18
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q5[2], r2
; CHECK-NEXT:    vmov.32 q5[3], r3
; CHECK-NEXT:    vmov r0, s20
; CHECK-NEXT:    vmov r2, s21
; CHECK-NEXT:    adds.w r0, r0, r12
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    vmov r2, s22
; CHECK-NEXT:    adds.w r12, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q1[12]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[12]
; CHECK-NEXT:    vmov.32 q3[0], r2
; CHECK-NEXT:    vmov.u8 r2, q1[13]
; CHECK-NEXT:    vmov.32 q4[0], r3
; CHECK-NEXT:    vmov.u8 r3, q0[13]
; CHECK-NEXT:    vmov.32 q3[2], r2
; CHECK-NEXT:    vmov.32 q4[2], r3
; CHECK-NEXT:    vand q3, q3, q2
; CHECK-NEXT:    vand q4, q4, q2
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov r3, s16
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q5[0], r2
; CHECK-NEXT:    vmov r2, s14
; CHECK-NEXT:    vmov.32 q5[1], r3
; CHECK-NEXT:    vmov r3, s18
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q5[2], r2
; CHECK-NEXT:    vmov.32 q5[3], r3
; CHECK-NEXT:    vmov r0, s20
; CHECK-NEXT:    vmov r2, s21
; CHECK-NEXT:    adds.w r0, r0, r12
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    vmov r2, s22
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q1[14]
; CHECK-NEXT:    vmov.32 q3[0], r2
; CHECK-NEXT:    vmov.u8 r2, q1[15]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.32 q3[2], r2
; CHECK-NEXT:    vmov.u8 r3, q0[14]
; CHECK-NEXT:    vand q1, q3, q2
; CHECK-NEXT:    vmov.32 q3[0], r3
; CHECK-NEXT:    vmov.u8 r3, q0[15]
; CHECK-NEXT:    vmov.32 q3[2], r3
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vand q0, q3, q2
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    umlal r0, r1, r3, r2
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    umlal r0, r1, r3, r2
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = zext <16 x i8> %x to <16 x i64>
  %yy = zext <16 x i8> %y to <16 x i64>
  %m = mul <16 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
  ret i64 %z
}
704
; Reduce-add of a <16 x i64> multiply whose operands are sign-extended from
; <16 x i8>. The expected output is a scalarised sequence: each lane is
; extracted, sign-extended with sxtb, multiplied with smull/smlal and
; accumulated into r0:r1 with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r0, q1[0]
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    sxtb r0, r0
; CHECK-NEXT:    sxtb r1, r1
; CHECK-NEXT:    smull r0, r1, r1, r0
; CHECK-NEXT:    vmov.32 q2[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[1]
; CHECK-NEXT:    vmov.32 q2[1], r1
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    sxtb r0, r0
; CHECK-NEXT:    sxtb r1, r1
; CHECK-NEXT:    smull r0, r1, r1, r0
; CHECK-NEXT:    vmov.32 q2[2], r0
; CHECK-NEXT:    vmov.32 q2[3], r1
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    vmov r3, s8
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    adds r2, r2, r3
; CHECK-NEXT:    vmov.u8 r3, q0[2]
; CHECK-NEXT:    adc.w r12, r0, r1
; CHECK-NEXT:    vmov.u8 r1, q1[2]
; CHECK-NEXT:    sxtb r1, r1
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smull r1, r3, r3, r1
; CHECK-NEXT:    vmov.32 q2[0], r1
; CHECK-NEXT:    vmov.u8 r1, q1[3]
; CHECK-NEXT:    vmov.32 q2[1], r3
; CHECK-NEXT:    vmov.u8 r3, q0[3]
; CHECK-NEXT:    sxtb r1, r1
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smull r1, r3, r3, r1
; CHECK-NEXT:    vmov.32 q2[2], r1
; CHECK-NEXT:    vmov.32 q2[3], r3
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov r1, s9
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    adc.w r1, r1, r12
; CHECK-NEXT:    adds.w r12, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q1[4]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[4]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q2[0], r2
; CHECK-NEXT:    vmov.u8 r2, q1[5]
; CHECK-NEXT:    vmov.32 q2[1], r3
; CHECK-NEXT:    vmov.u8 r3, q0[5]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q2[2], r2
; CHECK-NEXT:    vmov.32 q2[3], r3
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov r2, s9
; CHECK-NEXT:    adds.w r0, r0, r12
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    adds.w r12, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q1[6]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[6]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q2[0], r2
; CHECK-NEXT:    vmov.u8 r2, q1[7]
; CHECK-NEXT:    vmov.32 q2[1], r3
; CHECK-NEXT:    vmov.u8 r3, q0[7]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q2[2], r2
; CHECK-NEXT:    vmov.32 q2[3], r3
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov r2, s9
; CHECK-NEXT:    adds.w r0, r0, r12
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    adds.w r12, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q1[8]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[8]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q2[0], r2
; CHECK-NEXT:    vmov.u8 r2, q1[9]
; CHECK-NEXT:    vmov.32 q2[1], r3
; CHECK-NEXT:    vmov.u8 r3, q0[9]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q2[2], r2
; CHECK-NEXT:    vmov.32 q2[3], r3
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov r2, s9
; CHECK-NEXT:    adds.w r0, r0, r12
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    adds.w r12, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q1[10]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[10]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q2[0], r2
; CHECK-NEXT:    vmov.u8 r2, q1[11]
; CHECK-NEXT:    vmov.32 q2[1], r3
; CHECK-NEXT:    vmov.u8 r3, q0[11]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q2[2], r2
; CHECK-NEXT:    vmov.32 q2[3], r3
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov r2, s9
; CHECK-NEXT:    adds.w r0, r0, r12
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    adds.w r12, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q1[12]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[12]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q2[0], r2
; CHECK-NEXT:    vmov.u8 r2, q1[13]
; CHECK-NEXT:    vmov.32 q2[1], r3
; CHECK-NEXT:    vmov.u8 r3, q0[13]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smull r2, r3, r3, r2
; CHECK-NEXT:    vmov.32 q2[2], r2
; CHECK-NEXT:    vmov.32 q2[3], r3
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov r2, s9
; CHECK-NEXT:    adds.w r0, r0, r12
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q1[14]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[14]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smlal r0, r1, r3, r2
; CHECK-NEXT:    vmov.u8 r2, q1[15]
; CHECK-NEXT:    vmov.u8 r3, q0[15]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smlal r0, r1, r3, r2
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i64>
  %yy = sext <16 x i8> %y to <16 x i64>
  %m = mul <16 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
  ret i64 %z
}
871
; Reduce-add of a <2 x i64> multiply whose operands are zero-extended from
; <2 x i8>; inputs are masked with 0xff, multiplied with two umull and the
; halves combined with add/orrs.
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: add_v2i8_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i64 q2, #0xff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    umull r0, r1, r1, r0
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    orrs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <2 x i8> %x to <2 x i64>
  %yy = zext <2 x i8> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  ret i64 %z
}
894
; Reduce-add of a <2 x i64> multiply whose operands are sign-extended from
; <2 x i8>; lanes are sign-extended with sxtb, then a scalar smull/smlal pair.
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    sxtb r0, r0
; CHECK-NEXT:    sxtb r1, r1
; CHECK-NEXT:    smull r0, r1, r1, r0
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smlal r0, r1, r3, r2
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i8> %x to <2 x i64>
  %yy = sext <2 x i8> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  ret i64 %z
}
916
; Sum of lane-wise products of two <2 x i64> vectors with no extension.
; The expected code builds each 64-bit product from umull/mla in core
; registers and adds the two results with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
; CHECK-LABEL: add_v2i64_v2i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vmov r2, s5
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    umull r12, r3, r1, r0
; CHECK-NEXT:    mla r1, r1, r2, r3
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov.32 q2[0], r12
; CHECK-NEXT:    mla r1, r2, r0, r1
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov.32 q2[1], r1
; CHECK-NEXT:    vmov r12, s8
; CHECK-NEXT:    umull lr, r0, r3, r2
; CHECK-NEXT:    mla r0, r3, r4, r0
; CHECK-NEXT:    vmov r3, s3
; CHECK-NEXT:    mla r2, r3, r2, r0
; CHECK-NEXT:    adds.w r0, r12, lr
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    pop {r4, pc}
entry:
  %m = mul <2 x i64> %x, %y
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  ret i64 %z
}
947
; Sum of lane-wise products of two <4 x i32> vectors, added to the scalar
; accumulator %a. Expected to select a single accumulating vmlava.u32.
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, <4 x i32> %y, i32 %a) {
; CHECK-LABEL: add_v4i32_v4i32_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %m = mul <4 x i32> %x, %y
  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}
959
; Sum of lane-wise products of two <4 x i32> vectors zero-extended to i64,
; added to the 64-bit accumulator %a. Expected to select vmlalva.u32.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, <4 x i32> %y, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalva.u32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i32> %x to <4 x i64>
  %yy = zext <4 x i32> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}
973
; Sum of lane-wise products of two <4 x i32> vectors sign-extended to i64,
; added to the 64-bit accumulator %a. Expected to select vmlalva.s32.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, <4 x i32> %y, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalva.s32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i32> %x to <4 x i64>
  %yy = sext <4 x i32> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}
987
; Sum of lane-wise products of two <2 x i32> vectors zero-extended to i64,
; added to the 64-bit accumulator %a. The expected code multiplies in core
; registers (umull/umlal) and adds %a with adds/adc.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r12, s6
; CHECK-NEXT:    umull r2, lr, r3, r2
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    umlal r2, lr, r3, r12
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, lr
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = zext <2 x i32> %x to <2 x i64>
  %yy = zext <2 x i32> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}
1010
; Sum of lane-wise products of two <2 x i32> vectors sign-extended to i64,
; added to the 64-bit accumulator %a. The expected code multiplies in core
; registers (smull/smlal) and adds %a with adds/adc.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r12, s6
; CHECK-NEXT:    smull r2, lr, r3, r2
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    smlal r2, lr, r3, r12
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, lr
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = sext <2 x i32> %x to <2 x i64>
  %yy = sext <2 x i32> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}
1033
; Sum of lane-wise products of two <8 x i16> vectors zero-extended to i32,
; added to the accumulator %a. Expected to select vmlava.u16.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %y, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.u16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i32>
  %yy = zext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}
1047
; Sum of lane-wise products of two <8 x i16> vectors sign-extended to i32,
; added to the accumulator %a. Expected to select vmlava.s16.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %y, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.s16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i32>
  %yy = sext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}
1061
; Sum of lane-wise products of two <4 x i16> vectors zero-extended to i32,
; added to the accumulator %a. The expected code widens both inputs with
; vmovlb.u16 and then uses vmlava.u32.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, <4 x i16> %y, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vmlava.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i16> %x to <4 x i32>
  %yy = zext <4 x i16> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}
1077
; Sum of lane-wise products of two <4 x i16> vectors sign-extended to i32,
; added to the accumulator %a. The expected code widens both inputs with
; vmovlb.s16 and then uses vmlava.u32.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, <4 x i16> %y, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmlava.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i16> %x to <4 x i32>
  %yy = sext <4 x i16> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}
1093
; Sum of lane-wise products of two <8 x i16> vectors kept at i16, added to
; the i16 accumulator %a; the result is returned zeroext. The expected code is
; vmul.i16 followed by vaddva.u16, then uxth for the zeroext return.
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16> %y, i16 %a) {
; CHECK-LABEL: add_v8i16_v8i16_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.i16 q0, q0, q1
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %m = mul <8 x i16> %x, %y
  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
  %r = add i16 %z, %a
  ret i16 %r
}
1107
; Sum of lane-wise products of two <8 x i16> vectors zero-extended to i64,
; added to the 64-bit accumulator %a. Expected to select vmlalva.u16.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalva.u16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i64>
  %yy = zext <8 x i16> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}
1121
; Sum of lane-wise products of two <8 x i16> vectors sign-extended to i64,
; added to the 64-bit accumulator %a. Expected to select vmlalva.s16.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalva.s16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i64>
  %yy = sext <8 x i16> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}
1135
; Sum of lane-wise products of two <2 x i16> vectors zero-extended to i64,
; added to the 64-bit accumulator %a. The expected code masks the lanes with
; 0xffff, multiplies in core registers (umull/umlal) and adds %a.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i64 q2, #0xffff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r12, s6
; CHECK-NEXT:    umull r2, lr, r3, r2
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    umlal r2, lr, r3, r12
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, lr
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = zext <2 x i16> %x to <2 x i64>
  %yy = zext <2 x i16> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}
1161
; Sum of lane-wise products of two <2 x i16> vectors sign-extended to i64,
; added to the 64-bit accumulator %a. The expected code sign-extends each lane
; with sxth, multiplies in core registers (smull/smlal) and adds %a.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    sxth r2, r2
; CHECK-NEXT:    sxth r3, r3
; CHECK-NEXT:    smull r2, r12, r3, r2
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    sxth.w lr, r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    sxth r3, r3
; CHECK-NEXT:    smlal r2, r12, r3, lr
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r12
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = sext <2 x i16> %x to <2 x i64>
  %yy = sext <2 x i16> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}
1188
; Sum of lane-wise products of two <16 x i8> vectors zero-extended to i32,
; added to the accumulator %a. Expected to select vmlava.u8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.u8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i32>
  %yy = zext <16 x i8> %y to <16 x i32>
  %m = mul <16 x i32> %xx, %yy
  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}
1202
; Sum of lane-wise products of two <16 x i8> vectors sign-extended to i32,
; added to the accumulator %a. Expected to select vmlava.s8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.s8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i32>
  %yy = sext <16 x i8> %y to <16 x i32>
  %m = mul <16 x i32> %xx, %yy
  %z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}
1216
; Sum of lane-wise products of two <4 x i8> vectors zero-extended to i32,
; added to the accumulator %a. The expected code masks both inputs with 0xff
; and then uses vmlava.u32.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, <4 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q2, #0xff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmlava.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i8> %x to <4 x i32>
  %yy = zext <4 x i8> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}
1233
; Sum of lane-wise products of two <4 x i8> vectors sign-extended to i32,
; added to the accumulator %a. The expected code sign-extends both inputs in
; two steps (vmovlb.s8, vmovlb.s16) and then uses vmlava.u32.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, <4 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmlava.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i8> %x to <4 x i32>
  %yy = sext <4 x i8> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}
1251
; Sum of lane-wise products of two <16 x i8> vectors zero-extended to i16,
; added to the i16 accumulator %a; the result is returned zeroext. The
; expected code rebuilds the high and low halves lane by lane, widens them
; with vmovlb.u8, multiplies each half with vmul.i16, adds the halves and
; reduces with vaddva.u16.
define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r1, q1[8]
; CHECK-NEXT:    vmov.16 q2[0], r1
; CHECK-NEXT:    vmov.u8 r1, q1[9]
; CHECK-NEXT:    vmov.16 q2[1], r1
; CHECK-NEXT:    vmov.u8 r1, q1[10]
; CHECK-NEXT:    vmov.16 q2[2], r1
; CHECK-NEXT:    vmov.u8 r1, q1[11]
; CHECK-NEXT:    vmov.16 q2[3], r1
; CHECK-NEXT:    vmov.u8 r1, q1[12]
; CHECK-NEXT:    vmov.16 q2[4], r1
; CHECK-NEXT:    vmov.u8 r1, q1[13]
; CHECK-NEXT:    vmov.16 q2[5], r1
; CHECK-NEXT:    vmov.u8 r1, q1[14]
; CHECK-NEXT:    vmov.16 q2[6], r1
; CHECK-NEXT:    vmov.u8 r1, q1[15]
; CHECK-NEXT:    vmov.16 q2[7], r1
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    vmov.16 q3[0], r1
; CHECK-NEXT:    vmov.u8 r1, q0[9]
; CHECK-NEXT:    vmov.16 q3[1], r1
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    vmov.16 q3[2], r1
; CHECK-NEXT:    vmov.u8 r1, q0[11]
; CHECK-NEXT:    vmov.16 q3[3], r1
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    vmov.16 q3[4], r1
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    vmov.16 q3[5], r1
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    vmov.16 q3[6], r1
; CHECK-NEXT:    vmov.u8 r1, q0[15]
; CHECK-NEXT:    vmov.16 q3[7], r1
; CHECK-NEXT:    vmovlb.u8 q2, q2
; CHECK-NEXT:    vmovlb.u8 q3, q3
; CHECK-NEXT:    vmov.u8 r1, q1[0]
; CHECK-NEXT:    vmul.i16 q2, q3, q2
; CHECK-NEXT:    vmov.16 q3[0], r1
; CHECK-NEXT:    vmov.u8 r1, q1[1]
; CHECK-NEXT:    vmov.16 q3[1], r1
; CHECK-NEXT:    vmov.u8 r1, q1[2]
; CHECK-NEXT:    vmov.16 q3[2], r1
; CHECK-NEXT:    vmov.u8 r1, q1[3]
; CHECK-NEXT:    vmov.16 q3[3], r1
; CHECK-NEXT:    vmov.u8 r1, q1[4]
; CHECK-NEXT:    vmov.16 q3[4], r1
; CHECK-NEXT:    vmov.u8 r1, q1[5]
; CHECK-NEXT:    vmov.16 q3[5], r1
; CHECK-NEXT:    vmov.u8 r1, q1[6]
; CHECK-NEXT:    vmov.16 q3[6], r1
; CHECK-NEXT:    vmov.u8 r1, q1[7]
; CHECK-NEXT:    vmov.16 q3[7], r1
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    vmovlb.u8 q1, q3
; CHECK-NEXT:    vmov.16 q3[0], r1
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    vmov.16 q3[1], r1
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    vmov.16 q3[2], r1
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    vmov.16 q3[3], r1
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    vmov.16 q3[4], r1
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    vmov.16 q3[5], r1
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    vmov.16 q3[6], r1
; CHECK-NEXT:    vmov.u8 r1, q0[7]
; CHECK-NEXT:    vmov.16 q3[7], r1
; CHECK-NEXT:    vmovlb.u8 q0, q3
; CHECK-NEXT:    vmul.i16 q0, q0, q1
; CHECK-NEXT:    vadd.i16 q0, q0, q2
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i16>
  %yy = zext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
  %r = add i16 %z, %a
  ret i16 %r
}
1337
; Sum of lane-wise products of two <16 x i8> vectors sign-extended to i16,
; added to the i16 accumulator %a; the result is returned signext. The
; expected code rebuilds the high and low halves lane by lane, widens them
; with vmovlb.s8, multiplies each half with vmul.i16, adds the halves and
; reduces with vaddva.u16.
define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r1, q1[8]
; CHECK-NEXT:    vmov.16 q2[0], r1
; CHECK-NEXT:    vmov.u8 r1, q1[9]
; CHECK-NEXT:    vmov.16 q2[1], r1
; CHECK-NEXT:    vmov.u8 r1, q1[10]
; CHECK-NEXT:    vmov.16 q2[2], r1
; CHECK-NEXT:    vmov.u8 r1, q1[11]
; CHECK-NEXT:    vmov.16 q2[3], r1
; CHECK-NEXT:    vmov.u8 r1, q1[12]
; CHECK-NEXT:    vmov.16 q2[4], r1
; CHECK-NEXT:    vmov.u8 r1, q1[13]
; CHECK-NEXT:    vmov.16 q2[5], r1
; CHECK-NEXT:    vmov.u8 r1, q1[14]
; CHECK-NEXT:    vmov.16 q2[6], r1
; CHECK-NEXT:    vmov.u8 r1, q1[15]
; CHECK-NEXT:    vmov.16 q2[7], r1
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    vmov.16 q3[0], r1
; CHECK-NEXT:    vmov.u8 r1, q0[9]
; CHECK-NEXT:    vmov.16 q3[1], r1
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    vmov.16 q3[2], r1
; CHECK-NEXT:    vmov.u8 r1, q0[11]
; CHECK-NEXT:    vmov.16 q3[3], r1
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    vmov.16 q3[4], r1
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    vmov.16 q3[5], r1
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    vmov.16 q3[6], r1
; CHECK-NEXT:    vmov.u8 r1, q0[15]
; CHECK-NEXT:    vmov.16 q3[7], r1
; CHECK-NEXT:    vmovlb.s8 q2, q2
; CHECK-NEXT:    vmovlb.s8 q3, q3
; CHECK-NEXT:    vmov.u8 r1, q1[0]
; CHECK-NEXT:    vmul.i16 q2, q3, q2
; CHECK-NEXT:    vmov.16 q3[0], r1
; CHECK-NEXT:    vmov.u8 r1, q1[1]
; CHECK-NEXT:    vmov.16 q3[1], r1
; CHECK-NEXT:    vmov.u8 r1, q1[2]
; CHECK-NEXT:    vmov.16 q3[2], r1
; CHECK-NEXT:    vmov.u8 r1, q1[3]
; CHECK-NEXT:    vmov.16 q3[3], r1
; CHECK-NEXT:    vmov.u8 r1, q1[4]
; CHECK-NEXT:    vmov.16 q3[4], r1
; CHECK-NEXT:    vmov.u8 r1, q1[5]
; CHECK-NEXT:    vmov.16 q3[5], r1
; CHECK-NEXT:    vmov.u8 r1, q1[6]
; CHECK-NEXT:    vmov.16 q3[6], r1
; CHECK-NEXT:    vmov.u8 r1, q1[7]
; CHECK-NEXT:    vmov.16 q3[7], r1
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    vmovlb.s8 q1, q3
; CHECK-NEXT:    vmov.16 q3[0], r1
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    vmov.16 q3[1], r1
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    vmov.16 q3[2], r1
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    vmov.16 q3[3], r1
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    vmov.16 q3[4], r1
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    vmov.16 q3[5], r1
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    vmov.16 q3[6], r1
; CHECK-NEXT:    vmov.u8 r1, q0[7]
; CHECK-NEXT:    vmov.16 q3[7], r1
; CHECK-NEXT:    vmovlb.s8 q0, q3
; CHECK-NEXT:    vmul.i16 q0, q0, q1
; CHECK-NEXT:    vadd.i16 q0, q0, q2
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i16>
  %yy = sext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
  %r = add i16 %z, %a
  ret i16 %r
}
1423
; Sum of lane-wise products of two <8 x i8> vectors zero-extended to i16,
; added to the i16 accumulator %a; the result is returned zeroext. The
; expected code widens with vmovlb.u8, multiplies with vmul.i16 and reduces
; with vaddva.u16.
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vmul.i16 q0, q0, q1
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i16>
  %yy = zext <8 x i8> %y to <8 x i16>
  %m = mul <8 x i16> %xx, %yy
  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
  %r = add i16 %z, %a
  ret i16 %r
}
1441
; Sum of lane-wise products of two <8 x i8> vectors sign-extended to i16,
; added to the i16 accumulator %a; the result is returned signext. The
; expected code widens with vmovlb.s8, multiplies with vmul.i16 and reduces
; with vaddva.u16.
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmul.i16 q0, q0, q1
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i16>
  %yy = sext <8 x i8> %y to <8 x i16>
  %m = mul <8 x i16> %xx, %yy
  %z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
  %r = add i16 %z, %a
  ret i16 %r
}
1459
; Sum of lane-wise products of two <16 x i8> vectors kept at i8, added to the
; i8 accumulator %a; the result is returned zeroext. The expected code is
; vmul.i8 followed by vaddva.u8, then uxtb for the zeroext return.
define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %y, i8 %a) {
; CHECK-LABEL: add_v16i8_v16i8_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.i8 q0, q0, q1
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %m = mul <16 x i8> %x, %y
  %z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %m)
  %r = add i8 %z, %a
  ret i8 %r
}
1473
; Sum of lane-wise products of two <16 x i8> vectors zero-extended to i64,
; added to the 64-bit accumulator %a. The expected code handles the lanes two
; at a time: it extracts them, masks them with 0xff, multiplies them in core
; registers (umull/umlal) and accumulates with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vmov.u8 r2, q1[0]
; CHECK-NEXT:    vmov.u8 r3, q0[0]
; CHECK-NEXT:    vmov.32 q3[0], r2
; CHECK-NEXT:    vmov.u8 r2, q1[1]
; CHECK-NEXT:    vmov.32 q4[0], r3
; CHECK-NEXT:    vmov.u8 r3, q0[1]
; CHECK-NEXT:    vmov.32 q3[2], r2
; CHECK-NEXT:    vmov.i64 q2, #0xff
; CHECK-NEXT:    vmov.32 q4[2], r3
; CHECK-NEXT:    vand q3, q3, q2
; CHECK-NEXT:    vand q4, q4, q2
; CHECK-NEXT:    vmov r2, s14
; CHECK-NEXT:    vmov r3, s18
; CHECK-NEXT:    vmov.u8 r4, q0[2]
; CHECK-NEXT:    umull r12, lr, r3, r2
; CHECK-NEXT:    vmov r3, s16
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov.32 q4[0], r4
; CHECK-NEXT:    vmov.u8 r4, q0[3]
; CHECK-NEXT:    vmov.32 q4[2], r4
; CHECK-NEXT:    vand q4, q4, q2
; CHECK-NEXT:    vmov r4, s16
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    orr.w lr, lr, r3
; CHECK-NEXT:    vmov.u8 r3, q1[2]
; CHECK-NEXT:    vmov.32 q3[0], r3
; CHECK-NEXT:    vmov.u8 r3, q1[3]
; CHECK-NEXT:    vmov.32 q3[2], r3
; CHECK-NEXT:    add r2, r12
; CHECK-NEXT:    vand q3, q3, q2
; CHECK-NEXT:    vmov r3, s12
; CHECK-NEXT:    umull r3, r4, r4, r3
; CHECK-NEXT:    vmov.32 q5[0], r3
; CHECK-NEXT:    vmov r3, s14
; CHECK-NEXT:    vmov.32 q5[1], r4
; CHECK-NEXT:    vmov r4, s18
; CHECK-NEXT:    umull r3, r4, r4, r3
; CHECK-NEXT:    vmov.32 q5[2], r3
; CHECK-NEXT:    vmov.32 q5[3], r4
; CHECK-NEXT:    vmov r3, s20
; CHECK-NEXT:    vmov r5, s21
; CHECK-NEXT:    adds r2, r2, r3
; CHECK-NEXT:    adc.w r3, lr, r5
; CHECK-NEXT:    vmov r5, s22
; CHECK-NEXT:    adds.w r12, r2, r5
; CHECK-NEXT:    vmov.u8 r5, q1[4]
; CHECK-NEXT:    adcs r3, r4
; CHECK-NEXT:    vmov.u8 r4, q0[4]
; CHECK-NEXT:    vmov.32 q3[0], r5
; CHECK-NEXT:    vmov.u8 r5, q1[5]
; CHECK-NEXT:    vmov.32 q4[0], r4
; CHECK-NEXT:    vmov.u8 r4, q0[5]
; CHECK-NEXT:    vmov.32 q3[2], r5
; CHECK-NEXT:    vmov.32 q4[2], r4
; CHECK-NEXT:    vand q3, q3, q2
; CHECK-NEXT:    vand q4, q4, q2
; CHECK-NEXT:    vmov r5, s12
; CHECK-NEXT:    vmov r4, s16
; CHECK-NEXT:    umull r5, r4, r4, r5
; CHECK-NEXT:    vmov.32 q5[0], r5
; CHECK-NEXT:    vmov r5, s14
; CHECK-NEXT:    vmov.32 q5[1], r4
; CHECK-NEXT:    vmov r4, s18
; CHECK-NEXT:    umull r5, r4, r4, r5
; CHECK-NEXT:    vmov.32 q5[2], r5
; CHECK-NEXT:    vmov.32 q5[3], r4
; CHECK-NEXT:    vmov r2, s20
; CHECK-NEXT:    vmov r5, s21
; CHECK-NEXT:    adds.w r2, r2, r12
; CHECK-NEXT:    adcs r3, r5
; CHECK-NEXT:    vmov r5, s22
; CHECK-NEXT:    adds.w r12, r2, r5
; CHECK-NEXT:    vmov.u8 r5, q1[6]
; CHECK-NEXT:    adcs r3, r4
; CHECK-NEXT:    vmov.u8 r4, q0[6]
; CHECK-NEXT:    vmov.32 q3[0], r5
; CHECK-NEXT:    vmov.u8 r5, q1[7]
; CHECK-NEXT:    vmov.32 q4[0], r4
; CHECK-NEXT:    vmov.u8 r4, q0[7]
; CHECK-NEXT:    vmov.32 q3[2], r5
; CHECK-NEXT:    vmov.32 q4[2], r4
; CHECK-NEXT:    vand q3, q3, q2
; CHECK-NEXT:    vand q4, q4, q2
; CHECK-NEXT:    vmov r5, s12
; CHECK-NEXT:    vmov r4, s16
; CHECK-NEXT:    umull r5, r4, r4, r5
; CHECK-NEXT:    vmov.32 q5[0], r5
; CHECK-NEXT:    vmov r5, s14
; CHECK-NEXT:    vmov.32 q5[1], r4
; CHECK-NEXT:    vmov r4, s18
; CHECK-NEXT:    umull r5, r4, r4, r5
; CHECK-NEXT:    vmov.32 q5[2], r5
; CHECK-NEXT:    vmov.32 q5[3], r4
; CHECK-NEXT:    vmov r2, s20
; CHECK-NEXT:    vmov r5, s21
; CHECK-NEXT:    adds.w r2, r2, r12
; CHECK-NEXT:    adcs r3, r5
; CHECK-NEXT:    vmov r5, s22
; CHECK-NEXT:    adds.w r12, r2, r5
; CHECK-NEXT:    vmov.u8 r5, q1[8]
; CHECK-NEXT:    adcs r3, r4
; CHECK-NEXT:    vmov.u8 r4, q0[8]
; CHECK-NEXT:    vmov.32 q3[0], r5
; CHECK-NEXT:    vmov.u8 r5, q1[9]
; CHECK-NEXT:    vmov.32 q4[0], r4
; CHECK-NEXT:    vmov.u8 r4, q0[9]
; CHECK-NEXT:    vmov.32 q3[2], r5
; CHECK-NEXT:    vmov.32 q4[2], r4
; CHECK-NEXT:    vand q3, q3, q2
; CHECK-NEXT:    vand q4, q4, q2
; CHECK-NEXT:    vmov r5, s12
; CHECK-NEXT:    vmov r4, s16
; CHECK-NEXT:    umull r5, r4, r4, r5
; CHECK-NEXT:    vmov.32 q5[0], r5
; CHECK-NEXT:    vmov r5, s14
; CHECK-NEXT:    vmov.32 q5[1], r4
; CHECK-NEXT:    vmov r4, s18
; CHECK-NEXT:    umull r5, r4, r4, r5
; CHECK-NEXT:    vmov.32 q5[2], r5
; CHECK-NEXT:    vmov.32 q5[3], r4
; CHECK-NEXT:    vmov r2, s20
; CHECK-NEXT:    vmov r5, s21
; CHECK-NEXT:    adds.w r2, r2, r12
; CHECK-NEXT:    adcs r3, r5
; CHECK-NEXT:    vmov r5, s22
; CHECK-NEXT:    adds.w r12, r2, r5
; CHECK-NEXT:    vmov.u8 r5, q1[10]
; CHECK-NEXT:    adcs r3, r4
; CHECK-NEXT:    vmov.u8 r4, q0[10]
; CHECK-NEXT:    vmov.32 q3[0], r5
; CHECK-NEXT:    vmov.u8 r5, q1[11]
; CHECK-NEXT:    vmov.32 q4[0], r4
; CHECK-NEXT:    vmov.u8 r4, q0[11]
; CHECK-NEXT:    vmov.32 q3[2], r5
; CHECK-NEXT:    vmov.32 q4[2], r4
; CHECK-NEXT:    vand q3, q3, q2
; CHECK-NEXT:    vand q4, q4, q2
; CHECK-NEXT:    vmov r5, s12
; CHECK-NEXT:    vmov r4, s16
; CHECK-NEXT:    umull r5, r4, r4, r5
; CHECK-NEXT:    vmov.32 q5[0], r5
; CHECK-NEXT:    vmov r5, s14
; CHECK-NEXT:    vmov.32 q5[1], r4
; CHECK-NEXT:    vmov r4, s18
; CHECK-NEXT:    umull r5, r4, r4, r5
; CHECK-NEXT:    vmov.32 q5[2], r5
; CHECK-NEXT:    vmov.32 q5[3], r4
; CHECK-NEXT:    vmov r2, s20
; CHECK-NEXT:    vmov r5, s21
; CHECK-NEXT:    adds.w r2, r2, r12
; CHECK-NEXT:    adcs r3, r5
; CHECK-NEXT:    vmov r5, s22
; CHECK-NEXT:    adds.w r12, r2, r5
; CHECK-NEXT:    vmov.u8 r5, q1[12]
; CHECK-NEXT:    adcs r3, r4
; CHECK-NEXT:    vmov.u8 r4, q0[12]
; CHECK-NEXT:    vmov.32 q3[0], r5
; CHECK-NEXT:    vmov.u8 r5, q1[13]
; CHECK-NEXT:    vmov.32 q4[0], r4
; CHECK-NEXT:    vmov.u8 r4, q0[13]
; CHECK-NEXT:    vmov.32 q3[2], r5
; CHECK-NEXT:    vmov.32 q4[2], r4
; CHECK-NEXT:    vand q3, q3, q2
; CHECK-NEXT:    vand q4, q4, q2
; CHECK-NEXT:    vmov r5, s12
; CHECK-NEXT:    vmov r4, s16
; CHECK-NEXT:    umull r5, r4, r4, r5
; CHECK-NEXT:    vmov.32 q5[0], r5
; CHECK-NEXT:    vmov r5, s14
; CHECK-NEXT:    vmov.32 q5[1], r4
; CHECK-NEXT:    vmov r4, s18
; CHECK-NEXT:    umull r5, r4, r4, r5
; CHECK-NEXT:    vmov.32 q5[2], r5
; CHECK-NEXT:    vmov.32 q5[3], r4
; CHECK-NEXT:    vmov r2, s20
; CHECK-NEXT:    vmov r5, s21
; CHECK-NEXT:    adds.w r2, r2, r12
; CHECK-NEXT:    adcs r3, r5
; CHECK-NEXT:    vmov r5, s22
; CHECK-NEXT:    adds r2, r2, r5
; CHECK-NEXT:    vmov.u8 r5, q1[14]
; CHECK-NEXT:    vmov.32 q3[0], r5
; CHECK-NEXT:    vmov.u8 r5, q1[15]
; CHECK-NEXT:    adcs r3, r4
; CHECK-NEXT:    vmov.32 q3[2], r5
; CHECK-NEXT:    vmov.u8 r4, q0[14]
; CHECK-NEXT:    vand q1, q3, q2
; CHECK-NEXT:    vmov.32 q3[0], r4
; CHECK-NEXT:    vmov.u8 r4, q0[15]
; CHECK-NEXT:    vmov.32 q3[2], r4
; CHECK-NEXT:    vmov r5, s4
; CHECK-NEXT:    vand q0, q3, q2
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    umlal r2, r3, r4, r5
; CHECK-NEXT:    vmov r5, s6
; CHECK-NEXT:    vmov r4, s2
; CHECK-NEXT:    umlal r2, r3, r4, r5
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %xx = zext <16 x i8> %x to <16 x i64>
  %yy = zext <16 x i8> %y to <16 x i64>
  %m = mul <16 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}
1690
1691define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
1692; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
1693; CHECK: @ %bb.0: @ %entry
1694; CHECK-NEXT: .save {r4, lr}
1695; CHECK-NEXT: push {r4, lr}
1696; CHECK-NEXT: vmov.u8 r2, q1[0]
1697; CHECK-NEXT: vmov.u8 r3, q0[0]
1698; CHECK-NEXT: sxtb r2, r2
1699; CHECK-NEXT: sxtb r3, r3
1700; CHECK-NEXT: smull r2, r3, r3, r2
1701; CHECK-NEXT: vmov.32 q2[0], r2
1702; CHECK-NEXT: vmov.u8 r2, q1[1]
1703; CHECK-NEXT: vmov.32 q2[1], r3
1704; CHECK-NEXT: vmov.u8 r3, q0[1]
1705; CHECK-NEXT: sxtb r2, r2
1706; CHECK-NEXT: sxtb r3, r3
1707; CHECK-NEXT: smull r2, r3, r3, r2
1708; CHECK-NEXT: vmov.32 q2[2], r2
1709; CHECK-NEXT: vmov.32 q2[3], r3
1710; CHECK-NEXT: vmov lr, s10
1711; CHECK-NEXT: vmov r2, s8
1712; CHECK-NEXT: vmov r12, s9
1713; CHECK-NEXT: adds.w lr, lr, r2
1714; CHECK-NEXT: vmov.u8 r2, q1[2]
1715; CHECK-NEXT: adc.w r12, r12, r3
1716; CHECK-NEXT: vmov.u8 r3, q0[2]
1717; CHECK-NEXT: sxtb r2, r2
1718; CHECK-NEXT: sxtb r3, r3
1719; CHECK-NEXT: smull r2, r3, r3, r2
1720; CHECK-NEXT: vmov.32 q2[0], r2
1721; CHECK-NEXT: vmov.u8 r2, q1[3]
1722; CHECK-NEXT: vmov.32 q2[1], r3
1723; CHECK-NEXT: vmov.u8 r3, q0[3]
1724; CHECK-NEXT: sxtb r2, r2
1725; CHECK-NEXT: sxtb r3, r3
1726; CHECK-NEXT: smull r2, r3, r3, r2
1727; CHECK-NEXT: vmov.32 q2[2], r2
1728; CHECK-NEXT: vmov.32 q2[3], r3
1729; CHECK-NEXT: vmov r4, s8
1730; CHECK-NEXT: vmov r2, s9
1731; CHECK-NEXT: adds.w r4, r4, lr
1732; CHECK-NEXT: adc.w r12, r12, r2
1733; CHECK-NEXT: vmov r2, s10
1734; CHECK-NEXT: adds.w lr, r4, r2
1735; CHECK-NEXT: vmov.u8 r4, q1[4]
1736; CHECK-NEXT: vmov.u8 r2, q0[4]
1737; CHECK-NEXT: sxtb r4, r4
1738; CHECK-NEXT: sxtb r2, r2
1739; CHECK-NEXT: adc.w r12, r12, r3
1740; CHECK-NEXT: smull r2, r4, r2, r4
1741; CHECK-NEXT: vmov.32 q2[0], r2
1742; CHECK-NEXT: vmov.u8 r2, q1[5]
1743; CHECK-NEXT: vmov.32 q2[1], r4
1744; CHECK-NEXT: vmov.u8 r4, q0[5]
1745; CHECK-NEXT: sxtb r2, r2
1746; CHECK-NEXT: sxtb r4, r4
1747; CHECK-NEXT: smull r2, r4, r4, r2
1748; CHECK-NEXT: vmov.32 q2[2], r2
1749; CHECK-NEXT: vmov.32 q2[3], r4
1750; CHECK-NEXT: vmov r3, s8
1751; CHECK-NEXT: vmov r2, s9
1752; CHECK-NEXT: adds.w r3, r3, lr
1753; CHECK-NEXT: adc.w r12, r12, r2
1754; CHECK-NEXT: vmov r2, s10
1755; CHECK-NEXT: adds.w lr, r3, r2
1756; CHECK-NEXT: vmov.u8 r2, q0[6]
1757; CHECK-NEXT: adc.w r12, r12, r4
1758; CHECK-NEXT: vmov.u8 r4, q1[6]
1759; CHECK-NEXT: sxtb r4, r4
1760; CHECK-NEXT: sxtb r2, r2
1761; CHECK-NEXT: smull r2, r4, r2, r4
1762; CHECK-NEXT: vmov.32 q2[0], r2
1763; CHECK-NEXT: vmov.u8 r2, q1[7]
1764; CHECK-NEXT: vmov.32 q2[1], r4
1765; CHECK-NEXT: vmov.u8 r4, q0[7]
1766; CHECK-NEXT: sxtb r2, r2
1767; CHECK-NEXT: sxtb r4, r4
1768; CHECK-NEXT: smull r2, r4, r4, r2
1769; CHECK-NEXT: vmov.32 q2[2], r2
1770; CHECK-NEXT: vmov.32 q2[3], r4
1771; CHECK-NEXT: vmov r3, s8
1772; CHECK-NEXT: vmov r2, s9
1773; CHECK-NEXT: adds.w r3, r3, lr
1774; CHECK-NEXT: adc.w r12, r12, r2
1775; CHECK-NEXT: vmov r2, s10
1776; CHECK-NEXT: adds.w lr, r3, r2
1777; CHECK-NEXT: vmov.u8 r2, q0[8]
1778; CHECK-NEXT: adc.w r12, r12, r4
1779; CHECK-NEXT: vmov.u8 r4, q1[8]
1780; CHECK-NEXT: sxtb r4, r4
1781; CHECK-NEXT: sxtb r2, r2
1782; CHECK-NEXT: smull r2, r4, r2, r4
1783; CHECK-NEXT: vmov.32 q2[0], r2
1784; CHECK-NEXT: vmov.u8 r2, q1[9]
1785; CHECK-NEXT: vmov.32 q2[1], r4
1786; CHECK-NEXT: vmov.u8 r4, q0[9]
1787; CHECK-NEXT: sxtb r2, r2
1788; CHECK-NEXT: sxtb r4, r4
1789; CHECK-NEXT: smull r2, r4, r4, r2
1790; CHECK-NEXT: vmov.32 q2[2], r2
1791; CHECK-NEXT: vmov.32 q2[3], r4
1792; CHECK-NEXT: vmov r3, s8
1793; CHECK-NEXT: vmov r2, s9
1794; CHECK-NEXT: adds.w r3, r3, lr
1795; CHECK-NEXT: adc.w r12, r12, r2
1796; CHECK-NEXT: vmov r2, s10
1797; CHECK-NEXT: adds.w lr, r3, r2
1798; CHECK-NEXT: vmov.u8 r2, q0[10]
1799; CHECK-NEXT: adc.w r12, r12, r4
1800; CHECK-NEXT: vmov.u8 r4, q1[10]
1801; CHECK-NEXT: sxtb r4, r4
1802; CHECK-NEXT: sxtb r2, r2
1803; CHECK-NEXT: smull r2, r4, r2, r4
1804; CHECK-NEXT: vmov.32 q2[0], r2
1805; CHECK-NEXT: vmov.u8 r2, q1[11]
1806; CHECK-NEXT: vmov.32 q2[1], r4
1807; CHECK-NEXT: vmov.u8 r4, q0[11]
1808; CHECK-NEXT: sxtb r2, r2
1809; CHECK-NEXT: sxtb r4, r4
1810; CHECK-NEXT: smull r2, r4, r4, r2
1811; CHECK-NEXT: vmov.32 q2[2], r2
1812; CHECK-NEXT: vmov.32 q2[3], r4
1813; CHECK-NEXT: vmov r3, s8
1814; CHECK-NEXT: vmov r2, s9
1815; CHECK-NEXT: adds.w r3, r3, lr
1816; CHECK-NEXT: adc.w r12, r12, r2
1817; CHECK-NEXT: vmov r2, s10
1818; CHECK-NEXT: adds.w lr, r3, r2
1819; CHECK-NEXT: vmov.u8 r2, q0[12]
1820; CHECK-NEXT: adc.w r12, r12, r4
1821; CHECK-NEXT: vmov.u8 r4, q1[12]
1822; CHECK-NEXT: sxtb r4, r4
1823; CHECK-NEXT: sxtb r2, r2
1824; CHECK-NEXT: smull r2, r4, r2, r4
1825; CHECK-NEXT: vmov.32 q2[0], r2
1826; CHECK-NEXT: vmov.u8 r2, q1[13]
1827; CHECK-NEXT: vmov.32 q2[1], r4
1828; CHECK-NEXT: vmov.u8 r4, q0[13]
1829; CHECK-NEXT: sxtb r2, r2
1830; CHECK-NEXT: sxtb r4, r4
1831; CHECK-NEXT: smull r2, r4, r4, r2
1832; CHECK-NEXT: vmov.32 q2[2], r2
1833; CHECK-NEXT: vmov.32 q2[3], r4
1834; CHECK-NEXT: vmov r3, s8
1835; CHECK-NEXT: vmov r2, s9
1836; CHECK-NEXT: adds.w r3, r3, lr
1837; CHECK-NEXT: adc.w r12, r12, r2
1838; CHECK-NEXT: vmov r2, s10
1839; CHECK-NEXT: adds r2, r2, r3
1840; CHECK-NEXT: adc.w r3, r12, r4
1841; CHECK-NEXT: vmov.u8 r4, q1[14]
1842; CHECK-NEXT: sxtb.w r12, r4
1843; CHECK-NEXT: vmov.u8 r4, q0[14]
1844; CHECK-NEXT: sxtb r4, r4
1845; CHECK-NEXT: smlal r2, r3, r4, r12
1846; CHECK-NEXT: vmov.u8 r4, q1[15]
1847; CHECK-NEXT: sxtb.w r12, r4
1848; CHECK-NEXT: vmov.u8 r4, q0[15]
1849; CHECK-NEXT: sxtb r4, r4
1850; CHECK-NEXT: smlal r2, r3, r4, r12
1851; CHECK-NEXT: adds r0, r0, r2
1852; CHECK-NEXT: adcs r1, r3
1853; CHECK-NEXT: pop {r4, pc}
1854entry:
1855 %xx = sext <16 x i8> %x to <16 x i64>
1856 %yy = sext <16 x i8> %y to <16 x i64>
1857 %m = mul <16 x i64> %xx, %yy
1858 %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
1859 %r = add i64 %z, %a
1860 ret i64 %r
1861}
1862
; Zero-extends two <2 x i8> vectors to <2 x i64>, multiplies them lane-wise,
; sums the two lanes with the vector add-reduction intrinsic and adds the i64
; accumulator %a to the result.
; The expected output below moves each lane to a core register and multiplies
; with umull; no MVE reduction instruction is expected for this type.
; The assertion lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i64 q2, #0xff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    umull r12, lr, r3, r2
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    add r2, r12
; CHECK-NEXT:    orr.w r3, r3, lr
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = zext <2 x i8> %x to <2 x i64>
  %yy = zext <2 x i8> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}
1890
; Sign-extends two <2 x i8> vectors to <2 x i64>, multiplies them lane-wise,
; sums the two lanes with the vector add-reduction intrinsic and adds the i64
; accumulator %a to the result.
; The expected output below sign-extends each lane with sxtb and accumulates
; the products with smull followed by smlal.
; The assertion lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smull r2, r12, r3, r2
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    sxtb.w lr, r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smlal r2, r12, r3, lr
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r12
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = sext <2 x i8> %x to <2 x i64>
  %yy = sext <2 x i8> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}
1917
; Multiplies two <2 x i64> vectors lane-wise (no extension involved), sums the
; two lanes with the vector add-reduction intrinsic and adds the i64
; accumulator %a to the result.
; The expected output below forms each 64-bit product in core registers from
; one umull plus two mla instructions, then adds the pieces with adds/adcs.
; The assertion lines are autogenerated (see the NOTE at the top of the file):
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, i64 %a) {
; CHECK-LABEL: add_v2i64_v2i64_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r4, s5
; CHECK-NEXT:    vmov r6, s7
; CHECK-NEXT:    umull r12, lr, r3, r2
; CHECK-NEXT:    mla r3, r3, r4, lr
; CHECK-NEXT:    vmov r4, s1
; CHECK-NEXT:    vmov.32 q2[0], r12
; CHECK-NEXT:    mla r2, r4, r2, r3
; CHECK-NEXT:    vmov r4, s6
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov.32 q2[1], r2
; CHECK-NEXT:    vmov r12, s8
; CHECK-NEXT:    umull lr, r5, r3, r4
; CHECK-NEXT:    mla r3, r3, r6, r5
; CHECK-NEXT:    vmov r5, s3
; CHECK-NEXT:    adds.w r6, r12, lr
; CHECK-NEXT:    mla r3, r5, r4, r3
; CHECK-NEXT:    adcs r2, r3
; CHECK-NEXT:    adds r0, r0, r6
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %m = mul <2 x i64> %x, %y
  %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}
1951
; Declarations of the llvm.experimental.vector.reduce.add intrinsic overloads,
; one per vector type. Each takes a single vector operand and returns a scalar
; of the vector's element type.
declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)