blob: 80ba9be3bd2d3dd09732699701e7a7098397b969 [file] [log] [blame]
Bob Wilsonfe27c512009-10-07 23:47:21 +00001; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
Bob Wilson5bafff32009-06-22 23:27:02 +00002
3define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; Loads one <8 x i8> vector through each of %A and %B, multiplies them
; element-wise with a plain IR `mul`, and returns the product.
; The FileCheck directives below expect the function label followed by a
; vmul.i8 instruction in the llc output.
Bob Wilsonfe27c512009-10-07 23:47:21 +00004;CHECK: vmuli8:
5;CHECK: vmul.i8
Bob Wilson5bafff32009-06-22 23:27:02 +00006 %tmp1 = load <8 x i8>* %A
7 %tmp2 = load <8 x i8>* %B
8 %tmp3 = mul <8 x i8> %tmp1, %tmp2
9 ret <8 x i8> %tmp3
10}
11
12define <4 x i16> @vmuli16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; Element-wise `mul` of two <4 x i16> vectors loaded through %A and %B.
; Expected output: the function label, then a vmul.i16 instruction.
Bob Wilsonfe27c512009-10-07 23:47:21 +000013;CHECK: vmuli16:
14;CHECK: vmul.i16
Bob Wilson5bafff32009-06-22 23:27:02 +000015 %tmp1 = load <4 x i16>* %A
16 %tmp2 = load <4 x i16>* %B
17 %tmp3 = mul <4 x i16> %tmp1, %tmp2
18 ret <4 x i16> %tmp3
19}
20
21define <2 x i32> @vmuli32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; Element-wise `mul` of two <2 x i32> vectors loaded through %A and %B.
; Expected output: the function label, then a vmul.i32 instruction.
Bob Wilsonfe27c512009-10-07 23:47:21 +000022;CHECK: vmuli32:
23;CHECK: vmul.i32
Bob Wilson5bafff32009-06-22 23:27:02 +000024 %tmp1 = load <2 x i32>* %A
25 %tmp2 = load <2 x i32>* %B
26 %tmp3 = mul <2 x i32> %tmp1, %tmp2
27 ret <2 x i32> %tmp3
28}
29
30define <2 x float> @vmulf32(<2 x float>* %A, <2 x float>* %B) nounwind {
; Floating-point variant: loads two <2 x float> vectors and multiplies them
; with `fmul` (not the integer `mul`). Expected output: the function label,
; then a vmul.f32 instruction.
Bob Wilsonfe27c512009-10-07 23:47:21 +000031;CHECK: vmulf32:
32;CHECK: vmul.f32
Bob Wilson5bafff32009-06-22 23:27:02 +000033 %tmp1 = load <2 x float>* %A
34 %tmp2 = load <2 x float>* %B
Dan Gohmand4d01152010-05-03 22:36:46 +000035 %tmp3 = fmul <2 x float> %tmp1, %tmp2
Bob Wilson5bafff32009-06-22 23:27:02 +000036 ret <2 x float> %tmp3
37}
38
39define <8 x i8> @vmulp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; Loads two <8 x i8> vectors and passes them to the
; @llvm.arm.neon.vmulp.v8i8 intrinsic instead of a generic IR multiply.
; Expected output: the function label, then a vmul.p8 instruction
; (presumably the polynomial multiply, going by the .p8 suffix).
Bob Wilsonfe27c512009-10-07 23:47:21 +000040;CHECK: vmulp8:
41;CHECK: vmul.p8
Bob Wilson5bafff32009-06-22 23:27:02 +000042 %tmp1 = load <8 x i8>* %A
43 %tmp2 = load <8 x i8>* %B
44 %tmp3 = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
45 ret <8 x i8> %tmp3
46}
47
48define <16 x i8> @vmulQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; 128-bit counterpart of the i8 multiply: element-wise `mul` of two
; <16 x i8> vectors loaded through %A and %B.
; Expected output: the function label, then a vmul.i8 instruction.
Bob Wilsonfe27c512009-10-07 23:47:21 +000049;CHECK: vmulQi8:
50;CHECK: vmul.i8
Bob Wilson5bafff32009-06-22 23:27:02 +000051 %tmp1 = load <16 x i8>* %A
52 %tmp2 = load <16 x i8>* %B
53 %tmp3 = mul <16 x i8> %tmp1, %tmp2
54 ret <16 x i8> %tmp3
55}
56
57define <8 x i16> @vmulQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; Element-wise `mul` of two <8 x i16> (128-bit) vectors loaded through %A
; and %B. Expected output: the function label, then a vmul.i16 instruction.
Bob Wilsonfe27c512009-10-07 23:47:21 +000058;CHECK: vmulQi16:
59;CHECK: vmul.i16
Bob Wilson5bafff32009-06-22 23:27:02 +000060 %tmp1 = load <8 x i16>* %A
61 %tmp2 = load <8 x i16>* %B
62 %tmp3 = mul <8 x i16> %tmp1, %tmp2
63 ret <8 x i16> %tmp3
64}
65
66define <4 x i32> @vmulQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; Element-wise `mul` of two <4 x i32> (128-bit) vectors loaded through %A
; and %B. Expected output: the function label, then a vmul.i32 instruction.
Bob Wilsonfe27c512009-10-07 23:47:21 +000067;CHECK: vmulQi32:
68;CHECK: vmul.i32
Bob Wilson5bafff32009-06-22 23:27:02 +000069 %tmp1 = load <4 x i32>* %A
70 %tmp2 = load <4 x i32>* %B
71 %tmp3 = mul <4 x i32> %tmp1, %tmp2
72 ret <4 x i32> %tmp3
73}
74
75define <4 x float> @vmulQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
; Floating-point 128-bit variant: `fmul` of two <4 x float> vectors loaded
; through %A and %B. Expected output: the function label, then vmul.f32.
Bob Wilsonfe27c512009-10-07 23:47:21 +000076;CHECK: vmulQf32:
77;CHECK: vmul.f32
Bob Wilson5bafff32009-06-22 23:27:02 +000078 %tmp1 = load <4 x float>* %A
79 %tmp2 = load <4 x float>* %B
Dan Gohmand4d01152010-05-03 22:36:46 +000080 %tmp3 = fmul <4 x float> %tmp1, %tmp2
Bob Wilson5bafff32009-06-22 23:27:02 +000081 ret <4 x float> %tmp3
82}
83
84define <16 x i8> @vmulQp8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; Loads two <16 x i8> vectors and passes them to the
; @llvm.arm.neon.vmulp.v16i8 intrinsic.
; Expected output: the function label, then a vmul.p8 instruction.
Bob Wilsonfe27c512009-10-07 23:47:21 +000085;CHECK: vmulQp8:
86;CHECK: vmul.p8
Bob Wilson5bafff32009-06-22 23:27:02 +000087 %tmp1 = load <16 x i8>* %A
88 %tmp2 = load <16 x i8>* %B
89 %tmp3 = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
90 ret <16 x i8> %tmp3
91}
92
93declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
94declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
Bob Wilson83815ae2009-10-09 20:20:54 +000095
96define arm_aapcs_vfpcc <2 x float> @test_vmul_lanef32(<2 x float> %arg0_float32x2_t, <2 x float> %arg1_float32x2_t) nounwind readnone {
; Multiply-by-lane test: the shufflevector with an all-zero mask broadcasts
; element 0 of %arg1 to every lane, and the result is fmul'ed with %arg0.
; The directive pins the exact operands (d0, d0, d1[0]), i.e. the by-lane
; form of vmul.f32 with lane index 0 matching the broadcast index.
97entry:
98; CHECK: test_vmul_lanef32:
99; CHECK: vmul.f32 d0, d0, d1[0]
100 %0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <2 x i32> zeroinitializer ; <<2 x float>> [#uses=1]
101 %1 = fmul <2 x float> %0, %arg0_float32x2_t ; <<2 x float>> [#uses=1]
102 ret <2 x float> %1
103}
104
105define arm_aapcs_vfpcc <4 x i16> @test_vmul_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
; Multiply-by-lane test: the shuffle mask <1, 1, 1, 1> broadcasts element 1
; of %arg1, which is then multiplied with %arg0. The directive pins the exact
; operands (d0, d0, d1[1]), so the lane index must match the broadcast index.
106entry:
107; CHECK: test_vmul_lanes16:
108; CHECK: vmul.i16 d0, d0, d1[1]
109 %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
110 %1 = mul <4 x i16> %0, %arg0_int16x4_t ; <<4 x i16>> [#uses=1]
111 ret <4 x i16> %1
112}
113
114define arm_aapcs_vfpcc <2 x i32> @test_vmul_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
; Multiply-by-lane test: the shuffle mask <1, 1> broadcasts element 1 of
; %arg1, which is then multiplied with %arg0. The directive pins the exact
; operands (d0, d0, d1[1]).
115entry:
116; CHECK: test_vmul_lanes32:
117; CHECK: vmul.i32 d0, d0, d1[1]
118 %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
119 %1 = mul <2 x i32> %0, %arg0_int32x2_t ; <<2 x i32>> [#uses=1]
120 ret <2 x i32> %1
121}
122
123define arm_aapcs_vfpcc <4 x float> @test_vmulQ_lanef32(<4 x float> %arg0_float32x4_t, <2 x float> %arg1_float32x2_t) nounwind readnone {
; Multiply-by-lane test with a widening broadcast: the shuffle takes the
; 2-element %arg1 and produces a 4-element vector filled with its element 1,
; which is then fmul'ed with the 4-element %arg0. The directive pins the
; exact operands (q0, q0, d2[1]).
124entry:
125; CHECK: test_vmulQ_lanef32:
126; CHECK: vmul.f32 q0, q0, d2[1]
127 %0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x float>> [#uses=1]
128 %1 = fmul <4 x float> %0, %arg0_float32x4_t ; <<4 x float>> [#uses=1]
129 ret <4 x float> %1
130}
131
132define arm_aapcs_vfpcc <8 x i16> @test_vmulQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
; Multiply-by-lane test with a widening broadcast: the shuffle turns the
; 4-element %arg1 into an 8-element vector filled with its element 1, which
; is then multiplied with the 8-element %arg0. The directive pins the exact
; operands (q0, q0, d2[1]).
133entry:
134; CHECK: test_vmulQ_lanes16:
135; CHECK: vmul.i16 q0, q0, d2[1]
136 %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
137 %1 = mul <8 x i16> %0, %arg0_int16x8_t ; <<8 x i16>> [#uses=1]
138 ret <8 x i16> %1
139}
140
141define arm_aapcs_vfpcc <4 x i32> @test_vmulQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
; Multiply-by-lane test with a widening broadcast: the shuffle turns the
; 2-element %arg1 into a 4-element vector filled with its element 1, which
; is then multiplied with the 4-element %arg0. The directive pins the exact
; operands (q0, q0, d2[1]).
142entry:
143; CHECK: test_vmulQ_lanes32:
144; CHECK: vmul.i32 q0, q0, d2[1]
145 %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
146 %1 = mul <4 x i32> %0, %arg0_int32x4_t ; <<4 x i32>> [#uses=1]
147 ret <4 x i32> %1
148}
149
150define <8 x i16> @vmulls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; Widening signed multiply written in generic IR: both <8 x i8> operands are
; sign-extended to <8 x i16> and then multiplied. The directives expect this
; sext/sext/mul pattern to be emitted as a single vmull.s8.
151;CHECK: vmulls8:
152;CHECK: vmull.s8
153 %tmp1 = load <8 x i8>* %A
154 %tmp2 = load <8 x i8>* %B
Bob Wilsond0b69cf2010-09-01 23:50:19 +0000155 %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
156 %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
157 %tmp5 = mul <8 x i16> %tmp3, %tmp4
158 ret <8 x i16> %tmp5
Bob Wilson83815ae2009-10-09 20:20:54 +0000159}
160
Evan Cheng92e39162011-03-29 23:06:19 +0000161define <8 x i16> @vmulls8_int(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; Intrinsic form of the widening signed multiply: the two loaded <8 x i8>
; vectors go straight to @llvm.arm.neon.vmulls.v8i16, which returns
; <8 x i16>. Expected output: the function label, then vmull.s8.
162;CHECK: vmulls8_int:
163;CHECK: vmull.s8
164 %tmp1 = load <8 x i8>* %A
165 %tmp2 = load <8 x i8>* %B
166 %tmp3 = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
167 ret <8 x i16> %tmp3
168}
169
Bob Wilson83815ae2009-10-09 20:20:54 +0000170define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; Widening signed multiply written in generic IR: both <4 x i16> operands are
; sign-extended to <4 x i32> and then multiplied. The directives expect this
; pattern to be emitted as a single vmull.s16.
171;CHECK: vmulls16:
172;CHECK: vmull.s16
173 %tmp1 = load <4 x i16>* %A
174 %tmp2 = load <4 x i16>* %B
Bob Wilsond0b69cf2010-09-01 23:50:19 +0000175 %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
176 %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
177 %tmp5 = mul <4 x i32> %tmp3, %tmp4
178 ret <4 x i32> %tmp5
Bob Wilson83815ae2009-10-09 20:20:54 +0000179}
180
Evan Cheng92e39162011-03-29 23:06:19 +0000181define <4 x i32> @vmulls16_int(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; Intrinsic form of the widening signed multiply: the two loaded <4 x i16>
; vectors go straight to @llvm.arm.neon.vmulls.v4i32, which returns
; <4 x i32>. Expected output: the function label, then vmull.s16.
182;CHECK: vmulls16_int:
183;CHECK: vmull.s16
184 %tmp1 = load <4 x i16>* %A
185 %tmp2 = load <4 x i16>* %B
186 %tmp3 = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
187 ret <4 x i32> %tmp3
188}
189
Bob Wilson83815ae2009-10-09 20:20:54 +0000190define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; Widening signed multiply written in generic IR: both <2 x i32> operands are
; sign-extended to <2 x i64> and then multiplied. The directives expect this
; pattern to be emitted as a single vmull.s32.
191;CHECK: vmulls32:
192;CHECK: vmull.s32
193 %tmp1 = load <2 x i32>* %A
194 %tmp2 = load <2 x i32>* %B
Bob Wilsond0b69cf2010-09-01 23:50:19 +0000195 %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
196 %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
197 %tmp5 = mul <2 x i64> %tmp3, %tmp4
198 ret <2 x i64> %tmp5
Bob Wilson83815ae2009-10-09 20:20:54 +0000199}
200
Evan Cheng92e39162011-03-29 23:06:19 +0000201define <2 x i64> @vmulls32_int(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; Intrinsic form of the widening signed multiply: the two loaded <2 x i32>
; vectors go straight to @llvm.arm.neon.vmulls.v2i64, which returns
; <2 x i64>. Expected output: the function label, then vmull.s32.
202;CHECK: vmulls32_int:
203;CHECK: vmull.s32
204 %tmp1 = load <2 x i32>* %A
205 %tmp2 = load <2 x i32>* %B
206 %tmp3 = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
207 ret <2 x i64> %tmp3
208}
209
Bob Wilson83815ae2009-10-09 20:20:54 +0000210define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; Widening unsigned multiply written in generic IR: both <8 x i8> operands
; are zero-extended to <8 x i16> and then multiplied. The directives expect
; this zext/zext/mul pattern to be emitted as a single vmull.u8.
211;CHECK: vmullu8:
212;CHECK: vmull.u8
213 %tmp1 = load <8 x i8>* %A
214 %tmp2 = load <8 x i8>* %B
Bob Wilsond0b69cf2010-09-01 23:50:19 +0000215 %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
216 %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
217 %tmp5 = mul <8 x i16> %tmp3, %tmp4
218 ret <8 x i16> %tmp5
Bob Wilson83815ae2009-10-09 20:20:54 +0000219}
220
Evan Cheng92e39162011-03-29 23:06:19 +0000221define <8 x i16> @vmullu8_int(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; Intrinsic form of the widening unsigned multiply: the two loaded <8 x i8>
; vectors go straight to @llvm.arm.neon.vmullu.v8i16, which returns
; <8 x i16>. Expected output: the function label, then vmull.u8.
222;CHECK: vmullu8_int:
223;CHECK: vmull.u8
224 %tmp1 = load <8 x i8>* %A
225 %tmp2 = load <8 x i8>* %B
226 %tmp3 = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
227 ret <8 x i16> %tmp3
228}
229
Bob Wilson83815ae2009-10-09 20:20:54 +0000230define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; Widening unsigned multiply written in generic IR: both <4 x i16> operands
; are zero-extended to <4 x i32> and then multiplied. The directives expect
; this pattern to be emitted as a single vmull.u16.
231;CHECK: vmullu16:
232;CHECK: vmull.u16
233 %tmp1 = load <4 x i16>* %A
234 %tmp2 = load <4 x i16>* %B
Bob Wilsond0b69cf2010-09-01 23:50:19 +0000235 %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
236 %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
237 %tmp5 = mul <4 x i32> %tmp3, %tmp4
238 ret <4 x i32> %tmp5
Bob Wilson83815ae2009-10-09 20:20:54 +0000239}
240
Evan Cheng92e39162011-03-29 23:06:19 +0000241define <4 x i32> @vmullu16_int(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; Intrinsic form of the widening unsigned multiply: the two loaded <4 x i16>
; vectors go straight to @llvm.arm.neon.vmullu.v4i32, which returns
; <4 x i32>. Expected output: the function label, then vmull.u16.
242;CHECK: vmullu16_int:
243;CHECK: vmull.u16
244 %tmp1 = load <4 x i16>* %A
245 %tmp2 = load <4 x i16>* %B
246 %tmp3 = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
247 ret <4 x i32> %tmp3
248}
249
Bob Wilson83815ae2009-10-09 20:20:54 +0000250define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; Widening unsigned multiply written in generic IR: both <2 x i32> operands
; are zero-extended to <2 x i64> and then multiplied. The directives expect
; this pattern to be emitted as a single vmull.u32.
251;CHECK: vmullu32:
252;CHECK: vmull.u32
253 %tmp1 = load <2 x i32>* %A
254 %tmp2 = load <2 x i32>* %B
Bob Wilsond0b69cf2010-09-01 23:50:19 +0000255 %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
256 %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
257 %tmp5 = mul <2 x i64> %tmp3, %tmp4
258 ret <2 x i64> %tmp5
Bob Wilson83815ae2009-10-09 20:20:54 +0000259}
260
Evan Cheng92e39162011-03-29 23:06:19 +0000261define <2 x i64> @vmullu32_int(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; Intrinsic form of the widening unsigned multiply: the two loaded <2 x i32>
; vectors go straight to @llvm.arm.neon.vmullu.v2i64, which returns
; <2 x i64>. Expected output: the function label, then vmull.u32.
262;CHECK: vmullu32_int:
263;CHECK: vmull.u32
264 %tmp1 = load <2 x i32>* %A
265 %tmp2 = load <2 x i32>* %B
266 %tmp3 = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
267 ret <2 x i64> %tmp3
268}
269
Bob Wilson83815ae2009-10-09 20:20:54 +0000270define <8 x i16> @vmullp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; Loads two <8 x i8> vectors and passes them to the
; @llvm.arm.neon.vmullp.v8i16 intrinsic, which returns <8 x i16>.
; Expected output: the function label, then a vmull.p8 instruction.
271;CHECK: vmullp8:
272;CHECK: vmull.p8
273 %tmp1 = load <8 x i8>* %A
274 %tmp2 = load <8 x i8>* %B
275 %tmp3 = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
276 ret <8 x i16> %tmp3
277}
278
279define arm_aapcs_vfpcc <4 x i32> @test_vmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
; Widening signed multiply-by-lane in generic IR: element 1 of %arg1 is
; broadcast by the shuffle, both operands are sign-extended to <4 x i32>, and
; the extended values are multiplied. The directive pins the exact operands
; (q0, d0, d1[1]).
; NOTE(review): the label directive below has no trailing ':' (unlike the
; earlier tests in this file), so it can also match the longer
; test_vmull_lanes16_int label - consider tightening.
280entry:
281; CHECK: test_vmull_lanes16
282; CHECK: vmull.s16 q0, d0, d1[1]
283 %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
Bob Wilsond0b69cf2010-09-01 23:50:19 +0000284 %1 = sext <4 x i16> %arg0_int16x4_t to <4 x i32>
285 %2 = sext <4 x i16> %0 to <4 x i32>
286 %3 = mul <4 x i32> %1, %2
287 ret <4 x i32> %3
Bob Wilson83815ae2009-10-09 20:20:54 +0000288}
289
Evan Cheng92e39162011-03-29 23:06:19 +0000290define arm_aapcs_vfpcc <4 x i32> @test_vmull_lanes16_int(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
; Intrinsic form of the widening signed multiply-by-lane: element 1 of %arg1
; is broadcast by the shuffle and passed, together with %arg0, to
; @llvm.arm.neon.vmulls.v4i32. The directive pins the exact operands
; (q0, d0, d1[1]).
291entry:
292; CHECK: test_vmull_lanes16_int
293; CHECK: vmull.s16 q0, d0, d1[1]
294 %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
295 %1 = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
296 ret <4 x i32> %1
297}
298
Bob Wilson83815ae2009-10-09 20:20:54 +0000299define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
; Widening signed multiply-by-lane in generic IR: element 1 of %arg1 is
; broadcast by the shuffle, both operands are sign-extended to <2 x i64>, and
; the extended values are multiplied. The directive pins the exact operands
; (q0, d0, d1[1]).
; NOTE(review): the label directive below has no trailing ':' so it can also
; match the longer test_vmull_lanes32_int label - consider tightening.
300entry:
301; CHECK: test_vmull_lanes32
302; CHECK: vmull.s32 q0, d0, d1[1]
303 %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
Bob Wilsond0b69cf2010-09-01 23:50:19 +0000304 %1 = sext <2 x i32> %arg0_int32x2_t to <2 x i64>
305 %2 = sext <2 x i32> %0 to <2 x i64>
306 %3 = mul <2 x i64> %1, %2
307 ret <2 x i64> %3
Bob Wilson83815ae2009-10-09 20:20:54 +0000308}
309
Evan Cheng92e39162011-03-29 23:06:19 +0000310define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32_int(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
; Intrinsic form of the widening signed multiply-by-lane: element 1 of %arg1
; is broadcast by the shuffle and passed, together with %arg0, to
; @llvm.arm.neon.vmulls.v2i64. The directive pins the exact operands
; (q0, d0, d1[1]).
311entry:
312; CHECK: test_vmull_lanes32_int
313; CHECK: vmull.s32 q0, d0, d1[1]
314 %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
315 %1 = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
316 ret <2 x i64> %1
317}
318
Bob Wilson83815ae2009-10-09 20:20:54 +0000319define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone {
; Widening unsigned multiply-by-lane in generic IR: element 1 of %arg1 is
; broadcast by the shuffle, both operands are zero-extended to <4 x i32>, and
; the extended values are multiplied. The directive pins the exact operands
; (q0, d0, d1[1]).
; NOTE(review): the label directive below has no trailing ':' so it can also
; match the longer test_vmull_laneu16_int label - consider tightening.
320entry:
321; CHECK: test_vmull_laneu16
322; CHECK: vmull.u16 q0, d0, d1[1]
323 %0 = shufflevector <4 x i16> %arg1_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
Bob Wilsond0b69cf2010-09-01 23:50:19 +0000324 %1 = zext <4 x i16> %arg0_uint16x4_t to <4 x i32>
325 %2 = zext <4 x i16> %0 to <4 x i32>
326 %3 = mul <4 x i32> %1, %2
327 ret <4 x i32> %3
Bob Wilson83815ae2009-10-09 20:20:54 +0000328}
329
Evan Cheng92e39162011-03-29 23:06:19 +0000330define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16_int(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone {
; Intrinsic form of the widening unsigned multiply-by-lane: element 1 of
; %arg1 is broadcast by the shuffle and passed, together with %arg0, to
; @llvm.arm.neon.vmullu.v4i32. The directive pins the exact operands
; (q0, d0, d1[1]).
331entry:
332; CHECK: test_vmull_laneu16_int
333; CHECK: vmull.u16 q0, d0, d1[1]
334 %0 = shufflevector <4 x i16> %arg1_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
335 %1 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %arg0_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
336 ret <4 x i32> %1
337}
338
Bob Wilson83815ae2009-10-09 20:20:54 +0000339define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone {
; Widening unsigned multiply-by-lane in generic IR: element 1 of %arg1 is
; broadcast by the shuffle, both operands are zero-extended to <2 x i64>, and
; the extended values are multiplied. The directive pins the exact operands
; (q0, d0, d1[1]).
; NOTE(review): the label directive below has no trailing ':' so it can also
; match the longer test_vmull_laneu32_int label - consider tightening.
340entry:
341; CHECK: test_vmull_laneu32
342; CHECK: vmull.u32 q0, d0, d1[1]
343 %0 = shufflevector <2 x i32> %arg1_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
Bob Wilsond0b69cf2010-09-01 23:50:19 +0000344 %1 = zext <2 x i32> %arg0_uint32x2_t to <2 x i64>
345 %2 = zext <2 x i32> %0 to <2 x i64>
346 %3 = mul <2 x i64> %1, %2
347 ret <2 x i64> %3
Bob Wilson83815ae2009-10-09 20:20:54 +0000348}
349
Evan Cheng92e39162011-03-29 23:06:19 +0000350define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32_int(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone {
; Intrinsic form of the widening unsigned multiply-by-lane: element 1 of
; %arg1 is broadcast by the shuffle and passed, together with %arg0, to
; @llvm.arm.neon.vmullu.v2i64. The directive pins the exact operands
; (q0, d0, d1[1]).
351entry:
352; CHECK: test_vmull_laneu32_int
353; CHECK: vmull.u32 q0, d0, d1[1]
354 %0 = shufflevector <2 x i32> %arg1_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
355 %1 = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %arg0_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
356 ret <2 x i64> %1
357}
358
359declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
360declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
361declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
362
363declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
364declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
365declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
366
Bob Wilson83815ae2009-10-09 20:20:54 +0000367declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
Bob Wilson626613d2010-11-23 19:38:38 +0000368
369
370; Radar 8687140
371; VMULL needs to recognize BUILD_VECTORs with sign/zero-extended elements.
372
373define <8 x i16> @vmull_extvec_s8(<8 x i8> %arg) nounwind {
; Sign-extends %arg to <8 x i16> and multiplies it by a constant vector whose
; elements are all -12, a value that fits in a signed 8-bit element. The
; directives expect the multiply to still be emitted as vmull.s8.
374; CHECK: vmull_extvec_s8
375; CHECK: vmull.s8
376 %tmp3 = sext <8 x i8> %arg to <8 x i16>
377 %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
378 ret <8 x i16> %tmp4
379}
380
381define <8 x i16> @vmull_extvec_u8(<8 x i8> %arg) nounwind {
; Zero-extends %arg to <8 x i16> and multiplies it by a constant vector whose
; elements are all 12, a value that fits in an unsigned 8-bit element. The
; directives expect the multiply to be emitted as vmull.u8.
382; CHECK: vmull_extvec_u8
383; CHECK: vmull.u8
384 %tmp3 = zext <8 x i8> %arg to <8 x i16>
385 %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
386 ret <8 x i16> %tmp4
387}
388
389define <8 x i16> @vmull_noextvec_s8(<8 x i8> %arg) nounwind {
; Negative test: the constant multiplier -999 lies outside the signed 8-bit
; range (-128..127), so the expected output is an explicit widening
; (vmovl.s8) followed by a full-width vmul.i16 rather than a vmull.
390; Do not use VMULL if the BUILD_VECTOR element values are too big.
391; CHECK: vmull_noextvec_s8
392; CHECK: vmovl.s8
393; CHECK: vmul.i16
394 %tmp3 = sext <8 x i8> %arg to <8 x i16>
395 %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
396 ret <8 x i16> %tmp4
397}
398
399define <8 x i16> @vmull_noextvec_u8(<8 x i8> %arg) nounwind {
; Negative test: the constant multiplier 999 exceeds the unsigned 8-bit
; maximum (255), so the expected output is an explicit widening (vmovl.u8)
; followed by a full-width vmul.i16 rather than a vmull.
400; Do not use VMULL if the BUILD_VECTOR element values are too big.
401; CHECK: vmull_noextvec_u8
402; CHECK: vmovl.u8
403; CHECK: vmul.i16
404 %tmp3 = zext <8 x i8> %arg to <8 x i16>
405 %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
406 ret <8 x i16> %tmp4
407}
408
409define <4 x i32> @vmull_extvec_s16(<4 x i16> %arg) nounwind {
; Sign-extends %arg to <4 x i32> and multiplies it by a constant vector whose
; elements are all -12, a value that fits in a signed 16-bit element. The
; directives expect the multiply to be emitted as vmull.s16.
410; CHECK: vmull_extvec_s16
411; CHECK: vmull.s16
412 %tmp3 = sext <4 x i16> %arg to <4 x i32>
413 %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
414 ret <4 x i32> %tmp4
415}
416
417define <4 x i32> @vmull_extvec_u16(<4 x i16> %arg) nounwind {
; Zero-extends %arg to <4 x i32> and multiplies it by a constant vector whose
; elements are all 1234, a value that fits in an unsigned 16-bit element. The
; directives expect the multiply to be emitted as vmull.u16.
418; CHECK: vmull_extvec_u16
419; CHECK: vmull.u16
420 %tmp3 = zext <4 x i16> %arg to <4 x i32>
421 %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
422 ret <4 x i32> %tmp4
423}
424
425define <2 x i64> @vmull_extvec_s32(<2 x i32> %arg) nounwind {
; Sign-extends %arg to <2 x i64> and multiplies it by a constant vector whose
; elements are both -1234, a value that fits in a signed 32-bit element. The
; directives expect the multiply to be emitted as vmull.s32.
426; CHECK: vmull_extvec_s32
427; CHECK: vmull.s32
428 %tmp3 = sext <2 x i32> %arg to <2 x i64>
429 %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
430 ret <2 x i64> %tmp4
431}
432
433define <2 x i64> @vmull_extvec_u32(<2 x i32> %arg) nounwind {
; Zero-extends %arg to <2 x i64> and multiplies it by a constant vector whose
; elements are both 1234, a value that fits in an unsigned 32-bit element.
; The directives expect the multiply to be emitted as vmull.u32.
434; CHECK: vmull_extvec_u32
435; CHECK: vmull.u32
436 %tmp3 = zext <2 x i32> %arg to <2 x i64>
437 %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
438 ret <2 x i64> %tmp4
439}
Evan Cheng78fe9ab2011-03-29 01:56:09 +0000440
441; rdar://9197392
442define void @distribue(i16* %dst, i8* %src, i32 %mul) nounwind {
; Computes (a + b) * m on <8 x i16> values and stores the result to %dst:
;   a, b = the two 8-byte halves of a 16-byte vld1 from %src, zero-extended
;   m    = the low 8 bits of %mul broadcast to 8 lanes, zero-extended
; The FileCheck directives expect a vmull.u8 followed by a vmlal.u8 that
; accumulates into the same q register ([[REG1]]) and reuses the same
; d-register operand ([[REG2]]), i.e. the multiply appears to be distributed
; over the add (presumably what the function name refers to).
443entry:
444; CHECK: distribue:
445; CHECK: vmull.u8 [[REG1:(q[0-9]+)]], d{{.*}}, [[REG2:(d[0-9]+)]]
446; CHECK: vmlal.u8 [[REG1]], d{{.*}}, [[REG2]]
; Broadcast the truncated multiplier to every lane of an <8 x i8> vector.
447 %0 = trunc i32 %mul to i8
448 %1 = insertelement <8 x i8> undef, i8 %0, i32 0
449 %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
; Load 16 bytes, then split them into two 8-byte halves by viewing the
; vector as <2 x double> and extracting elements 1 and 0.
450 %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %src, i32 1)
451 %4 = bitcast <16 x i8> %3 to <2 x double>
452 %5 = extractelement <2 x double> %4, i32 1
453 %6 = bitcast double %5 to <8 x i8>
454 %7 = zext <8 x i8> %6 to <8 x i16>
455 %8 = zext <8 x i8> %2 to <8 x i16>
456 %9 = extractelement <2 x double> %4, i32 0
457 %10 = bitcast double %9 to <8 x i8>
458 %11 = zext <8 x i8> %10 to <8 x i16>
; Sum the two widened halves, scale by the widened multiplier, and store.
459 %12 = add <8 x i16> %7, %11
460 %13 = mul <8 x i16> %12, %8
461 %14 = bitcast i16* %dst to i8*
462 tail call void @llvm.arm.neon.vst1.v8i16(i8* %14, <8 x i16> %13, i32 2)
463 ret void
464}
465
466declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) nounwind readonly
467
468declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind