; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
2
3
; ---- sabdl / sabdl2: signed absolute difference, widening ----
; Each test checks that (zext (sabd a, b)) is selected as a single
; widening sabdl instruction.  The "2" variants extract the high half
; of 128-bit inputs via shufflevector and must select sabdl2.
define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl8h:
;CHECK: sabdl.8h
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl4s:
;CHECK: sabdl.4s
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2d:
;CHECK: sabdl.2d
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

; High-half variants: the shufflevector selects elements n/2..n-1.
define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl2_8h:
;CHECK: sabdl2.8h
  %load1 = load <16 x i8>* %A
  %load2 = load <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl2_4s:
;CHECK: sabdl2.4s
  %load1 = load <8 x i16>* %A
  %load2 = load <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2_2d:
;CHECK: sabdl2.2d
  %load1 = load <4 x i32>* %A
  %load2 = load <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
69
; ---- uabdl / uabdl2: unsigned absolute difference, widening ----
; Mirrors the sabdl tests above with the unsigned uabd intrinsic;
; (zext (uabd a, b)) must select a single uabdl / uabdl2.
define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl8h:
;CHECK: uabdl.8h
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl4s:
;CHECK: uabdl.4s
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2d:
;CHECK: uabdl.2d
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}

; High-half variants must select uabdl2.
define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl2_8h:
;CHECK: uabdl2.8h
  %load1 = load <16 x i8>* %A
  %load2 = load <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}

define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl2_4s:
;CHECK: uabdl2.4s
  %load1 = load <8 x i16>* %A
  %load2 = load <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}

define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2_2d:
;CHECK: uabdl2.2d
  %load1 = load <4 x i32>* %A
  %load2 = load <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
136
; ---- fabd (vector): floating-point absolute difference ----
; The fabd intrinsic must select the vector fabd instruction with the
; matching arrangement specifier.
define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s:
;CHECK: fabd.2s
  %tmp1 = load <2 x float>* %A
  %tmp2 = load <2 x float>* %B
  %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}

define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s:
;CHECK: fabd.4s
  %tmp1 = load <4 x float>* %A
  %tmp2 = load <4 x float>* %B
  %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
  ret <4 x float> %tmp3
}

define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d:
;CHECK: fabd.2d
  %tmp1 = load <2 x double>* %A
  %tmp2 = load <2 x double>* %B
  %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
  ret <2 x double> %tmp3
}

declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
167
; ---- sabd: signed absolute difference, non-widening ----
; One test per legal vector arrangement (8b/16b/4h/8h/2s/4s).
define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_8b:
;CHECK: sabd.8b
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_16b:
;CHECK: sabd.16b
  %tmp1 = load <16 x i8>* %A
  %tmp2 = load <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_4h:
;CHECK: sabd.4h
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_8h:
;CHECK: sabd.8h
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_2s:
;CHECK: sabd.2s
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_4s:
;CHECK: sabd.4s
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
228
; ---- uabd: unsigned absolute difference, non-widening ----
; One test per legal vector arrangement (8b/16b/4h/8h/2s/4s).
define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_8b:
;CHECK: uabd.8b
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_16b:
;CHECK: uabd.16b
  %tmp1 = load <16 x i8>* %A
  %tmp2 = load <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_4h:
;CHECK: uabd.4h
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}

define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_8h:
;CHECK: uabd.8h
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}

define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_2s:
;CHECK: uabd.2s
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}

define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_4s:
;CHECK: uabd.4s
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
289
; ---- sqabs: signed saturating absolute value ----
define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_8b:
;CHECK: sqabs.8b
  %tmp1 = load <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_16b:
;CHECK: sqabs.16b
  %tmp1 = load <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_4h:
;CHECK: sqabs.4h
  %tmp1 = load <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_8h:
;CHECK: sqabs.8h
  %tmp1 = load <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_2s:
;CHECK: sqabs.2s
  %tmp1 = load <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_4s:
;CHECK: sqabs.4s
  %tmp1 = load <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
344
; ---- sqneg: signed saturating negate ----
define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_8b:
;CHECK: sqneg.8b
  %tmp1 = load <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_16b:
;CHECK: sqneg.16b
  %tmp1 = load <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_4h:
;CHECK: sqneg.4h
  %tmp1 = load <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_8h:
;CHECK: sqneg.8h
  %tmp1 = load <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_2s:
;CHECK: sqneg.2s
  %tmp1 = load <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_4s:
;CHECK: sqneg.4s
  %tmp1 = load <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
399
; ---- abs: integer absolute value ----
; Vector forms per arrangement, plus the scalar d-register form both as
; <1 x i64> and as a plain i64 intrinsic.
define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: abs_8b:
;CHECK: abs.8b
  %tmp1 = load <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}

define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: abs_16b:
;CHECK: abs.16b
  %tmp1 = load <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}

define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: abs_4h:
;CHECK: abs.4h
  %tmp1 = load <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}

define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: abs_8h:
;CHECK: abs.8h
  %tmp1 = load <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}

define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: abs_2s:
;CHECK: abs.2s
  %tmp1 = load <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}

define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: abs_4s:
;CHECK: abs.4s
  %tmp1 = load <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}

; Scalar form via a single-element vector.
define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
; CHECK-LABEL: abs_1d:
; CHECK: abs d0, d0
  %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
  ret <1 x i64> %abs
}

; Scalar form via the plain-i64 intrinsic overload.
define i64 @abs_1d_honestly(i64 %A) nounwind {
; CHECK-LABEL: abs_1d_honestly:
; CHECK: abs d0, d0
  %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
  ret i64 %abs
}

declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
470
; ---- sabal / sabal2: signed absolute difference and accumulate, widening ----
; Checks that (add acc, (zext (sabd a, b))) is selected as a single
; sabal; the "2" variants feed the high halves of 128-bit inputs and
; must select sabal2.
define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal8h:
;CHECK: sabal.8h
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = load <8 x i16>* %C
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal4s:
;CHECK: sabal.4s
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = load <4 x i32>* %C
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2d:
;CHECK: sabal.2d
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = load <2 x i64>* %C
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  ; Removed dead duplicate "%tmp4.1.1 = zext ..." -- it had no uses and
  ; only cluttered the test.
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal2_8h:
;CHECK: sabal2.8h
  %load1 = load <16 x i8>* %A
  %load2 = load <16 x i8>* %B
  %tmp3 = load <8 x i16>* %C
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal2_4s:
;CHECK: sabal2.4s
  %load1 = load <8 x i16>* %A
  %load2 = load <8 x i16>* %B
  %tmp3 = load <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2_2d:
;CHECK: sabal2.2d
  %load1 = load <4 x i32>* %A
  %load2 = load <4 x i32>* %B
  %tmp3 = load <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}
549
; ---- uabal / uabal2: unsigned absolute difference and accumulate ----
; Mirrors the sabal tests with the unsigned uabd intrinsic.
define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal8h:
;CHECK: uabal.8h
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = load <8 x i16>* %C
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal4s:
;CHECK: uabal.4s
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = load <4 x i32>* %C
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2d:
;CHECK: uabal.2d
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = load <2 x i64>* %C
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}

define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal2_8h:
;CHECK: uabal2.8h
  %load1 = load <16 x i8>* %A
  %load2 = load <16 x i8>* %B
  %tmp3 = load <8 x i16>* %C
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}

define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal2_4s:
;CHECK: uabal2.4s
  %load1 = load <8 x i16>* %A
  %load2 = load <8 x i16>* %B
  %tmp3 = load <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}

define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2_2d:
;CHECK: uabal2.2d
  %load1 = load <4 x i32>* %A
  %load2 = load <4 x i32>* %B
  %tmp3 = load <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}
627
; ---- saba: signed absolute difference and accumulate, non-widening ----
; Checks that (add (sabd a, b), acc) selects a single saba.
define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: saba_8b:
;CHECK: saba.8b
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = load <8 x i8>* %C
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: saba_16b:
;CHECK: saba.16b
  %tmp1 = load <16 x i8>* %A
  %tmp2 = load <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  %tmp4 = load <16 x i8>* %C
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: saba_4h:
;CHECK: saba.4h
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = load <4 x i16>* %C
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: saba_8h:
;CHECK: saba.8h
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  %tmp4 = load <8 x i16>* %C
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: saba_2s:
;CHECK: saba.2s
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = load <2 x i32>* %C
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: saba_4s:
;CHECK: saba.4s
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  %tmp4 = load <4 x i32>* %C
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}
693
; ---- uaba: unsigned absolute difference and accumulate, non-widening ----
; Mirrors the saba tests with the unsigned uabd intrinsic.
define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_8b:
;CHECK: uaba.8b
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = load <8 x i8>* %C
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_16b:
;CHECK: uaba.16b
  %tmp1 = load <16 x i8>* %A
  %tmp2 = load <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  %tmp4 = load <16 x i8>* %C
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_4h:
;CHECK: uaba.4h
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = load <4 x i16>* %C
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_8h:
;CHECK: uaba.8h
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  %tmp4 = load <8 x i16>* %C
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_2s:
;CHECK: uaba.2s
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = load <2 x i32>* %C
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_4s:
;CHECK: uaba.4s
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  %tmp4 = load <4 x i32>* %C
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}
759
; Scalar FABD: the absolute difference of two floating-point scalars
; must select the scalar fabd instruction on the s/d registers.
define float @fabds(float %a, float %b) nounwind {
; CHECK-LABEL: fabds:
; CHECK: fabd s0, s0, s1
  %diff = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
  ret float %diff
}

define double @fabdd(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd:
; CHECK: fabd d0, d0, d1
  %diff = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
  ret double %diff
}

declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
777
; ---- uabdl2/sabdl2 formed from extract + dup'd scalar ----
; The high-half extract on the LHS must fold into uabdl2/sabdl2
; directly; no ext.16b may be emitted to move the high half down.
define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: uabdl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}

define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: sabdl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}