; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64
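
; These tests shuffle together the results of pairs of horizontal add/sub and
; saturating pack intrinsics using unpckl/unpckh patterns; the CHECK lines
; record the current lowering, including the 128-bit packus cases where the
; unpack folds into a single pack instruction.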

;
; 128-bit Vectors
;

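; haddps(a, b) returns <a0+a1, a2+a3, b0+b1, b2+b3>, so concatenating the low
; 64-bit halves of two hadd results with vunpcklpd gives the horizontal sums
; of %a0 and %a2.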
define <4 x float> @test_unpackl_fhadd_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
; X32-LABEL: test_unpackl_fhadd_128:
; X32: ## BB#0:
; X32-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; X32-NEXT: vhaddps %xmm3, %xmm2, %xmm1
; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_fhadd_128:
; X64: ## BB#0:
; X64-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vhaddps %xmm3, %xmm2, %xmm1
; X64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
  %1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
  %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a2, <4 x float> %a3)
  %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %3
}

define <2 x double> @test_unpackh_fhadd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
; X32-LABEL: test_unpackh_fhadd_128:
; X32: ## BB#0:
; X32-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; X32-NEXT: vhaddpd %xmm3, %xmm2, %xmm1
; X32-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_fhadd_128:
; X64: ## BB#0:
; X64-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vhaddpd %xmm3, %xmm2, %xmm1
; X64-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X64-NEXT: retq
  %1 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
  %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a2, <2 x double> %a3)
  %3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %3
}

define <2 x double> @test_unpackl_fhsub_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
; X32-LABEL: test_unpackl_fhsub_128:
; X32: ## BB#0:
; X32-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; X32-NEXT: vhsubpd %xmm3, %xmm2, %xmm1
; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_fhsub_128:
; X64: ## BB#0:
; X64-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vhsubpd %xmm3, %xmm2, %xmm1
; X64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
  %1 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
  %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a2, <2 x double> %a3)
  %3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %3
}

define <4 x float> @test_unpackh_fhsub_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
; X32-LABEL: test_unpackh_fhsub_128:
; X32: ## BB#0:
; X32-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; X32-NEXT: vhsubps %xmm3, %xmm2, %xmm1
; X32-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_fhsub_128:
; X64: ## BB#0:
; X64-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; X64-NEXT: vhsubps %xmm3, %xmm2, %xmm1
; X64-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X64-NEXT: retq
  %1 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
  %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a2, <4 x float> %a3)
  %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x float> %3
}

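; The integer horizontal ops follow the same pattern, with vpunpcklqdq and
; vpunpckhqdq interleaving the 64-bit halves of the vphadd/vphsub results.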
define <8 x i16> @test_unpackl_hadd_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; X32-LABEL: test_unpackl_hadd_128:
; X32: ## BB#0:
; X32-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; X32-NEXT: vphaddw %xmm3, %xmm2, %xmm1
; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_hadd_128:
; X64: ## BB#0:
; X64-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; X64-NEXT: vphaddw %xmm3, %xmm2, %xmm1
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
  %1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i16> %3
}

define <4 x i32> @test_unpackh_hadd_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; X32-LABEL: test_unpackh_hadd_128:
; X32: ## BB#0:
; X32-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; X32-NEXT: vphaddd %xmm3, %xmm2, %xmm1
; X32-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_hadd_128:
; X64: ## BB#0:
; X64-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; X64-NEXT: vphaddd %xmm3, %xmm2, %xmm1
; X64-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X64-NEXT: retq
  %1 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a2, <4 x i32> %a3)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i32> %3
}

define <4 x i32> @test_unpackl_hsub_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; X32-LABEL: test_unpackl_hsub_128:
; X32: ## BB#0:
; X32-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; X32-NEXT: vphsubd %xmm3, %xmm2, %xmm1
; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_hsub_128:
; X64: ## BB#0:
; X64-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; X64-NEXT: vphsubd %xmm3, %xmm2, %xmm1
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
  %1 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a2, <4 x i32> %a3)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i32> %3
}

define <8 x i16> @test_unpackh_hsub_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; X32-LABEL: test_unpackh_hsub_128:
; X32: ## BB#0:
; X32-NEXT: vphsubw %xmm1, %xmm0, %xmm0
; X32-NEXT: vphsubw %xmm3, %xmm2, %xmm1
; X32-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_hsub_128:
; X64: ## BB#0:
; X64-NEXT: vphsubw %xmm1, %xmm0, %xmm0
; X64-NEXT: vphsubw %xmm3, %xmm2, %xmm1
; X64-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X64-NEXT: retq
  %1 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %3
}

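; packsswb(a, b) packs saturated a into the low 8 bytes and saturated b into
; the high 8 bytes, so the unpck interleave of two pack results has the same
; shape as for the horizontal ops above.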
define <16 x i8> @test_unpackl_packss_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; X32-LABEL: test_unpackl_packss_128:
; X32: ## BB#0:
; X32-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; X32-NEXT: vpacksswb %xmm3, %xmm2, %xmm1
; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_packss_128:
; X64: ## BB#0:
; X64-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; X64-NEXT: vpacksswb %xmm3, %xmm2, %xmm1
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <16 x i8> %3
}

define <8 x i16> @test_unpackh_packss_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; X32-LABEL: test_unpackh_packss_128:
; X32: ## BB#0:
; X32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X32-NEXT: vpackssdw %xmm3, %xmm2, %xmm1
; X32-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_packss_128:
; X64: ## BB#0:
; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm1
; X64-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X64-NEXT: retq
  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a2, <4 x i32> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %3
}

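; Unpacking the low 64-bit halves of two packusdw results selects the packed
; %a0 and %a2 lanes, so the shuffle folds to a single vpackusdw of %a0 and %a2.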
define <8 x i16> @test_unpackl_packus_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; X32-LABEL: test_unpackl_packus_128:
; X32: ## BB#0:
; X32-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_packus_128:
; X64: ## BB#0:
; X64-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; X64-NEXT: retq
  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a2, <4 x i32> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i16> %3
}

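; Likewise, unpacking the high halves selects the packed %a1 and %a3 lanes,
; folding to a single vpackuswb of those operands.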
define <16 x i8> @test_unpackh_packus_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; X32-LABEL: test_unpackh_packus_128:
; X32: ## BB#0:
; X32-NEXT: vpackuswb %xmm3, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_packus_128:
; X64: ## BB#0:
; X64-NEXT: vpackuswb %xmm3, %xmm1, %xmm0
; X64-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %3
}

;
; 256-bit Vectors
;
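; The 256-bit unpck instructions interleave within each 128-bit lane, which is
; why the shuffle masks below draw from both halves of each source vector
; (e.g. <i32 0, i32 4, i32 2, i32 6> for a vunpcklpd of two v4f64 results).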

define <8 x float> @test_unpackl_fhadd_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> %a3) {
; X32-LABEL: test_unpackl_fhadd_256:
; X32: ## BB#0:
; X32-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; X32-NEXT: vhaddps %ymm3, %ymm2, %ymm1
; X32-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_fhadd_256:
; X64: ## BB#0:
; X64-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: vhaddps %ymm3, %ymm2, %ymm1
; X64-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X64-NEXT: retq
  %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
  %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a2, <8 x float> %a3)
  %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
  ret <8 x float> %3
}

define <4 x double> @test_unpackh_fhadd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> %a3) {
; X32-LABEL: test_unpackh_fhadd_256:
; X32: ## BB#0:
; X32-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; X32-NEXT: vhaddpd %ymm3, %ymm2, %ymm1
; X32-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_fhadd_256:
; X64: ## BB#0:
; X64-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vhaddpd %ymm3, %ymm2, %ymm1
; X64-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X64-NEXT: retq
  %1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
  %2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a2, <4 x double> %a3)
  %3 = shufflevector <4 x double> %1, <4 x double> %2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x double> %3
}

define <4 x double> @test_unpackl_fhsub_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> %a3) {
; X32-LABEL: test_unpackl_fhsub_256:
; X32: ## BB#0:
; X32-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
; X32-NEXT: vhsubpd %ymm3, %ymm2, %ymm1
; X32-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_fhsub_256:
; X64: ## BB#0:
; X64-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vhsubpd %ymm3, %ymm2, %ymm1
; X64-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X64-NEXT: retq
  %1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  %2 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a2, <4 x double> %a3)
  %3 = shufflevector <4 x double> %1, <4 x double> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %3
}

define <8 x float> @test_unpackh_fhsub_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> %a3) {
; X32-LABEL: test_unpackh_fhsub_256:
; X32: ## BB#0:
; X32-NEXT: vhsubps %ymm1, %ymm0, %ymm0
; X32-NEXT: vhsubps %ymm3, %ymm2, %ymm1
; X32-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_fhsub_256:
; X64: ## BB#0:
; X64-NEXT: vhsubps %ymm1, %ymm0, %ymm0
; X64-NEXT: vhsubps %ymm3, %ymm2, %ymm1
; X64-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X64-NEXT: retq
  %1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  %2 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a2, <8 x float> %a3)
  %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
  ret <8 x float> %3
}

define <16 x i16> @test_unpackl_hadd_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; X32-LABEL: test_unpackl_hadd_256:
; X32: ## BB#0:
; X32-NEXT: vphaddw %ymm1, %ymm0, %ymm0
; X32-NEXT: vphaddw %ymm3, %ymm2, %ymm1
; X32-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_hadd_256:
; X64: ## BB#0:
; X64-NEXT: vphaddw %ymm1, %ymm0, %ymm0
; X64-NEXT: vphaddw %ymm3, %ymm2, %ymm1
; X64-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X64-NEXT: retq
  %1 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a2, <16 x i16> %a3)
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27>
  ret <16 x i16> %3
}

define <8 x i32> @test_unpackh_hadd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; X32-LABEL: test_unpackh_hadd_256:
; X32: ## BB#0:
; X32-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; X32-NEXT: vphaddd %ymm3, %ymm2, %ymm1
; X32-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_hadd_256:
; X64: ## BB#0:
; X64-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; X64-NEXT: vphaddd %ymm3, %ymm2, %ymm1
; X64-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X64-NEXT: retq
  %1 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a2, <8 x i32> %a3)
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
  ret <8 x i32> %3
}

define <8 x i32> @test_unpackl_hsub_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; X32-LABEL: test_unpackl_hsub_256:
; X32: ## BB#0:
; X32-NEXT: vphsubd %ymm1, %ymm0, %ymm0
; X32-NEXT: vphsubd %ymm3, %ymm2, %ymm1
; X32-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_hsub_256:
; X64: ## BB#0:
; X64-NEXT: vphsubd %ymm1, %ymm0, %ymm0
; X64-NEXT: vphsubd %ymm3, %ymm2, %ymm1
; X64-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X64-NEXT: retq
  %1 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a2, <8 x i32> %a3)
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
  ret <8 x i32> %3
}

define <16 x i16> @test_unpackh_hsub_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; X32-LABEL: test_unpackh_hsub_256:
; X32: ## BB#0:
; X32-NEXT: vphsubw %ymm1, %ymm0, %ymm0
; X32-NEXT: vphsubw %ymm3, %ymm2, %ymm1
; X32-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_hsub_256:
; X64: ## BB#0:
; X64-NEXT: vphsubw %ymm1, %ymm0, %ymm0
; X64-NEXT: vphsubw %ymm3, %ymm2, %ymm1
; X64-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X64-NEXT: retq
  %1 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a2, <16 x i16> %a3)
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i16> %3
}

define <32 x i8> @test_unpackl_packss_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; X32-LABEL: test_unpackl_packss_256:
; X32: ## BB#0:
; X32-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
; X32-NEXT: vpacksswb %ymm3, %ymm2, %ymm1
; X32-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_packss_256:
; X64: ## BB#0:
; X64-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
; X64-NEXT: vpacksswb %ymm3, %ymm2, %ymm1
; X64-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X64-NEXT: retq
  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a2, <16 x i16> %a3)
  %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
  ret <32 x i8> %3
}

define <16 x i16> @test_unpackh_packss_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; X32-LABEL: test_unpackh_packss_256:
; X32: ## BB#0:
; X32-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; X32-NEXT: vpackssdw %ymm3, %ymm2, %ymm1
; X32-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_packss_256:
; X64: ## BB#0:
; X64-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; X64-NEXT: vpackssdw %ymm3, %ymm2, %ymm1
; X64-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X64-NEXT: retq
  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a2, <8 x i32> %a3)
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i16> %3
}

define <16 x i16> @test_unpackl_packus_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; X32-LABEL: test_unpackl_packus_256:
; X32: ## BB#0:
; X32-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; X32-NEXT: vpackusdw %ymm3, %ymm2, %ymm1
; X32-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_packus_256:
; X64: ## BB#0:
; X64-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; X64-NEXT: vpackusdw %ymm3, %ymm2, %ymm1
; X64-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X64-NEXT: retq
  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a2, <8 x i32> %a3)
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27>
  ret <16 x i16> %3
}

define <32 x i8> @test_unpackh_packus_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; X32-LABEL: test_unpackh_packus_256:
; X32: ## BB#0:
; X32-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; X32-NEXT: vpackuswb %ymm3, %ymm2, %ymm1
; X32-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_packus_256:
; X64: ## BB#0:
; X64-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; X64-NEXT: vpackuswb %ymm3, %ymm2, %ymm1
; X64-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X64-NEXT: retq
  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a2, <16 x i16> %a3)
  %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  ret <32 x i8> %3
}

declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>)

declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>)

declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)

declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>)
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>)

declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>)
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>)
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>)
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>)

declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>)
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>)
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>)