; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
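
; These tests exercise strided-extraction ("deinterleaving") shuffles that
; narrow a 256-bit vector loaded from memory. Results narrower than 128 bits
; are stored with vmovq/vmovd on most targets; the AVX512VL variants can
; instead finish with a truncating store (vpmovwb/vpmovdw/vpmovdb).

; Extract the odd bytes (indices 1,3,...,31) of a <32 x i8> into a <16 x i8>.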
define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}
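
; Extract the odd i16 elements (indices 1,3,...,15) of a <16 x i16> into an <8 x i16>.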
define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}
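
; Extract the odd i32 elements of an <8 x i32> into a <4 x i32>; on every
; target here this folds to a single vshufps.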
define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX: # BB#0:
; AVX-NEXT: vmovaps (%rdi), %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT: vmovaps %xmm0, (%rsi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX512: # BB#0:
; AVX512-NEXT: vmovaps (%rdi), %ymm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512-NEXT: vmovaps %xmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}
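
; Extract every fourth byte of a <32 x i8>, starting at index 1, into an <8 x i8>.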
define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,5,5,9,9,13,13,13,13,5,5,12,12,13,13]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
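
; Extract every fourth byte of a <32 x i8>, starting at index 2, into an <8 x i8>.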
define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
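
; Extract every fourth byte of a <32 x i8>, starting at index 3, into an <8 x i8>.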
define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,7,7,11,11,15,15,7,7,15,15,6,6,7,7]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
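
; Extract every fourth i16 element of a <16 x i16>, starting at index 1, into a <4 x i16>.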
define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}
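
; Extract every fourth i16 element of a <16 x i16>, starting at index 2, into a <4 x i16>.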
define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0
; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}
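
; Extract every fourth i16 element of a <16 x i16>, starting at index 3, into a <4 x i16>.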
define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}
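
; Extract every eighth byte of a <32 x i8>, starting at index 1, into a <4 x i8>.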
define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}
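
; Extract every eighth byte of a <32 x i8>, starting at index 2, into a <4 x i8>.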
define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}
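
; Extract every eighth byte of a <32 x i8>, starting at index 3, into a <4 x i8>.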
define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}
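
; Extract every eighth byte of a <32 x i8>, starting at index 4, into a <4 x i8>.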
define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0
; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}
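
; Extract every eighth byte of a <32 x i8>, starting at index 5, into a <4 x i8>.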
define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}
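
; Extract every eighth byte of a <32 x i8>, starting at index 6, into a <4 x i8>.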
define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}
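
; Extract every eighth byte of a <32 x i8>, starting at index 7, into a <4 x i8>.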
define void @shuffle_v32i8_to_v4i8_7(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}