; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL

define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
; Select the odd-indexed bytes (indices 1,3,...,31) of the loaded <32 x i8>.
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
; Select the odd-indexed i16 elements (indices 1,3,...,15).
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps (%rdi), %ymm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
; Select the odd-indexed i32 elements (indices 1,3,5,7).
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,5,5,9,9,13,13,13,13,5,5,12,12,13,13]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
; Select bytes at indices 1,5,9,...,29 (stride 4, offset 1).
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
; Select bytes at indices 2,6,10,...,30 (stride 4, offset 2).
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,3,7,7,11,11,15,15,7,7,15,15,6,6,7,7]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
; Select bytes at indices 3,7,11,...,31 (stride 4, offset 3).
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
; Select i16 elements at indices 1,5,9,13 (stride 4, offset 1).
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovaps (%rdi), %ymm0
; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovaps (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
; Select i16 elements at indices 2,6,10,14 (stride 4, offset 2).
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
; Select i16 elements at indices 3,7,11,15 (stride 4, offset 3).
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

; NOTE(review): the *-LABEL/*-NEXT assertions below are autogenerated by
; utils/update_llc_test_checks.py (see file header) — do not hand-edit them;
; regenerate with the script instead. Git-blame residue is embedded in these
; lines (scraped blame view); confirm bytes against the clean upstream file.
; Test intent (from the IR body): store <4 x i8> = elements {1,9,17,25} of a
; loaded <32 x i8>, i.e. a stride-8 byte extraction starting at offset 1,
; checked across AVX1/AVX2/AVX512F/VL/BW/BWVL lowerings.
595define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
596; AVX1-LABEL: shuffle_v32i8_to_v4i8_1:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000597; AVX1: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000598; AVX1-NEXT: vmovdqa (%rdi), %ymm0
599; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
600; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
601; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
602; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
603; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
604; AVX1-NEXT: vmovd %xmm0, (%rsi)
605; AVX1-NEXT: vzeroupper
606; AVX1-NEXT: retq
607;
608; AVX2-LABEL: shuffle_v32i8_to_v4i8_1:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000609; AVX2: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000610; AVX2-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000611; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
612; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
613; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
614; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
615; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000616; AVX2-NEXT: vmovd %xmm0, (%rsi)
617; AVX2-NEXT: vzeroupper
618; AVX2-NEXT: retq
619;
620; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000621; AVX512F: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000622; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000623; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
624; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
625; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
626; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
627; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000628; AVX512F-NEXT: vmovd %xmm0, (%rsi)
629; AVX512F-NEXT: vzeroupper
630; AVX512F-NEXT: retq
631;
632; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000633; AVX512VL: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000634; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000635; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
Simon Pilgrimd873b6f2017-12-19 16:54:07 +0000636; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
Wei Mi1736efd2017-10-12 00:24:52 +0000637; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
Wei Mi1736efd2017-10-12 00:24:52 +0000638; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
Wei Mi1736efd2017-10-12 00:24:52 +0000639; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
640; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000641; AVX512VL-NEXT: vzeroupper
642; AVX512VL-NEXT: retq
643;
644; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000645; AVX512BW: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000646; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000647; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
648; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
649; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
650; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
651; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000652; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
653; AVX512BW-NEXT: vzeroupper
654; AVX512BW-NEXT: retq
655;
656; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000657; AVX512BWVL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000658; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
659; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
Simon Pilgrimd873b6f2017-12-19 16:54:07 +0000660; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
Wei Mi1736efd2017-10-12 00:24:52 +0000661; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
Wei Mi1736efd2017-10-12 00:24:52 +0000662; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
Wei Mi1736efd2017-10-12 00:24:52 +0000663; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
664; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000665; AVX512BWVL-NEXT: vzeroupper
666; AVX512BWVL-NEXT: retq
667 %vec = load <32 x i8>, <32 x i8>* %L
668 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
669 store <4 x i8> %strided.vec, <4 x i8>* %S
670 ret void
671}
672
; NOTE(review): autogenerated FileCheck assertions (utils/update_llc_test_checks.py,
; see file header) — regenerate rather than hand-editing *-LABEL/*-NEXT lines.
; Git-blame residue is embedded in these lines (scraped blame view); confirm
; against the clean upstream file.
; Test intent (from the IR body): store <4 x i8> = elements {2,10,18,26} of a
; loaded <32 x i8> (stride-8 byte extraction, offset 2). Note the AVX512VL/BWVL
; paths here use vpshufd+vpshuflw rather than vpshufb, per the checked output.
673define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
674; AVX1-LABEL: shuffle_v32i8_to_v4i8_2:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000675; AVX1: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000676; AVX1-NEXT: vmovdqa (%rdi), %ymm0
677; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
678; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
679; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
680; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
681; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
682; AVX1-NEXT: vmovd %xmm0, (%rsi)
683; AVX1-NEXT: vzeroupper
684; AVX1-NEXT: retq
685;
686; AVX2-LABEL: shuffle_v32i8_to_v4i8_2:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000687; AVX2: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000688; AVX2-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000689; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
690; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
691; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
692; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
693; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000694; AVX2-NEXT: vmovd %xmm0, (%rsi)
695; AVX2-NEXT: vzeroupper
696; AVX2-NEXT: retq
697;
698; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000699; AVX512F: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000700; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000701; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
702; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
703; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
704; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
705; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000706; AVX512F-NEXT: vmovd %xmm0, (%rsi)
707; AVX512F-NEXT: vzeroupper
708; AVX512F-NEXT: retq
709;
710; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000711; AVX512VL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000712; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
713; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
714; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
715; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
716; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
717; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
718; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
719; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000720; AVX512VL-NEXT: vzeroupper
721; AVX512VL-NEXT: retq
722;
723; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000724; AVX512BW: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000725; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000726; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
727; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
728; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
729; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
730; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000731; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
732; AVX512BW-NEXT: vzeroupper
733; AVX512BW-NEXT: retq
734;
735; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000736; AVX512BWVL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000737; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
738; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
739; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
740; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
741; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
742; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
743; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
744; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000745; AVX512BWVL-NEXT: vzeroupper
746; AVX512BWVL-NEXT: retq
747 %vec = load <32 x i8>, <32 x i8>* %L
748 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
749 store <4 x i8> %strided.vec, <4 x i8>* %S
750 ret void
751}
752
; NOTE(review): autogenerated FileCheck assertions (utils/update_llc_test_checks.py,
; see file header) — regenerate rather than hand-editing *-LABEL/*-NEXT lines.
; Git-blame residue is embedded in these lines (scraped blame view); confirm
; against the clean upstream file.
; Test intent (from the IR body): store <4 x i8> = elements {3,11,19,27} of a
; loaded <32 x i8> (stride-8 byte extraction, offset 3).
753define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
754; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000755; AVX1: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000756; AVX1-NEXT: vmovdqa (%rdi), %ymm0
757; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
758; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
759; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
760; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
761; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
762; AVX1-NEXT: vmovd %xmm0, (%rsi)
763; AVX1-NEXT: vzeroupper
764; AVX1-NEXT: retq
765;
766; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000767; AVX2: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000768; AVX2-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000769; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
770; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
771; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
772; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
773; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000774; AVX2-NEXT: vmovd %xmm0, (%rsi)
775; AVX2-NEXT: vzeroupper
776; AVX2-NEXT: retq
777;
778; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000779; AVX512F: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000780; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000781; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
782; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
783; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
784; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
785; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000786; AVX512F-NEXT: vmovd %xmm0, (%rsi)
787; AVX512F-NEXT: vzeroupper
788; AVX512F-NEXT: retq
789;
790; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000791; AVX512VL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000792; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
793; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
Simon Pilgrimd873b6f2017-12-19 16:54:07 +0000794; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
Wei Mi1736efd2017-10-12 00:24:52 +0000795; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
Wei Mi1736efd2017-10-12 00:24:52 +0000796; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
Wei Mi1736efd2017-10-12 00:24:52 +0000797; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
798; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000799; AVX512VL-NEXT: vzeroupper
800; AVX512VL-NEXT: retq
801;
802; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000803; AVX512BW: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000804; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000805; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
806; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
807; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
808; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
809; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000810; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
811; AVX512BW-NEXT: vzeroupper
812; AVX512BW-NEXT: retq
813;
814; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000815; AVX512BWVL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000816; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
817; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
Simon Pilgrimd873b6f2017-12-19 16:54:07 +0000818; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
Wei Mi1736efd2017-10-12 00:24:52 +0000819; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
Wei Mi1736efd2017-10-12 00:24:52 +0000820; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
Wei Mi1736efd2017-10-12 00:24:52 +0000821; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
822; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000823; AVX512BWVL-NEXT: vzeroupper
824; AVX512BWVL-NEXT: retq
825 %vec = load <32 x i8>, <32 x i8>* %L
826 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
827 store <4 x i8> %strided.vec, <4 x i8>* %S
828 ret void
829}
830
; NOTE(review): autogenerated FileCheck assertions (utils/update_llc_test_checks.py,
; see file header) — regenerate rather than hand-editing *-LABEL/*-NEXT lines.
; Git-blame residue is embedded in these lines (scraped blame view); confirm
; against the clean upstream file.
; Test intent (from the IR body): store <4 x i8> = elements {4,12,20,28} of a
; loaded <32 x i8> (stride-8 byte extraction, offset 4). The VL paths collapse
; to a single vshufps of dword lanes [1,3] per the checked output.
831define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind {
832; AVX1-LABEL: shuffle_v32i8_to_v4i8_4:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000833; AVX1: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000834; AVX1-NEXT: vmovdqa (%rdi), %ymm0
835; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
836; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
837; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
838; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
839; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
840; AVX1-NEXT: vmovd %xmm0, (%rsi)
841; AVX1-NEXT: vzeroupper
842; AVX1-NEXT: retq
843;
844; AVX2-LABEL: shuffle_v32i8_to_v4i8_4:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000845; AVX2: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000846; AVX2-NEXT: vmovdqa (%rdi), %ymm0
847; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
848; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
849; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
850; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
851; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000852; AVX2-NEXT: vmovd %xmm0, (%rsi)
853; AVX2-NEXT: vzeroupper
854; AVX2-NEXT: retq
855;
856; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000857; AVX512F: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000858; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
859; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
860; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
861; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
862; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
863; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000864; AVX512F-NEXT: vmovd %xmm0, (%rsi)
865; AVX512F-NEXT: vzeroupper
866; AVX512F-NEXT: retq
867;
868; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000869; AVX512VL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000870; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
871; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
872; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
873; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000874; AVX512VL-NEXT: vzeroupper
875; AVX512VL-NEXT: retq
876;
877; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000878; AVX512BW: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000879; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
880; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
881; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
882; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
883; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
884; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000885; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
886; AVX512BW-NEXT: vzeroupper
887; AVX512BW-NEXT: retq
888;
889; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000890; AVX512BWVL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000891; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0
892; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1
893; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
894; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000895; AVX512BWVL-NEXT: vzeroupper
896; AVX512BWVL-NEXT: retq
897 %vec = load <32 x i8>, <32 x i8>* %L
898 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
899 store <4 x i8> %strided.vec, <4 x i8>* %S
900 ret void
901}
902
; NOTE(review): autogenerated FileCheck assertions (utils/update_llc_test_checks.py,
; see file header) — regenerate rather than hand-editing *-LABEL/*-NEXT lines.
; Git-blame residue is embedded in these lines (scraped blame view); confirm
; against the clean upstream file.
; Test intent (from the IR body): store <4 x i8> = elements {5,13,21,29} of a
; loaded <32 x i8> (stride-8 byte extraction, offset 5).
903define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
904; AVX1-LABEL: shuffle_v32i8_to_v4i8_5:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000905; AVX1: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000906; AVX1-NEXT: vmovdqa (%rdi), %ymm0
907; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
908; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
909; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
910; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
911; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
912; AVX1-NEXT: vmovd %xmm0, (%rsi)
913; AVX1-NEXT: vzeroupper
914; AVX1-NEXT: retq
915;
916; AVX2-LABEL: shuffle_v32i8_to_v4i8_5:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000917; AVX2: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000918; AVX2-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000919; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
920; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
921; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
922; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
923; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000924; AVX2-NEXT: vmovd %xmm0, (%rsi)
925; AVX2-NEXT: vzeroupper
926; AVX2-NEXT: retq
927;
928; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000929; AVX512F: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000930; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000931; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
932; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
933; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
934; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
935; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000936; AVX512F-NEXT: vmovd %xmm0, (%rsi)
937; AVX512F-NEXT: vzeroupper
938; AVX512F-NEXT: retq
939;
940; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000941; AVX512VL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000942; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
943; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
Simon Pilgrimd873b6f2017-12-19 16:54:07 +0000944; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
945; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
946; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
Wei Mi1736efd2017-10-12 00:24:52 +0000947; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
948; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000949; AVX512VL-NEXT: vzeroupper
950; AVX512VL-NEXT: retq
951;
952; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000953; AVX512BW: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000954; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000955; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
956; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
957; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
958; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
959; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000960; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
961; AVX512BW-NEXT: vzeroupper
962; AVX512BW-NEXT: retq
963;
964; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000965; AVX512BWVL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000966; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
967; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
Simon Pilgrimd873b6f2017-12-19 16:54:07 +0000968; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
969; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
970; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
Wei Mi1736efd2017-10-12 00:24:52 +0000971; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
972; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000973; AVX512BWVL-NEXT: vzeroupper
974; AVX512BWVL-NEXT: retq
975 %vec = load <32 x i8>, <32 x i8>* %L
976 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
977 store <4 x i8> %strided.vec, <4 x i8>* %S
978 ret void
979}
980
; NOTE(review): autogenerated FileCheck assertions (utils/update_llc_test_checks.py,
; see file header) — regenerate rather than hand-editing *-LABEL/*-NEXT lines.
; Git-blame residue is embedded in these lines (scraped blame view); confirm
; against the clean upstream file.
; Test intent (from the IR body): store <4 x i8> = elements {6,14,22,30} of a
; loaded <32 x i8> (stride-8 byte extraction, offset 6). The VL paths use
; vpshufd+vpshuflw instead of vpshufb, per the checked output.
981define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
982; AVX1-LABEL: shuffle_v32i8_to_v4i8_6:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000983; AVX1: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000984; AVX1-NEXT: vmovdqa (%rdi), %ymm0
985; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
986; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
987; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
988; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
989; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
990; AVX1-NEXT: vmovd %xmm0, (%rsi)
991; AVX1-NEXT: vzeroupper
992; AVX1-NEXT: retq
993;
994; AVX2-LABEL: shuffle_v32i8_to_v4i8_6:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000995; AVX2: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000996; AVX2-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000997; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
998; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
999; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1000; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1001; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001002; AVX2-NEXT: vmovd %xmm0, (%rsi)
1003; AVX2-NEXT: vzeroupper
1004; AVX2-NEXT: retq
1005;
1006; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001007; AVX512F: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001008; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +00001009; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
1010; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1011; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1012; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1013; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001014; AVX512F-NEXT: vmovd %xmm0, (%rsi)
1015; AVX512F-NEXT: vzeroupper
1016; AVX512F-NEXT: retq
1017;
1018; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001019; AVX512VL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +00001020; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
1021; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
1022; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
1023; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
1024; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
1025; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
1026; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1027; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001028; AVX512VL-NEXT: vzeroupper
1029; AVX512VL-NEXT: retq
1030;
1031; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001032; AVX512BW: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001033; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +00001034; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
1035; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1036; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1037; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1038; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001039; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
1040; AVX512BW-NEXT: vzeroupper
1041; AVX512BW-NEXT: retq
1042;
1043; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001044; AVX512BWVL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +00001045; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
1046; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
1047; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
1048; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
1049; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
1050; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
1051; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1052; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001053; AVX512BWVL-NEXT: vzeroupper
1054; AVX512BWVL-NEXT: retq
1055 %vec = load <32 x i8>, <32 x i8>* %L
1056 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
1057 store <4 x i8> %strided.vec, <4 x i8>* %S
1058 ret void
1059}
1060
; NOTE(review): autogenerated FileCheck assertions (utils/update_llc_test_checks.py,
; see file header) — regenerate rather than hand-editing *-LABEL/*-NEXT lines.
; Git-blame residue is embedded in these lines (scraped blame view); confirm
; against the clean upstream file.
; Test intent (from the IR body): store <4 x i8> = elements {7,15,23,31} of a
; loaded <32 x i8> (stride-8 byte extraction, offset 7 — the last byte of
; each 8-byte group).
1061define void @shuffle_v32i8_to_v4i8_7(<32 x i8>* %L, <4 x i8>* %S) nounwind {
1062; AVX1-LABEL: shuffle_v32i8_to_v4i8_7:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001063; AVX1: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001064; AVX1-NEXT: vmovdqa (%rdi), %ymm0
1065; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1066; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1067; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1068; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1069; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1070; AVX1-NEXT: vmovd %xmm0, (%rsi)
1071; AVX1-NEXT: vzeroupper
1072; AVX1-NEXT: retq
1073;
1074; AVX2-LABEL: shuffle_v32i8_to_v4i8_7:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001075; AVX2: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001076; AVX2-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +00001077; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1078; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1079; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1080; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1081; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001082; AVX2-NEXT: vmovd %xmm0, (%rsi)
1083; AVX2-NEXT: vzeroupper
1084; AVX2-NEXT: retq
1085;
1086; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001087; AVX512F: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001088; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +00001089; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
1090; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1091; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1092; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1093; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001094; AVX512F-NEXT: vmovd %xmm0, (%rsi)
1095; AVX512F-NEXT: vzeroupper
1096; AVX512F-NEXT: retq
1097;
1098; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001099; AVX512VL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +00001100; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
1101; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
1102; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
1103; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1104; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1105; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1106; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001107; AVX512VL-NEXT: vzeroupper
1108; AVX512VL-NEXT: retq
1109;
1110; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001111; AVX512BW: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001112; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +00001113; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
1114; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1115; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1116; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1117; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001118; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
1119; AVX512BW-NEXT: vzeroupper
1120; AVX512BW-NEXT: retq
1121;
1122; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001123; AVX512BWVL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +00001124; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
1125; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
1126; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
1127; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1128; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1129; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1130; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001131; AVX512BWVL-NEXT: vzeroupper
1132; AVX512BWVL-NEXT: retq
1133 %vec = load <32 x i8>, <32 x i8>* %L
1134 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
1135 store <4 x i8> %strided.vec, <4 x i8>* %S
1136 ret void
1137}
1138