blob: 081c962ab94a9fd35e55ecc35624ea7408aed417 [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
9
; Stride-2 deinterleave at offset 1: keep the odd-indexed bytes (1,3,...,31)
; of a <32 x i8> load and store them as <16 x i8>.
define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}
51
; Stride-2 deinterleave at offset 1: keep the odd-indexed i16 elements
; (1,3,...,15) of a <16 x i16> load and store them as <8 x i16>.
define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}
93
; Stride-2 deinterleave at offset 1: keep the odd-indexed i32 elements
; (1,3,5,7) of a <8 x i32> load and store them as <4 x i32>.
define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT: vmovaps %xmm0, (%rsi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps (%rdi), %ymm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512-NEXT: vmovaps %xmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}
117
; Stride-4 deinterleave at offset 1: keep bytes 1,5,9,...,29 of a <32 x i8>
; load and store them as <8 x i8>.
define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,5,5,9,9,13,13,13,13,5,5,12,12,13,13]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
195
; Stride-4 deinterleave at offset 2: keep bytes 2,6,10,...,30 of a <32 x i8>
; load and store them as <8 x i8>.
define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
273
; Stride-4 deinterleave at offset 3: keep bytes 3,7,11,...,31 of a <32 x i8>
; load and store them as <8 x i8>.
define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,7,7,11,11,15,15,7,7,15,15,6,6,7,7]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
351
; Stride-4 deinterleave at offset 1: keep i16 elements 1,5,9,13 of a
; <16 x i16> load and store them as <4 x i16>.
define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}
447
; Stride-4 deinterleave at offset 2: keep i16 elements 2,6,10,14 of a
; <16 x i16> load and store them as <4 x i16>.
define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0
; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}
535
536define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
537; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000538; AVX1: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000539; AVX1-NEXT: vmovdqa (%rdi), %ymm0
540; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
541; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
542; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
543; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
544; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
545; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
546; AVX1-NEXT: vmovq %xmm0, (%rsi)
547; AVX1-NEXT: vzeroupper
548; AVX1-NEXT: retq
549;
Simon Pilgrima50eec02017-12-20 13:12:34 +0000550; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
551; AVX2-SLOW: # %bb.0:
552; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
553; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
554; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
555; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
556; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
557; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
558; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
559; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
560; AVX2-SLOW-NEXT: vzeroupper
561; AVX2-SLOW-NEXT: retq
562;
563; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
564; AVX2-FAST: # %bb.0:
565; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
566; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
567; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
568; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
569; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
570; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
571; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
572; AVX2-FAST-NEXT: vzeroupper
573; AVX2-FAST-NEXT: retq
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000574;
575; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000576; AVX512F: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000577; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000578; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
579; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
580; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
581; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
582; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
583; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000584; AVX512F-NEXT: vmovq %xmm0, (%rsi)
585; AVX512F-NEXT: vzeroupper
586; AVX512F-NEXT: retq
587;
588; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000589; AVX512VL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000590; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
591; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
592; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
593; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
594; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
595; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
596; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
597; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000598; AVX512VL-NEXT: vzeroupper
599; AVX512VL-NEXT: retq
600;
601; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000602; AVX512BW: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000603; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000604; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
605; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
606; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
607; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
608; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
609; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000610; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
611; AVX512BW-NEXT: vzeroupper
612; AVX512BW-NEXT: retq
613;
614; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000615; AVX512BWVL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000616; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
617; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
618; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
619; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
620; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
621; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
622; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
623; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000624; AVX512BWVL-NEXT: vzeroupper
625; AVX512BWVL-NEXT: retq
626 %vec = load <16 x i16>, <16 x i16>* %L
627 %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
628 store <4 x i16> %strided.vec, <4 x i16>* %S
629 ret void
630}
631
; Strided extraction: keep bytes {1, 9, 17, 25} (stride 8, offset 1) of a
; <32 x i8> load and store the resulting <4 x i8>. The ";"-prefixed lines are
; FileCheck assertions autogenerated by update_llc_test_checks.py, one group per
; RUN-line feature set -- regenerate them rather than editing by hand.
632define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
633; AVX1-LABEL: shuffle_v32i8_to_v4i8_1:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000634; AVX1: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000635; AVX1-NEXT: vmovdqa (%rdi), %ymm0
636; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
637; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
638; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
639; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
640; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
641; AVX1-NEXT: vmovd %xmm0, (%rsi)
642; AVX1-NEXT: vzeroupper
643; AVX1-NEXT: retq
644;
645; AVX2-LABEL: shuffle_v32i8_to_v4i8_1:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000646; AVX2: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000647; AVX2-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000648; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
649; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
650; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
651; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
652; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000653; AVX2-NEXT: vmovd %xmm0, (%rsi)
654; AVX2-NEXT: vzeroupper
655; AVX2-NEXT: retq
656;
657; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000658; AVX512F: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000659; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000660; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
661; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
662; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
663; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
664; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000665; AVX512F-NEXT: vmovd %xmm0, (%rsi)
666; AVX512F-NEXT: vzeroupper
667; AVX512F-NEXT: retq
668;
669; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000670; AVX512VL: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000671; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000672; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
Simon Pilgrimd873b6f2017-12-19 16:54:07 +0000673; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
Wei Mi1736efd2017-10-12 00:24:52 +0000674; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
Wei Mi1736efd2017-10-12 00:24:52 +0000675; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
Wei Mi1736efd2017-10-12 00:24:52 +0000676; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
677; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000678; AVX512VL-NEXT: vzeroupper
679; AVX512VL-NEXT: retq
680;
681; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000682; AVX512BW: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000683; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000684; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
685; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
686; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
687; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
688; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000689; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
690; AVX512BW-NEXT: vzeroupper
691; AVX512BW-NEXT: retq
692;
693; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000694; AVX512BWVL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000695; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
696; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
Simon Pilgrimd873b6f2017-12-19 16:54:07 +0000697; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
Wei Mi1736efd2017-10-12 00:24:52 +0000698; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
Wei Mi1736efd2017-10-12 00:24:52 +0000699; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
Wei Mi1736efd2017-10-12 00:24:52 +0000700; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
701; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000702; AVX512BWVL-NEXT: vzeroupper
703; AVX512BWVL-NEXT: retq
; IR under test: load, strided shufflevector, narrow store.
704 %vec = load <32 x i8>, <32 x i8>* %L
705 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
706 store <4 x i8> %strided.vec, <4 x i8>* %S
707 ret void
708}
709
; Strided extraction: keep bytes {2, 10, 18, 26} (stride 8, offset 2) of a
; <32 x i8> load and store the resulting <4 x i8>. The ";"-prefixed lines are
; autogenerated FileCheck assertions (update_llc_test_checks.py); regenerate
; instead of hand-editing.
710define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
711; AVX1-LABEL: shuffle_v32i8_to_v4i8_2:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000712; AVX1: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000713; AVX1-NEXT: vmovdqa (%rdi), %ymm0
714; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
715; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
716; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
717; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
718; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
719; AVX1-NEXT: vmovd %xmm0, (%rsi)
720; AVX1-NEXT: vzeroupper
721; AVX1-NEXT: retq
722;
723; AVX2-LABEL: shuffle_v32i8_to_v4i8_2:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000724; AVX2: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000725; AVX2-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000726; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
727; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
728; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
729; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
730; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000731; AVX2-NEXT: vmovd %xmm0, (%rsi)
732; AVX2-NEXT: vzeroupper
733; AVX2-NEXT: retq
734;
735; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000736; AVX512F: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000737; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000738; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
739; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
740; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
741; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
742; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000743; AVX512F-NEXT: vmovd %xmm0, (%rsi)
744; AVX512F-NEXT: vzeroupper
745; AVX512F-NEXT: retq
746;
747; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000748; AVX512VL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000749; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
750; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
751; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
752; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
753; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
754; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
755; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
756; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000757; AVX512VL-NEXT: vzeroupper
758; AVX512VL-NEXT: retq
759;
760; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000761; AVX512BW: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000762; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000763; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
764; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
765; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
766; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
767; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000768; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
769; AVX512BW-NEXT: vzeroupper
770; AVX512BW-NEXT: retq
771;
772; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000773; AVX512BWVL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000774; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
775; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
776; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
777; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
778; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
779; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
780; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
781; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000782; AVX512BWVL-NEXT: vzeroupper
783; AVX512BWVL-NEXT: retq
; IR under test: load, strided shufflevector, narrow store.
784 %vec = load <32 x i8>, <32 x i8>* %L
785 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
786 store <4 x i8> %strided.vec, <4 x i8>* %S
787 ret void
788}
789
; Strided extraction: keep bytes {3, 11, 19, 27} (stride 8, offset 3) of a
; <32 x i8> load and store the resulting <4 x i8>. The ";"-prefixed lines are
; autogenerated FileCheck assertions (update_llc_test_checks.py); regenerate
; instead of hand-editing.
790define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
791; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000792; AVX1: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000793; AVX1-NEXT: vmovdqa (%rdi), %ymm0
794; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
795; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
796; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
797; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
798; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
799; AVX1-NEXT: vmovd %xmm0, (%rsi)
800; AVX1-NEXT: vzeroupper
801; AVX1-NEXT: retq
802;
803; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000804; AVX2: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000805; AVX2-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000806; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
807; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
808; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
809; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
810; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000811; AVX2-NEXT: vmovd %xmm0, (%rsi)
812; AVX2-NEXT: vzeroupper
813; AVX2-NEXT: retq
814;
815; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000816; AVX512F: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000817; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000818; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
819; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
820; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
821; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
822; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000823; AVX512F-NEXT: vmovd %xmm0, (%rsi)
824; AVX512F-NEXT: vzeroupper
825; AVX512F-NEXT: retq
826;
827; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000828; AVX512VL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000829; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
830; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
Simon Pilgrimd873b6f2017-12-19 16:54:07 +0000831; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
Wei Mi1736efd2017-10-12 00:24:52 +0000832; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
Wei Mi1736efd2017-10-12 00:24:52 +0000833; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
Wei Mi1736efd2017-10-12 00:24:52 +0000834; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
835; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000836; AVX512VL-NEXT: vzeroupper
837; AVX512VL-NEXT: retq
838;
839; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000840; AVX512BW: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000841; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000842; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
843; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
844; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
845; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
846; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000847; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
848; AVX512BW-NEXT: vzeroupper
849; AVX512BW-NEXT: retq
850;
851; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000852; AVX512BWVL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000853; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
854; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
Simon Pilgrimd873b6f2017-12-19 16:54:07 +0000855; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
Wei Mi1736efd2017-10-12 00:24:52 +0000856; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
Wei Mi1736efd2017-10-12 00:24:52 +0000857; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
Wei Mi1736efd2017-10-12 00:24:52 +0000858; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
859; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000860; AVX512BWVL-NEXT: vzeroupper
861; AVX512BWVL-NEXT: retq
; IR under test: load, strided shufflevector, narrow store.
862 %vec = load <32 x i8>, <32 x i8>* %L
863 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
864 store <4 x i8> %strided.vec, <4 x i8>* %S
865 ret void
866}
867
; Strided extraction: keep bytes {4, 12, 20, 28} (stride 8, offset 4) of a
; <32 x i8> load and store the resulting <4 x i8>. The ";"-prefixed lines are
; autogenerated FileCheck assertions (update_llc_test_checks.py); regenerate
; instead of hand-editing.
868define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind {
869; AVX1-LABEL: shuffle_v32i8_to_v4i8_4:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000870; AVX1: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000871; AVX1-NEXT: vmovdqa (%rdi), %ymm0
872; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
873; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
874; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
875; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
876; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
877; AVX1-NEXT: vmovd %xmm0, (%rsi)
878; AVX1-NEXT: vzeroupper
879; AVX1-NEXT: retq
880;
881; AVX2-LABEL: shuffle_v32i8_to_v4i8_4:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000882; AVX2: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000883; AVX2-NEXT: vmovdqa (%rdi), %ymm0
884; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
885; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
886; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
887; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
888; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000889; AVX2-NEXT: vmovd %xmm0, (%rsi)
890; AVX2-NEXT: vzeroupper
891; AVX2-NEXT: retq
892;
893; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000894; AVX512F: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000895; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
896; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
897; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
898; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
899; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
900; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000901; AVX512F-NEXT: vmovd %xmm0, (%rsi)
902; AVX512F-NEXT: vzeroupper
903; AVX512F-NEXT: retq
904;
905; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000906; AVX512VL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000907; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
908; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
909; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
910; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000911; AVX512VL-NEXT: vzeroupper
912; AVX512VL-NEXT: retq
913;
914; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000915; AVX512BW: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000916; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
917; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
918; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
919; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
920; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
921; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000922; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
923; AVX512BW-NEXT: vzeroupper
924; AVX512BW-NEXT: retq
925;
926; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000927; AVX512BWVL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000928; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0
929; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1
930; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
931; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000932; AVX512BWVL-NEXT: vzeroupper
933; AVX512BWVL-NEXT: retq
; IR under test: load, strided shufflevector, narrow store.
934 %vec = load <32 x i8>, <32 x i8>* %L
935 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
936 store <4 x i8> %strided.vec, <4 x i8>* %S
937 ret void
938}
939
; Strided extraction: keep bytes {5, 13, 21, 29} (stride 8, offset 5) of a
; <32 x i8> load and store the resulting <4 x i8>. The ";"-prefixed lines are
; autogenerated FileCheck assertions (update_llc_test_checks.py); regenerate
; instead of hand-editing.
940define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
941; AVX1-LABEL: shuffle_v32i8_to_v4i8_5:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000942; AVX1: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000943; AVX1-NEXT: vmovdqa (%rdi), %ymm0
944; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
945; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
946; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
947; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
948; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
949; AVX1-NEXT: vmovd %xmm0, (%rsi)
950; AVX1-NEXT: vzeroupper
951; AVX1-NEXT: retq
952;
953; AVX2-LABEL: shuffle_v32i8_to_v4i8_5:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000954; AVX2: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000955; AVX2-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000956; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
957; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
958; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
959; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
960; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000961; AVX2-NEXT: vmovd %xmm0, (%rsi)
962; AVX2-NEXT: vzeroupper
963; AVX2-NEXT: retq
964;
965; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000966; AVX512F: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000967; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000968; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
969; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
970; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
971; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
972; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000973; AVX512F-NEXT: vmovd %xmm0, (%rsi)
974; AVX512F-NEXT: vzeroupper
975; AVX512F-NEXT: retq
976;
977; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000978; AVX512VL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +0000979; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
980; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
Simon Pilgrimd873b6f2017-12-19 16:54:07 +0000981; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
982; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
983; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
Wei Mi1736efd2017-10-12 00:24:52 +0000984; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
985; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000986; AVX512VL-NEXT: vzeroupper
987; AVX512VL-NEXT: retq
988;
989; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000990; AVX512BW: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000991; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +0000992; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
993; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
994; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
995; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
996; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000997; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
998; AVX512BW-NEXT: vzeroupper
999; AVX512BW-NEXT: retq
1000;
1001; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001002; AVX512BWVL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +00001003; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
1004; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
Simon Pilgrimd873b6f2017-12-19 16:54:07 +00001005; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
1006; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1007; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
Wei Mi1736efd2017-10-12 00:24:52 +00001008; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1009; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001010; AVX512BWVL-NEXT: vzeroupper
1011; AVX512BWVL-NEXT: retq
; IR under test: load, strided shufflevector, narrow store.
1012 %vec = load <32 x i8>, <32 x i8>* %L
1013 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
1014 store <4 x i8> %strided.vec, <4 x i8>* %S
1015 ret void
1016}
1017
; Strided extraction: keep bytes {6, 14, 22, 30} (stride 8, offset 6) of a
; <32 x i8> load and store the resulting <4 x i8>. The ";"-prefixed lines are
; autogenerated FileCheck assertions (update_llc_test_checks.py); regenerate
; instead of hand-editing.
1018define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
1019; AVX1-LABEL: shuffle_v32i8_to_v4i8_6:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001020; AVX1: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001021; AVX1-NEXT: vmovdqa (%rdi), %ymm0
1022; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1023; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1024; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1025; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1026; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1027; AVX1-NEXT: vmovd %xmm0, (%rsi)
1028; AVX1-NEXT: vzeroupper
1029; AVX1-NEXT: retq
1030;
1031; AVX2-LABEL: shuffle_v32i8_to_v4i8_6:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001032; AVX2: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001033; AVX2-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +00001034; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1035; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1036; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1037; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1038; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001039; AVX2-NEXT: vmovd %xmm0, (%rsi)
1040; AVX2-NEXT: vzeroupper
1041; AVX2-NEXT: retq
1042;
1043; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001044; AVX512F: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001045; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +00001046; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
1047; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1048; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1049; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1050; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001051; AVX512F-NEXT: vmovd %xmm0, (%rsi)
1052; AVX512F-NEXT: vzeroupper
1053; AVX512F-NEXT: retq
1054;
1055; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001056; AVX512VL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +00001057; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
1058; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
1059; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
1060; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
1061; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
1062; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
1063; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1064; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001065; AVX512VL-NEXT: vzeroupper
1066; AVX512VL-NEXT: retq
1067;
1068; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001069; AVX512BW: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001070; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +00001071; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
1072; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1073; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1074; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1075; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001076; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
1077; AVX512BW-NEXT: vzeroupper
1078; AVX512BW-NEXT: retq
1079;
1080; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001081; AVX512BWVL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +00001082; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
1083; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
1084; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
1085; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
1086; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
1087; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
1088; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1089; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001090; AVX512BWVL-NEXT: vzeroupper
1091; AVX512BWVL-NEXT: retq
; IR under test: load, strided shufflevector, narrow store.
1092 %vec = load <32 x i8>, <32 x i8>* %L
1093 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
1094 store <4 x i8> %strided.vec, <4 x i8>* %S
1095 ret void
1096}
1097
; Strided extraction: keep bytes {7, 15, 23, 31} (stride 8, offset 7) of a
; <32 x i8> load and store the resulting <4 x i8>. The ";"-prefixed lines are
; autogenerated FileCheck assertions (update_llc_test_checks.py); regenerate
; instead of hand-editing.
1098define void @shuffle_v32i8_to_v4i8_7(<32 x i8>* %L, <4 x i8>* %S) nounwind {
1099; AVX1-LABEL: shuffle_v32i8_to_v4i8_7:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001100; AVX1: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001101; AVX1-NEXT: vmovdqa (%rdi), %ymm0
1102; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1103; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1104; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1105; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1106; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1107; AVX1-NEXT: vmovd %xmm0, (%rsi)
1108; AVX1-NEXT: vzeroupper
1109; AVX1-NEXT: retq
1110;
1111; AVX2-LABEL: shuffle_v32i8_to_v4i8_7:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001112; AVX2: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001113; AVX2-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +00001114; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1115; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1116; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1117; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1118; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001119; AVX2-NEXT: vmovd %xmm0, (%rsi)
1120; AVX2-NEXT: vzeroupper
1121; AVX2-NEXT: retq
1122;
1123; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001124; AVX512F: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001125; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +00001126; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
1127; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1128; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1129; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1130; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001131; AVX512F-NEXT: vmovd %xmm0, (%rsi)
1132; AVX512F-NEXT: vzeroupper
1133; AVX512F-NEXT: retq
1134;
1135; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001136; AVX512VL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +00001137; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
1138; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
1139; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
1140; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1141; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1142; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1143; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001144; AVX512VL-NEXT: vzeroupper
1145; AVX512VL-NEXT: retq
1146;
1147; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001148; AVX512BW: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001149; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
Wei Mi1736efd2017-10-12 00:24:52 +00001150; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
1151; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
1152; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1153; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1154; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001155; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
1156; AVX512BW-NEXT: vzeroupper
1157; AVX512BW-NEXT: retq
1158;
1159; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001160; AVX512BWVL: # %bb.0:
Wei Mi1736efd2017-10-12 00:24:52 +00001161; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
1162; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
1163; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
1164; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1165; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1166; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1167; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
Zvi Rackovereac8e7c2017-07-20 21:03:36 +00001168; AVX512BWVL-NEXT: vzeroupper
1169; AVX512BWVL-NEXT: retq
; IR under test: load, strided shufflevector, narrow store.
1170 %vec = load <32 x i8>, <32 x i8>* %L
1171 %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
1172 store <4 x i8> %strided.vec, <4 x i8>* %S
1173 ret void
1174}
1175