; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
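; Each test below loads a 256-bit vector, extracts a strided subset of its
; elements with a shufflevector, and stores the narrowed result through the
; second pointer argument.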
define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}
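; Odd-index i16 elements of a <16 x i16> load, stored as <8 x i16>.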
define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}
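; Odd-index i32 elements of an <8 x i32> load, stored as <4 x i32>.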
define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT: vmovaps %xmm0, (%rsi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps (%rdi), %ymm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512-NEXT: vmovaps %xmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}
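; Every fourth byte of a <32 x i8> load starting at index 1, stored as <8 x i8>.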
define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,5,5,9,9,13,13,13,13,5,5,12,12,13,13]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
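; Every fourth byte starting at index 2, stored as <8 x i8>.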
define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
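; Every fourth byte starting at index 3, stored as <8 x i8>.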
define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,7,7,11,11,15,15,7,7,15,15,6,6,7,7]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
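; Every fourth i16 element starting at index 1, stored as <4 x i16>.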
define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}
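; Every fourth i16 element starting at index 2, stored as <4 x i16>.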
define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0
; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}
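; Every fourth i16 element starting at index 3, stored as <4 x i16>.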
define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}
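; Every eighth byte starting at index 1, stored as <4 x i8>.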
define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}
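; Every eighth byte starting at index 2, stored as <4 x i8>.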
define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}
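; Every eighth byte starting at index 3, stored as <4 x i8>.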
define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}
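; Every eighth byte starting at index 4, stored as <4 x i8>.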
define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0
; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}
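; Every eighth byte starting at index 5, stored as <4 x i8>.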
931define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
932; AVX1-LABEL: shuffle_v32i8_to_v4i8_5:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000933; AVX1: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000934; AVX1-NEXT: vmovdqa (%rdi), %ymm0
935; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
936; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
937; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
938; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
939; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
940; AVX1-NEXT: vmovd %xmm0, (%rsi)
941; AVX1-NEXT: vzeroupper
942; AVX1-NEXT: retq
943;
944; AVX2-LABEL: shuffle_v32i8_to_v4i8_5:
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000945; AVX2: # %bb.0:
Zvi Rackovereac8e7c2017-07-20 21:03:36 +0000946; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

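; Extract every eighth byte starting at offset 6 (elements 6, 14, 22 and 30) from a
; 32-byte vector and store the result as <4 x i8>.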
define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

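; Extract every eighth byte starting at offset 7 (elements 7, 15, 23 and 31) from a
; 32-byte vector and store the result as <4 x i8>.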
define void @shuffle_v32i8_to_v4i8_7(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}
