; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2

; Just one 32-bit run to make sure we do reasonable things for i64 cases.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2
declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)

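; A reminder of the llvm.fshr semantics exercised below (per the LangRef):
; fshr(x, y, amt) concatenates x as the most significant half and y as the
; least significant half, shifts the double-width value right by amt modulo
; the element bit width, and returns the least significant half. A worked
; i8 example:
;   fshr(i8 0x12, i8 0x34, 4) = trunc(0x1234 >> 4) = 0x23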
;
; Variable Shifts
;

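; Targets without a native funnel shift expand each element roughly as the
; hand-written scalar sketch below (shown for the i64 lanes; this is an
; illustrative sketch, not IR from this file). The final select guards the
; amt == 0 case, where the complementary left shift by the full bit width
; would otherwise be required:
;   %n   = and i64 %amt, 63
;   %lo  = lshr i64 %y, %n
;   %hi  = shl i64 %x, (64 - %n)
;   %or  = or i64 %hi, %lo
;   %res = select (icmp eq i64 %n, 0), i64 %y, i64 %or
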
define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrlq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrlq %xmm4, %xmm5
; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [64,64]
; SSE2-NEXT: psubq %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psllq %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE2-NEXT: psllq %xmm3, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT: orpd %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlq %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrlq %xmm4, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [64,64]
; SSE41-NEXT: psubq %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: psllq %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: psllq %xmm0, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT: por %xmm5, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpeqq %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512F-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512VL-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512BW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: var_funnnel_v2i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm4
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [64,64]
; XOPAVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm5
; XOPAVX1-NEXT: vpshlq %xmm5, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v2i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; XOPAVX2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: psrlq %xmm2, %xmm3
; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrlq %xmm4, %xmm5
; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0]
; X32-SSE-NEXT: psubq %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psllq %xmm3, %xmm4
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; X32-SSE-NEXT: psllq %xmm3, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; X32-SSE-NEXT: orpd %xmm5, %xmm0
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm3
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
; X32-SSE-NEXT: pand %xmm3, %xmm2
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: por %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
  ret <2 x i64> %res
}

define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrld %xmm5, %xmm3
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: psrld %xmm5, %xmm6
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrld %xmm4, %xmm5
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[0,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32,32,32,32]
; SSE2-NEXT: psubd %xmm2, %xmm4
; SSE2-NEXT: pslld $23, %xmm4
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm4
; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm5, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
; SSE2-NEXT: por %xmm3, %xmm6
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pandn %xmm6, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psrld %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm6
; SSE41-NEXT: psrld %xmm5, %xmm6
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psrld %xmm0, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
; SSE41-NEXT: psubd %xmm2, %xmm0
; SSE41-NEXT: pslld $23, %xmm0
; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
; SSE41-NEXT: pmulld %xmm0, %xmm3
; SSE41-NEXT: por %xmm4, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm5, %xmm5
; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
; AVX1-NEXT: vpmulld %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512F-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512VL-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512BW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: var_funnnel_v4i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm4
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
; XOPAVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5
; XOPAVX1-NEXT: vpshld %xmm5, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v4i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; XOPAVX2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v4i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm1, %xmm4
; X32-SSE-NEXT: psrld %xmm3, %xmm4
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: psrld %xmm5, %xmm3
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm1, %xmm6
; X32-SSE-NEXT: psrld %xmm5, %xmm6
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrld %xmm4, %xmm5
; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
; X32-SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[0,3]
; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [32,32,32,32]
; X32-SSE-NEXT: psubd %xmm2, %xmm4
; X32-SSE-NEXT: pslld $23, %xmm4
; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: cvttps2dq %xmm4, %xmm4
; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm4, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm5, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
; X32-SSE-NEXT: por %xmm3, %xmm6
; X32-SSE-NEXT: pxor %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm0
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: pandn %xmm6, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
  ret <4 x i32> %res
}

define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psllw $12, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: psraw $15, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrlw $8, %xmm5
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm5, %xmm3
; SSE2-NEXT: paddw %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: psraw $15, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pandn %xmm3, %xmm6
; SSE2-NEXT: psrlw $4, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: paddw %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: psraw $15, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pandn %xmm3, %xmm6
; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: paddw %xmm4, %xmm4
; SSE2-NEXT: psraw $15, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pandn %xmm3, %xmm5
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; SSE2-NEXT: psubw %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: movdqa %xmm4, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; SSE2-NEXT: pslld $23, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
; SSE2-NEXT: paddd %xmm6, %xmm7
; SSE2-NEXT: cvttps2dq %xmm7, %xmm7
; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
; SSE2-NEXT: pslld $23, %xmm4
; SSE2-NEXT: paddd %xmm6, %xmm4
; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
; SSE2-NEXT: pmullw %xmm0, %xmm4
; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: pcmpeqw %xmm8, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm4, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; SSE41-NEXT: psubw %xmm2, %xmm5
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; SSE41-NEXT: pcmpeqw %xmm2, %xmm4
; SSE41-NEXT: psllw $12, %xmm2
; SSE41-NEXT: psllw $4, %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: paddw %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm7
; SSE41-NEXT: psrlw $8, %xmm7
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm7
; SSE41-NEXT: psrlw $4, %xmm7
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm7
; SSE41-NEXT: psrlw $2, %xmm7
; SSE41-NEXT: paddw %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm7
; SSE41-NEXT: psrlw $1, %xmm7
; SSE41-NEXT: paddw %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
; SSE41-NEXT: pslld $23, %xmm5
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1065353216,1065353216,1065353216,1065353216]
; SSE41-NEXT: paddd %xmm0, %xmm5
; SSE41-NEXT: cvttps2dq %xmm5, %xmm2
; SSE41-NEXT: pslld $23, %xmm6
; SSE41-NEXT: paddd %xmm0, %xmm6
; SSE41-NEXT: cvttps2dq %xmm6, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: pmullw %xmm0, %xmm8
; SSE41-NEXT: por %xmm3, %xmm8
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm8
; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm4
; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm5
; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm1, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm5
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm5
; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm5
; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT: vpackusdw %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpsubw %xmm2, %xmm5, %xmm5
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512F-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512F-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VL-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm5
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpsllvw %xmm4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; XOP-LABEL: var_funnnel_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm4
; XOP-NEXT: vpshlw %xmm4, %xmm1, %xmm4
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; XOP-NEXT: vpsubw %xmm2, %xmm5, %xmm5
; XOP-NEXT: vpshlw %xmm5, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm4, %xmm0, %xmm0
; XOP-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v8i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: subl $28, %esp
; X32-SSE-NEXT: movups %xmm0, (%esp) # 16-byte Spill
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm4
; X32-SSE-NEXT: psllw $12, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm3
; X32-SSE-NEXT: psraw $15, %xmm3
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrlw $8, %xmm5
; X32-SSE-NEXT: pand %xmm3, %xmm5
; X32-SSE-NEXT: pandn %xmm1, %xmm3
; X32-SSE-NEXT: por %xmm5, %xmm3
; X32-SSE-NEXT: paddw %xmm4, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: psraw $15, %xmm5
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm3, %xmm6
; X32-SSE-NEXT: psrlw $4, %xmm3
; X32-SSE-NEXT: pand %xmm5, %xmm3
; X32-SSE-NEXT: por %xmm6, %xmm3
; X32-SSE-NEXT: paddw %xmm4, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: psraw $15, %xmm5
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm3, %xmm6
; X32-SSE-NEXT: psrlw $2, %xmm3
; X32-SSE-NEXT: pand %xmm5, %xmm3
; X32-SSE-NEXT: por %xmm6, %xmm3
; X32-SSE-NEXT: paddw %xmm4, %xmm4
; X32-SSE-NEXT: psraw $15, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: pandn %xmm3, %xmm5
; X32-SSE-NEXT: psrlw $1, %xmm3
; X32-SSE-NEXT: pand %xmm4, %xmm3
; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; X32-SSE-NEXT: psubw %xmm2, %xmm4
; X32-SSE-NEXT: pxor %xmm6, %xmm6
; X32-SSE-NEXT: movdqa %xmm4, %xmm7
; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; X32-SSE-NEXT: pslld $23, %xmm7
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [1065353216,1065353216,1065353216,1065353216]
; X32-SSE-NEXT: paddd %xmm0, %xmm7
; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
; X32-SSE-NEXT: pslld $23, %xmm4
; X32-SSE-NEXT: paddd %xmm0, %xmm4
; X32-SSE-NEXT: cvttps2dq %xmm7, %xmm0
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: cvttps2dq %xmm4, %xmm4
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; X32-SSE-NEXT: movdqu (%esp), %xmm0 # 16-byte Reload
; X32-SSE-NEXT: pmullw %xmm0, %xmm4
; X32-SSE-NEXT: por %xmm5, %xmm4
; X32-SSE-NEXT: por %xmm3, %xmm4
; X32-SSE-NEXT: pcmpeqw %xmm6, %xmm2
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm4, %xmm2
; X32-SSE-NEXT: por %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm0
; X32-SSE-NEXT: addl $28, %esp
; X32-SSE-NEXT: retl
  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
  ret <8 x i16> %res
}

define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: psllw $5, %xmm5
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrlw $4, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pandn %xmm1, %xmm6
; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
; SSE2-NEXT: por %xmm6, %xmm4
; SSE2-NEXT: paddb %xmm5, %xmm5
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pandn %xmm4, %xmm7
; SSE2-NEXT: psrlw $2, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
; SSE2-NEXT: por %xmm7, %xmm4
; SSE2-NEXT: paddb %xmm5, %xmm5
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm5
; SSE2-NEXT: pandn %xmm4, %xmm5
; SSE2-NEXT: psrlw $1, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE2-NEXT: psubb %xmm2, %xmm5
; SSE2-NEXT: psllw $5, %xmm5
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pandn %xmm0, %xmm7
; SSE2-NEXT: psllw $4, %xmm0
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: paddb %xmm5, %xmm5
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm7
; SSE2-NEXT: pandn %xmm0, %xmm7
; SSE2-NEXT: psllw $2, %xmm0
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: paddb %xmm5, %xmm5
; SSE2-NEXT: pcmpeqb %xmm3, %xmm2
; SSE2-NEXT: pcmpgtb %xmm5, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: pandn %xmm0, %xmm5
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE41-NEXT: psubb %xmm2, %xmm4
; SSE41-NEXT: pxor %xmm5, %xmm5
; SSE41-NEXT: pcmpeqb %xmm2, %xmm5
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psllw $5, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $4, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm6
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
; SSE41-NEXT: movdqa %xmm6, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
; SSE41-NEXT: movdqa %xmm6, %xmm2
; SSE41-NEXT: psrlw $1, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
; SSE41-NEXT: psllw $5, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm2
; SSE41-NEXT: paddb %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm7
; SSE41-NEXT: psllw $4, %xmm7
; SSE41-NEXT: pand {{.*}}(%rip), %xmm7
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: psllw $2, %xmm4
; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: paddb %xmm3, %xmm4
; SSE41-NEXT: paddb %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
; SSE41-NEXT: por %xmm6, %xmm3
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_funnnel_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpsllw $5, %xmm2, %xmm3
; AVX-NEXT: vpsrlw $4, %xmm1, %xmm4
; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
; AVX-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm4
; AVX-NEXT: vpsrlw $2, %xmm4, %xmm5
; AVX-NEXT: vpand {{.*}}(%rip), %xmm5, %xmm5
; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
; AVX-NEXT: vpsrlw $1, %xmm4, %xmm5
; AVX-NEXT: vpand {{.*}}(%rip), %xmm5, %xmm5
; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX-NEXT: vpsllw $5, %xmm4, %xmm4
; AVX-NEXT: vpaddb %xmm4, %xmm4, %xmm5
; AVX-NEXT: vpsllw $4, %xmm0, %xmm6
; AVX-NEXT: vpand {{.*}}(%rip), %xmm6, %xmm6
; AVX-NEXT: vpblendvb %xmm4, %xmm6, %xmm0, %xmm0
; AVX-NEXT: vpsllw $2, %xmm0, %xmm4
; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm4
; AVX-NEXT: vpaddb %xmm5, %xmm5, %xmm5
; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512BW-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; XOP-LABEL: var_funnnel_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOP-NEXT: vpsubb %xmm2, %xmm3, %xmm4
; XOP-NEXT: vpshlb %xmm4, %xmm1, %xmm4
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; XOP-NEXT: vpsubb %xmm2, %xmm5, %xmm5
; XOP-NEXT: vpshlb %xmm5, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm4, %xmm0, %xmm0
; XOP-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm5
; X32-SSE-NEXT: psllw $5, %xmm5
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pxor %xmm6, %xmm6
; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
; X32-SSE-NEXT: movdqa %xmm1, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm4
; X32-SSE-NEXT: pand %xmm6, %xmm4
; X32-SSE-NEXT: pandn %xmm1, %xmm6
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: por %xmm6, %xmm4
; X32-SSE-NEXT: paddb %xmm5, %xmm5
; X32-SSE-NEXT: pxor %xmm6, %xmm6
; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
; X32-SSE-NEXT: movdqa %xmm6, %xmm7
; X32-SSE-NEXT: pandn %xmm4, %xmm7
; X32-SSE-NEXT: psrlw $2, %xmm4
; X32-SSE-NEXT: pand %xmm6, %xmm4
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: por %xmm7, %xmm4
; X32-SSE-NEXT: paddb %xmm5, %xmm5
; X32-SSE-NEXT: pxor %xmm6, %xmm6
; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
; X32-SSE-NEXT: movdqa %xmm6, %xmm5
; X32-SSE-NEXT: pandn %xmm4, %xmm5
; X32-SSE-NEXT: psrlw $1, %xmm4
; X32-SSE-NEXT: pand %xmm6, %xmm4
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: por %xmm5, %xmm4
; X32-SSE-NEXT: movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; X32-SSE-NEXT: psubb %xmm2, %xmm5
; X32-SSE-NEXT: psllw $5, %xmm5
; X32-SSE-NEXT: pxor %xmm6, %xmm6
; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
; X32-SSE-NEXT: movdqa %xmm6, %xmm7
; X32-SSE-NEXT: pandn %xmm0, %xmm7
; X32-SSE-NEXT: psllw $4, %xmm0
; X32-SSE-NEXT: pand %xmm6, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: por %xmm7, %xmm0
; X32-SSE-NEXT: paddb %xmm5, %xmm5
; X32-SSE-NEXT: pxor %xmm6, %xmm6
; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
; X32-SSE-NEXT: movdqa %xmm6, %xmm7
; X32-SSE-NEXT: pandn %xmm0, %xmm7
; X32-SSE-NEXT: psllw $2, %xmm0
; X32-SSE-NEXT: pand %xmm6, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: por %xmm7, %xmm0
; X32-SSE-NEXT: paddb %xmm5, %xmm5
; X32-SSE-NEXT: pcmpeqb %xmm3, %xmm2
; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm3
; X32-SSE-NEXT: movdqa %xmm3, %xmm5
; X32-SSE-NEXT: pandn %xmm0, %xmm5
; X32-SSE-NEXT: por %xmm4, %xmm5
; X32-SSE-NEXT: paddb %xmm0, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm5, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: por %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
  ret <16 x i8> %res
}

;
; Uniform Variable Shifts
;

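; The splatvar tests below use a shift amount broadcast from element 0, so
; the expansion can splat it once (pshufd / vpbroadcastq) and then feed the
; legacy shift instructions, which take a single count in the low quadword,
; rather than per-element variable shifts.
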
define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrlq %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [64,64]
; SSE2-NEXT: psubq %xmm2, %xmm4
; SSE2-NEXT: psllq %xmm4, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlq %xmm2, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [64,64]
; SSE41-NEXT: psubq %xmm2, %xmm4
; SSE41-NEXT: psllq %xmm4, %xmm3
; SSE41-NEXT: por %xmm0, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpeqq %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512F-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512VL-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512BW-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; XOPAVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm2
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; XOPAVX2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_funnnel_v2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: psrlq %xmm2, %xmm3
; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrlq %xmm4, %xmm5
; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0]
; X32-SSE-NEXT: psubq %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psllq %xmm3, %xmm4
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; X32-SSE-NEXT: psllq %xmm3, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; X32-SSE-NEXT: orpd %xmm5, %xmm0
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm3
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
1291; X32-SSE-NEXT: pand %xmm3, %xmm2
1292; X32-SSE-NEXT: pand %xmm2, %xmm1
1293; X32-SSE-NEXT: pandn %xmm0, %xmm2
1294; X32-SSE-NEXT: por %xmm1, %xmm2
1295; X32-SSE-NEXT: movdqa %xmm2, %xmm0
1296; X32-SSE-NEXT: retl
1297 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
1298 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
1299 ret <2 x i64> %res
1300}
1301
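; Splatted variable amounts let the backend treat the funnel as a single
; scalarized shift pair: mask the splat, shift %y right and %x left by the
; complementary amount, then blend %y back in when the masked amount is zero
; (fshr by 0 must return %y, and a full-width shift would be undefined).
; A scalar i32 reference for this expansion (a sketch of the generic fshr
; lowering, not part of the checked output):
;   %m   = and i32 %amt, 31
;   %lo  = lshr i32 %y, %m
;   %inv = sub i32 32, %m
;   %hi  = shl i32 %x, %inv              ; out of range when %m == 0 ...
;   %or  = or i32 %hi, %lo
;   %z   = icmp eq i32 %m, 0
;   %res = select i1 %z, i32 %y, i32 %or ; ... which this select guards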
1302define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
1303; SSE2-LABEL: splatvar_funnnel_v4i32:
1304; SSE2: # %bb.0:
1305; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1306; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
1307; SSE2-NEXT: pxor %xmm3, %xmm3
1308; SSE2-NEXT: xorps %xmm4, %xmm4
1309; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
1310; SSE2-NEXT: movdqa %xmm1, %xmm5
1311; SSE2-NEXT: psrld %xmm4, %xmm5
1312; SSE2-NEXT: movd %xmm2, %eax
1313; SSE2-NEXT: movl $32, %ecx
1314; SSE2-NEXT: subl %eax, %ecx
1315; SSE2-NEXT: movd %ecx, %xmm4
1316; SSE2-NEXT: pslld %xmm4, %xmm0
1317; SSE2-NEXT: por %xmm5, %xmm0
1318; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
Simon Pilgrim46b90e82018-12-18 10:08:23 +00001319; SSE2-NEXT: pand %xmm2, %xmm1
1320; SSE2-NEXT: pandn %xmm0, %xmm2
1321; SSE2-NEXT: por %xmm1, %xmm2
1322; SSE2-NEXT: movdqa %xmm2, %xmm0
1323; SSE2-NEXT: retq
1324;
1325; SSE41-LABEL: splatvar_funnnel_v4i32:
1326; SSE41: # %bb.0:
1327; SSE41-NEXT: movdqa %xmm0, %xmm3
1328; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1329; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
1330; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
1331; SSE41-NEXT: movdqa %xmm1, %xmm4
1332; SSE41-NEXT: psrld %xmm0, %xmm4
1333; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
1334; SSE41-NEXT: psubd %xmm2, %xmm0
1335; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1336; SSE41-NEXT: pslld %xmm0, %xmm3
1337; SSE41-NEXT: por %xmm4, %xmm3
1338; SSE41-NEXT: pxor %xmm0, %xmm0
1339; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
1340; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
1341; SSE41-NEXT: movaps %xmm3, %xmm0
1342; SSE41-NEXT: retq
1343;
1344; AVX1-LABEL: splatvar_funnnel_v4i32:
1345; AVX1: # %bb.0:
1346; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1347; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1348; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1349; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1350; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
1351; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1352; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1353; AVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0
1354; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1355; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1356; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
1357; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1358; AVX1-NEXT: retq
1359;
1360; AVX2-LABEL: splatvar_funnnel_v4i32:
1361; AVX2: # %bb.0:
1362; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2
1363; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1364; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1365; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1366; AVX2-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1367; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
1368; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1369; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1370; AVX2-NEXT: vpslld %xmm4, %xmm0, %xmm0
1371; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1372; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1373; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
1374; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1375; AVX2-NEXT: retq
1376;
1377; AVX512F-LABEL: splatvar_funnnel_v4i32:
1378; AVX512F: # %bb.0:
1379; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1380; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2
1381; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1382; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
1383; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
1384; AVX512F-NEXT: vpsrld %xmm5, %xmm1, %xmm5
1385; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
1386; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
1387; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1388; AVX512F-NEXT: vpslld %xmm4, %xmm0, %xmm0
1389; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
1390; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
1391; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1392; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1393; AVX512F-NEXT: vzeroupper
1394; AVX512F-NEXT: retq
1395;
1396; AVX512VL-LABEL: splatvar_funnnel_v4i32:
1397; AVX512VL: # %bb.0:
1398; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2
1399; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1400; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
1401; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
1402; AVX512VL-NEXT: vpsrld %xmm5, %xmm1, %xmm5
1403; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
1404; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
1405; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1406; AVX512VL-NEXT: vpslld %xmm4, %xmm0, %xmm0
1407; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
1408; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1
1409; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
1410; AVX512VL-NEXT: retq
1411;
1412; AVX512BW-LABEL: splatvar_funnnel_v4i32:
1413; AVX512BW: # %bb.0:
1414; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1415; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2
1416; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1417; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1418; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
1419; AVX512BW-NEXT: vpsrld %xmm5, %xmm1, %xmm5
1420; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
1421; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
1422; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1423; AVX512BW-NEXT: vpslld %xmm4, %xmm0, %xmm0
1424; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
1425; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
1426; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1427; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1428; AVX512BW-NEXT: vzeroupper
1429; AVX512BW-NEXT: retq
1430;
1431; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
1432; AVX512VLBW: # %bb.0:
1433; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2
1434; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1435; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1436; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
1437; AVX512VLBW-NEXT: vpsrld %xmm5, %xmm1, %xmm5
1438; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
1439; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
1440; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1441; AVX512VLBW-NEXT: vpslld %xmm4, %xmm0, %xmm0
1442; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
1443; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1
1444; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
1445; AVX512VLBW-NEXT: retq
1446;
1447; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
1448; XOPAVX1: # %bb.0:
1449; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1450; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1451; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1452; XOPAVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1453; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
1454; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1455; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1456; XOPAVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0
1457; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1458; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1459; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
1460; XOPAVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1461; XOPAVX1-NEXT: retq
1462;
1463; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
1464; XOPAVX2: # %bb.0:
1465; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm2
1466; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1467; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1468; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1469; XOPAVX2-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1470; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
1471; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1472; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1473; XOPAVX2-NEXT: vpslld %xmm4, %xmm0, %xmm0
1474; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1475; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1476; XOPAVX2-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
1477; XOPAVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1478; XOPAVX2-NEXT: retq
1479;
1480; X32-SSE-LABEL: splatvar_funnnel_v4i32:
1481; X32-SSE: # %bb.0:
1482; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1483; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
1484; X32-SSE-NEXT: pxor %xmm3, %xmm3
1485; X32-SSE-NEXT: xorps %xmm4, %xmm4
1486; X32-SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
1487; X32-SSE-NEXT: movdqa %xmm1, %xmm5
1488; X32-SSE-NEXT: psrld %xmm4, %xmm5
1489; X32-SSE-NEXT: movd %xmm2, %eax
1490; X32-SSE-NEXT: movl $32, %ecx
1491; X32-SSE-NEXT: subl %eax, %ecx
1492; X32-SSE-NEXT: movd %ecx, %xmm4
1493; X32-SSE-NEXT: pslld %xmm4, %xmm0
1494; X32-SSE-NEXT: por %xmm5, %xmm0
1495; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm2
Simon Pilgrim46b90e82018-12-18 10:08:23 +00001496; X32-SSE-NEXT: pand %xmm2, %xmm1
1497; X32-SSE-NEXT: pandn %xmm0, %xmm2
1498; X32-SSE-NEXT: por %xmm1, %xmm2
1499; X32-SSE-NEXT: movdqa %xmm2, %xmm0
1500; X32-SSE-NEXT: retl
1501 %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
1502 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat)
1503 ret <4 x i32> %res
1504}
1505
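; v8i16 has no per-lane variable shift before AVX512BW, but a splatted amount
; only needs a single scalar count: the lowerings below move the masked
; amount into the low 64 bits (pslldq/psrldq on SSE2, pmovzxwq on SSE4.1+)
; for psrlw/psllw, compute 16-amt with psubw, and select %y for a zero amount
; via pcmpeqw plus a blend (or a vptestnmw mask on AVX512BW).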
1506define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
1507; SSE2-LABEL: splatvar_funnnel_v8i16:
1508; SSE2: # %bb.0:
1509; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1510; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
1511; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1512; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1513; SSE2-NEXT: psubw %xmm3, %xmm4
1514; SSE2-NEXT: pxor %xmm2, %xmm2
1515; SSE2-NEXT: pcmpeqw %xmm3, %xmm2
1516; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
1517; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1518; SSE2-NEXT: movdqa %xmm1, %xmm5
1519; SSE2-NEXT: psrlw %xmm3, %xmm5
1520; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
1521; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1522; SSE2-NEXT: psllw %xmm4, %xmm0
1523; SSE2-NEXT: por %xmm5, %xmm0
1524; SSE2-NEXT: pand %xmm2, %xmm1
1525; SSE2-NEXT: pandn %xmm0, %xmm2
1526; SSE2-NEXT: por %xmm1, %xmm2
1527; SSE2-NEXT: movdqa %xmm2, %xmm0
1528; SSE2-NEXT: retq
1529;
1530; SSE41-LABEL: splatvar_funnnel_v8i16:
1531; SSE41: # %bb.0:
1532; SSE41-NEXT: movdqa %xmm0, %xmm3
1533; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7]
1534; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
1535; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
1536; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1537; SSE41-NEXT: movdqa %xmm1, %xmm4
1538; SSE41-NEXT: psrlw %xmm0, %xmm4
1539; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16]
1540; SSE41-NEXT: psubw %xmm2, %xmm0
1541; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1542; SSE41-NEXT: psllw %xmm0, %xmm3
1543; SSE41-NEXT: por %xmm4, %xmm3
1544; SSE41-NEXT: pxor %xmm0, %xmm0
1545; SSE41-NEXT: pcmpeqw %xmm2, %xmm0
1546; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
1547; SSE41-NEXT: movdqa %xmm3, %xmm0
1548; SSE41-NEXT: retq
1549;
1550; AVX1-LABEL: splatvar_funnnel_v8i16:
1551; AVX1: # %bb.0:
1552; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1553; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1554; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1555; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1556; AVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1557; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1558; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1559; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1560; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1561; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1562; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1563; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
1564; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1565; AVX1-NEXT: retq
1566;
1567; AVX2-LABEL: splatvar_funnnel_v8i16:
1568; AVX2: # %bb.0:
1569; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2
1570; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1571; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1572; AVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1573; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1574; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1575; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1576; AVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1577; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1578; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1579; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
1580; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1581; AVX2-NEXT: retq
1582;
1583; AVX512F-LABEL: splatvar_funnnel_v8i16:
1584; AVX512F: # %bb.0:
1585; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
1586; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1587; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1588; AVX512F-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1589; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1590; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1591; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1592; AVX512F-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1593; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
1594; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
1595; AVX512F-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
1596; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1597; AVX512F-NEXT: retq
1598;
1599; AVX512VL-LABEL: splatvar_funnnel_v8i16:
1600; AVX512VL: # %bb.0:
1601; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2
1602; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1603; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1604; AVX512VL-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1605; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1606; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1607; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1608; AVX512VL-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1609; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
1610; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
1611; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
1612; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1613; AVX512VL-NEXT: retq
1614;
1615; AVX512BW-LABEL: splatvar_funnnel_v8i16:
1616; AVX512BW: # %bb.0:
1617; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1618; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2
1619; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1620; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1621; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1622; AVX512BW-NEXT: vpsrlw %xmm5, %xmm1, %xmm5
1623; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
1624; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
1625; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1626; AVX512BW-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1627; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
1628; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
1629; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
1630; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1631; AVX512BW-NEXT: vzeroupper
1632; AVX512BW-NEXT: retq
1633;
1634; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
1635; AVX512VLBW: # %bb.0:
1636; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2
1637; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1638; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1639; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1640; AVX512VLBW-NEXT: vpsrlw %xmm5, %xmm1, %xmm5
1641; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
1642; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
1643; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1644; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1645; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
1646; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1
1647; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
1648; AVX512VLBW-NEXT: retq
1649;
1650; XOPAVX1-LABEL: splatvar_funnnel_v8i16:
1651; XOPAVX1: # %bb.0:
1652; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1653; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1654; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1655; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1656; XOPAVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1657; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1658; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1659; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1660; XOPAVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1661; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1662; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1663; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
1664; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1665; XOPAVX1-NEXT: retq
1666;
1667; XOPAVX2-LABEL: splatvar_funnnel_v8i16:
1668; XOPAVX2: # %bb.0:
1669; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2
1670; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1671; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1672; XOPAVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1673; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1674; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1675; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1676; XOPAVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1677; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1678; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1679; XOPAVX2-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
1680; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1681; XOPAVX2-NEXT: retq
1682;
1683; X32-SSE-LABEL: splatvar_funnnel_v8i16:
1684; X32-SSE: # %bb.0:
1685; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1686; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
1687; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3
1688; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1689; X32-SSE-NEXT: psubw %xmm3, %xmm4
1690; X32-SSE-NEXT: pxor %xmm2, %xmm2
1691; X32-SSE-NEXT: pcmpeqw %xmm3, %xmm2
1692; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
1693; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1694; X32-SSE-NEXT: movdqa %xmm1, %xmm5
1695; X32-SSE-NEXT: psrlw %xmm3, %xmm5
1696; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
1697; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1698; X32-SSE-NEXT: psllw %xmm4, %xmm0
1699; X32-SSE-NEXT: por %xmm5, %xmm0
1700; X32-SSE-NEXT: pand %xmm2, %xmm1
1701; X32-SSE-NEXT: pandn %xmm0, %xmm2
1702; X32-SSE-NEXT: por %xmm1, %xmm2
1703; X32-SSE-NEXT: movdqa %xmm2, %xmm0
1704; X32-SSE-NEXT: retl
1705 %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
1706 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat)
1707 ret <8 x i16> %res
1708}
1709
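; x86 has no byte shifts, so the splatted v16i8 case shifts whole words and
; masks off the bits that crossed into the neighbouring byte (the mask is
; built by shifting an all-ones vector by the same amount). AVX512 targets
; instead widen to i16/i32 elements, use the variable shifts, and truncate
; back; XOP uses vpshlb, which shifts right when given a negated amount.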
1710define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
1711; SSE2-LABEL: splatvar_funnnel_v16i8:
1712; SSE2: # %bb.0:
1713; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1714; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1715; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
1716; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1717; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1718; SSE2-NEXT: psubb %xmm3, %xmm4
1719; SSE2-NEXT: pxor %xmm2, %xmm2
1720; SSE2-NEXT: pcmpeqb %xmm3, %xmm2
1721; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
1722; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1723; SSE2-NEXT: movdqa %xmm1, %xmm5
1724; SSE2-NEXT: psrlw %xmm3, %xmm5
1725; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
1726; SSE2-NEXT: psrlw %xmm3, %xmm6
1727; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
1728; SSE2-NEXT: psrlw $8, %xmm6
1729; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1730; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
1731; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
1732; SSE2-NEXT: pand %xmm5, %xmm6
1733; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
1734; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1735; SSE2-NEXT: psllw %xmm4, %xmm0
1736; SSE2-NEXT: psllw %xmm4, %xmm3
1737; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1738; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7]
1739; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
1740; SSE2-NEXT: pand %xmm0, %xmm3
1741; SSE2-NEXT: por %xmm6, %xmm3
1742; SSE2-NEXT: pand %xmm2, %xmm1
1743; SSE2-NEXT: pandn %xmm3, %xmm2
1744; SSE2-NEXT: por %xmm1, %xmm2
1745; SSE2-NEXT: movdqa %xmm2, %xmm0
1746; SSE2-NEXT: retq
1747;
1748; SSE41-LABEL: splatvar_funnnel_v16i8:
1749; SSE41: # %bb.0:
1750; SSE41-NEXT: movdqa %xmm0, %xmm3
1751; SSE41-NEXT: pxor %xmm0, %xmm0
1752; SSE41-NEXT: pshufb %xmm0, %xmm2
1753; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
1754; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1755; SSE41-NEXT: movdqa %xmm1, %xmm5
1756; SSE41-NEXT: psrlw %xmm4, %xmm5
1757; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
1758; SSE41-NEXT: pcmpeqd %xmm7, %xmm7
1759; SSE41-NEXT: psrlw %xmm4, %xmm7
1760; SSE41-NEXT: pshufb {{.*#+}} xmm7 = xmm7[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1761; SSE41-NEXT: pand %xmm5, %xmm7
1762; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1763; SSE41-NEXT: psubb %xmm2, %xmm4
1764; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1765; SSE41-NEXT: psllw %xmm4, %xmm3
1766; SSE41-NEXT: psllw %xmm4, %xmm6
1767; SSE41-NEXT: pshufb %xmm0, %xmm6
1768; SSE41-NEXT: pand %xmm6, %xmm3
1769; SSE41-NEXT: por %xmm7, %xmm3
1770; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
1771; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
1772; SSE41-NEXT: movdqa %xmm3, %xmm0
1773; SSE41-NEXT: retq
1774;
1775; AVX1-LABEL: splatvar_funnnel_v16i8:
1776; AVX1: # %bb.0:
1777; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1778; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1779; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1780; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1781; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm5
1782; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
1783; AVX1-NEXT: vpsrlw %xmm4, %xmm6, %xmm4
1784; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1785; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
1786; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1787; AVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5
1788; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero
1789; AVX1-NEXT: vpsllw %xmm5, %xmm0, %xmm0
1790; AVX1-NEXT: vpsllw %xmm5, %xmm6, %xmm5
1791; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm5
1792; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
1793; AVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
1794; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
1795; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1796; AVX1-NEXT: retq
1797;
1798; AVX2-LABEL: splatvar_funnnel_v16i8:
1799; AVX2: # %bb.0:
1800; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
1801; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1802; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1803; AVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm4
1804; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
1805; AVX2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
1806; AVX2-NEXT: vpsrlw $8, %xmm3, %xmm3
1807; AVX2-NEXT: vpbroadcastb %xmm3, %xmm3
1808; AVX2-NEXT: vpand %xmm3, %xmm4, %xmm3
1809; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1810; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
1811; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1812; AVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1813; AVX2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
1814; AVX2-NEXT: vpbroadcastb %xmm4, %xmm4
1815; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
1816; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1817; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1818; AVX2-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
1819; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1820; AVX2-NEXT: retq
1821;
1822; AVX512F-LABEL: splatvar_funnnel_v16i8:
1823; AVX512F: # %bb.0:
1824; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
1825; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1826; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1827; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1828; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
1829; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1830; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
1831; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
1832; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1833; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
1834; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
1835; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1836; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
1837; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
1838; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1839; AVX512F-NEXT: vzeroupper
1840; AVX512F-NEXT: retq
1841;
1842; AVX512VL-LABEL: splatvar_funnnel_v16i8:
1843; AVX512VL: # %bb.0:
1844; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
1845; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1846; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1847; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1848; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
1849; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1850; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
1851; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
1852; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1853; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
1854; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
1855; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
1856; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
1857; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
1858; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1859; AVX512VL-NEXT: vzeroupper
1860; AVX512VL-NEXT: retq
1861;
1862; AVX512BW-LABEL: splatvar_funnnel_v16i8:
1863; AVX512BW: # %bb.0:
1864; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1865; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
1866; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1867; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1868; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5
1869; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
1870; AVX512BW-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
1871; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1872; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1873; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5
1874; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
1875; AVX512BW-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
1876; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
1877; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1878; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1
1879; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
1880; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1881; AVX512BW-NEXT: vzeroupper
1882; AVX512BW-NEXT: retq
1883;
1884; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
1885; AVX512VLBW: # %bb.0:
1886; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
1887; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1888; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1889; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
1890; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1891; AVX512VLBW-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
1892; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1893; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
1894; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
1895; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1896; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
1897; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
1898; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
1899; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1
1900; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
1901; AVX512VLBW-NEXT: vzeroupper
1902; AVX512VLBW-NEXT: retq
1903;
1904; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
1905; XOPAVX1: # %bb.0:
1906; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1907; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1908; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1909; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm4
1910; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm4
1911; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1912; XOPAVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5
1913; XOPAVX1-NEXT: vpshlb %xmm5, %xmm0, %xmm0
1914; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
1915; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
1916; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1917; XOPAVX1-NEXT: retq
1918;
1919; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
1920; XOPAVX2: # %bb.0:
1921; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2
1922; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1923; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1924; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm4
1925; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm4
1926; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1927; XOPAVX2-NEXT: vpsubb %xmm2, %xmm5, %xmm5
1928; XOPAVX2-NEXT: vpshlb %xmm5, %xmm0, %xmm0
1929; XOPAVX2-NEXT: vpor %xmm4, %xmm0, %xmm0
1930; XOPAVX2-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
1931; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1932; XOPAVX2-NEXT: retq
1933;
1934; X32-SSE-LABEL: splatvar_funnnel_v16i8:
1935; X32-SSE: # %bb.0:
1936; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1937; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1938; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
1939; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3
1940; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1941; X32-SSE-NEXT: psubb %xmm3, %xmm4
1942; X32-SSE-NEXT: pxor %xmm2, %xmm2
1943; X32-SSE-NEXT: pcmpeqb %xmm3, %xmm2
1944; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
1945; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1946; X32-SSE-NEXT: movdqa %xmm1, %xmm5
1947; X32-SSE-NEXT: psrlw %xmm3, %xmm5
1948; X32-SSE-NEXT: pcmpeqd %xmm6, %xmm6
1949; X32-SSE-NEXT: psrlw %xmm3, %xmm6
1950; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm3
1951; X32-SSE-NEXT: psrlw $8, %xmm6
1952; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1953; X32-SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
1954; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
1955; X32-SSE-NEXT: pand %xmm5, %xmm6
1956; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
1957; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1958; X32-SSE-NEXT: psllw %xmm4, %xmm0
1959; X32-SSE-NEXT: psllw %xmm4, %xmm3
1960; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1961; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7]
1962; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
1963; X32-SSE-NEXT: pand %xmm0, %xmm3
1964; X32-SSE-NEXT: por %xmm6, %xmm3
1965; X32-SSE-NEXT: pand %xmm2, %xmm1
1966; X32-SSE-NEXT: pandn %xmm3, %xmm2
1967; X32-SSE-NEXT: por %xmm1, %xmm2
1968; X32-SSE-NEXT: movdqa %xmm2, %xmm0
1969; X32-SSE-NEXT: retl
1970 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
1971 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
1972 ret <16 x i8> %res
1973}
1974
1975;
1976; Constant Shifts
1977;
1978
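; With constant amounts <4,14> no zero-amount blend is needed: pre-AVX2
; targets shift each half separately with immediate psrlq/psllq (by 4/14 and
; the complements 60/50) and merge the halves, while AVX2 and later use the
; per-lane vpsrlvq/vpsllvq with constant-pool operands.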
1979define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
1980; SSE2-LABEL: constant_funnnel_v2i64:
1981; SSE2: # %bb.0:
1982; SSE2-NEXT: movdqa %xmm1, %xmm2
1983; SSE2-NEXT: psrlq $4, %xmm2
1984; SSE2-NEXT: psrlq $14, %xmm1
1985; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1986; SSE2-NEXT: movdqa %xmm0, %xmm2
1987; SSE2-NEXT: psllq $60, %xmm2
1988; SSE2-NEXT: psllq $50, %xmm0
1989; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1990; SSE2-NEXT: orpd %xmm1, %xmm0
1991; SSE2-NEXT: retq
1992;
1993; SSE41-LABEL: constant_funnnel_v2i64:
1994; SSE41: # %bb.0:
1995; SSE41-NEXT: movdqa %xmm1, %xmm2
1996; SSE41-NEXT: psrlq $14, %xmm2
1997; SSE41-NEXT: psrlq $4, %xmm1
1998; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1999; SSE41-NEXT: movdqa %xmm0, %xmm2
2000; SSE41-NEXT: psllq $50, %xmm2
2001; SSE41-NEXT: psllq $60, %xmm0
2002; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
2003; SSE41-NEXT: por %xmm1, %xmm0
2004; SSE41-NEXT: retq
2005;
2006; AVX1-LABEL: constant_funnnel_v2i64:
2007; AVX1: # %bb.0:
2008; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm2
2009; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm1
2010; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
2011; AVX1-NEXT: vpsllq $50, %xmm0, %xmm2
2012; AVX1-NEXT: vpsllq $60, %xmm0, %xmm0
2013; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
2014; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2015; AVX1-NEXT: retq
2016;
2017; AVX2-LABEL: constant_funnnel_v2i64:
2018; AVX2: # %bb.0:
2019; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2020; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2021; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2022; AVX2-NEXT: retq
2023;
2024; AVX512-LABEL: constant_funnnel_v2i64:
2025; AVX512: # %bb.0:
2026; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2027; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2028; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
2029; AVX512-NEXT: retq
2030;
2031; XOPAVX1-LABEL: constant_funnnel_v2i64:
2032; XOPAVX1: # %bb.0:
2033; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm1, %xmm1
2034; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
2035; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2036; XOPAVX1-NEXT: retq
2037;
2038; XOPAVX2-LABEL: constant_funnnel_v2i64:
2039; XOPAVX2: # %bb.0:
2040; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2041; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2042; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2043; XOPAVX2-NEXT: retq
2044;
2045; X32-SSE-LABEL: constant_funnnel_v2i64:
2046; X32-SSE: # %bb.0:
2047; X32-SSE-NEXT: movdqa %xmm1, %xmm2
2048; X32-SSE-NEXT: psrlq $4, %xmm2
2049; X32-SSE-NEXT: psrlq $14, %xmm1
2050; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2051; X32-SSE-NEXT: movdqa %xmm0, %xmm2
2052; X32-SSE-NEXT: psllq $60, %xmm2
2053; X32-SSE-NEXT: psllq $50, %xmm0
2054; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
2055; X32-SSE-NEXT: orpd %xmm1, %xmm0
2056; X32-SSE-NEXT: retl
2057 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>)
2058 ret <2 x i64> %res
2059}
2060
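; For constant <4,5,6,7> the left shifts by 28..25 fold into a multiply:
; pmuludq/pmulld by [2^28,2^27,2^26,2^25] (the [268435456,...] constant
; below), while the right shifts stay as immediate psrld results merged lane
; by lane. AVX2 and later again use vpsrlvd/vpsllvd directly.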
2061define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
2062; SSE2-LABEL: constant_funnnel_v4i32:
2063; SSE2: # %bb.0:
2064; SSE2-NEXT: movdqa %xmm1, %xmm2
2065; SSE2-NEXT: psrld $7, %xmm2
2066; SSE2-NEXT: movdqa %xmm1, %xmm3
2067; SSE2-NEXT: psrld $6, %xmm3
2068; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
2069; SSE2-NEXT: movdqa %xmm1, %xmm2
2070; SSE2-NEXT: psrld $5, %xmm2
2071; SSE2-NEXT: psrld $4, %xmm1
2072; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2073; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
2074; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [268435456,134217728,67108864,33554432]
2075; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2076; SSE2-NEXT: pmuludq %xmm2, %xmm0
2077; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2078; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2079; SSE2-NEXT: pmuludq %xmm3, %xmm2
2080; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2081; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2082; SSE2-NEXT: por %xmm1, %xmm0
2083; SSE2-NEXT: retq
2084;
2085; SSE41-LABEL: constant_funnnel_v4i32:
2086; SSE41: # %bb.0:
2087; SSE41-NEXT: movdqa %xmm1, %xmm2
2088; SSE41-NEXT: psrld $7, %xmm2
2089; SSE41-NEXT: movdqa %xmm1, %xmm3
2090; SSE41-NEXT: psrld $5, %xmm3
2091; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2092; SSE41-NEXT: movdqa %xmm1, %xmm2
2093; SSE41-NEXT: psrld $6, %xmm2
2094; SSE41-NEXT: psrld $4, %xmm1
2095; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
2096; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
2097; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
2098; SSE41-NEXT: por %xmm1, %xmm0
2099; SSE41-NEXT: retq
2100;
2101; AVX1-LABEL: constant_funnnel_v4i32:
2102; AVX1: # %bb.0:
2103; AVX1-NEXT: vpsrld $7, %xmm1, %xmm2
2104; AVX1-NEXT: vpsrld $5, %xmm1, %xmm3
2105; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2106; AVX1-NEXT: vpsrld $6, %xmm1, %xmm3
2107; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
2108; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
2109; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
2110; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2111; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2112; AVX1-NEXT: retq
2113;
2114; AVX2-LABEL: constant_funnnel_v4i32:
2115; AVX2: # %bb.0:
2116; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2117; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2118; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2119; AVX2-NEXT: retq
2120;
2121; AVX512-LABEL: constant_funnnel_v4i32:
2122; AVX512: # %bb.0:
2123; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2124; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2125; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
2126; AVX512-NEXT: retq
2127;
2128; XOPAVX1-LABEL: constant_funnnel_v4i32:
2129; XOPAVX1: # %bb.0:
2130; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm1, %xmm1
2131; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
2132; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2133; XOPAVX1-NEXT: retq
2134;
2135; XOPAVX2-LABEL: constant_funnnel_v4i32:
2136; XOPAVX2: # %bb.0:
2137; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2138; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2139; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2140; XOPAVX2-NEXT: retq
2141;
2142; X32-SSE-LABEL: constant_funnnel_v4i32:
2143; X32-SSE: # %bb.0:
2144; X32-SSE-NEXT: movdqa %xmm1, %xmm2
2145; X32-SSE-NEXT: psrld $7, %xmm2
2146; X32-SSE-NEXT: movdqa %xmm1, %xmm3
2147; X32-SSE-NEXT: psrld $6, %xmm3
2148; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
2149; X32-SSE-NEXT: movdqa %xmm1, %xmm2
2150; X32-SSE-NEXT: psrld $5, %xmm2
2151; X32-SSE-NEXT: psrld $4, %xmm1
2152; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2153; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
2154; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [268435456,134217728,67108864,33554432]
2155; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2156; X32-SSE-NEXT: pmuludq %xmm2, %xmm0
2157; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2158; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2159; X32-SSE-NEXT: pmuludq %xmm3, %xmm2
2160; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2161; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2162; X32-SSE-NEXT: por %xmm1, %xmm0
2163; X32-SSE-NEXT: retl
2164 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
2165 ret <4 x i32> %res
2166}
2167
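; A logical right shift of an i16 by n is the high half of an unsigned
; multiply by 2^(16-n), so pmulhuw with <u,32768,16384,...,512> produces the
; %y >> <0..7> terms while pmullw with the same constant produces the
; matching %x << (16-n) terms. Lane 0 has amount 0 (the 'u' slot), so it is
; blended back from %y afterwards.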
2168define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
2169; SSE2-LABEL: constant_funnnel_v8i16:
2170; SSE2: # %bb.0:
2171; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
2172; SSE2-NEXT: movdqa %xmm2, %xmm3
2173; SSE2-NEXT: pandn %xmm1, %xmm3
2174; SSE2-NEXT: movdqa {{.*#+}} xmm4 = <u,32768,16384,8192,4096,2048,1024,512>
2175; SSE2-NEXT: pmulhuw %xmm4, %xmm1
2176; SSE2-NEXT: pand %xmm2, %xmm1
2177; SSE2-NEXT: pmullw %xmm4, %xmm0
2178; SSE2-NEXT: por %xmm3, %xmm0
2179; SSE2-NEXT: por %xmm1, %xmm0
2180; SSE2-NEXT: pand %xmm2, %xmm0
2181; SSE2-NEXT: por %xmm3, %xmm0
2182; SSE2-NEXT: retq
2183;
2184; SSE41-LABEL: constant_funnnel_v8i16:
2185; SSE41: # %bb.0:
2186; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2187; SSE41-NEXT: movdqa %xmm1, %xmm3
2188; SSE41-NEXT: pmulhuw %xmm2, %xmm3
2189; SSE41-NEXT: pmullw %xmm2, %xmm0
2190; SSE41-NEXT: por %xmm3, %xmm0
2191; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2192; SSE41-NEXT: retq
2193;
2194; AVX-LABEL: constant_funnnel_v8i16:
2195; AVX: # %bb.0:
2196; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2197; AVX-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
2198; AVX-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2199; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
2200; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2201; AVX-NEXT: retq
2202;
2203; AVX512F-LABEL: constant_funnnel_v8i16:
2204; AVX512F: # %bb.0:
2205; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2206; AVX512F-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
2207; AVX512F-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2208; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
2209; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2210; AVX512F-NEXT: retq
2211;
2212; AVX512VL-LABEL: constant_funnnel_v8i16:
2213; AVX512VL: # %bb.0:
2214; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2215; AVX512VL-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
2216; AVX512VL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2217; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
2218; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2219; AVX512VL-NEXT: retq
2220;
2221; AVX512BW-LABEL: constant_funnnel_v8i16:
2222; AVX512BW: # %bb.0:
2223; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2224; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2225; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
2226; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm2
2227; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,15,14,13,12,11,10,9]
2228; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
2229; AVX512BW-NEXT: vpor %xmm2, %xmm0, %xmm0
2230; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2231; AVX512BW-NEXT: vzeroupper
2232; AVX512BW-NEXT: retq
2233;
2234; AVX512VLBW-LABEL: constant_funnnel_v8i16:
2235; AVX512VLBW: # %bb.0:
2236; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %xmm1, %xmm2
2237; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
2238; AVX512VLBW-NEXT: vpor %xmm2, %xmm0, %xmm0
2239; AVX512VLBW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2240; AVX512VLBW-NEXT: retq
2241;
2242; XOP-LABEL: constant_funnnel_v8i16:
2243; XOP: # %bb.0:
2244; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm2
2245; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
2246; XOP-NEXT: vpor %xmm2, %xmm0, %xmm0
2247; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2248; XOP-NEXT: retq
2249;
2250; X32-SSE-LABEL: constant_funnnel_v8i16:
2251; X32-SSE: # %bb.0:
2252; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
2253; X32-SSE-NEXT: movdqa %xmm2, %xmm3
2254; X32-SSE-NEXT: pandn %xmm1, %xmm3
2255; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = <u,32768,16384,8192,4096,2048,1024,512>
2256; X32-SSE-NEXT: pmulhuw %xmm4, %xmm1
2257; X32-SSE-NEXT: pand %xmm2, %xmm1
2258; X32-SSE-NEXT: pmullw %xmm4, %xmm0
2259; X32-SSE-NEXT: por %xmm3, %xmm0
2260; X32-SSE-NEXT: por %xmm1, %xmm0
2261; X32-SSE-NEXT: pand %xmm2, %xmm0
2262; X32-SSE-NEXT: por %xmm3, %xmm0
2263; X32-SSE-NEXT: retl
2264 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
2265 ret <8 x i16> %res
2266}
2267
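; The v16i8 constant case reuses the multiply trick at byte width: unpack the
; bytes to words, pmullw by per-byte powers of two, then take the high byte
; (psrlw $8) for the right-shift half and the masked low byte for the
; left-shift half before repacking with packuswb. The lanes whose amount is
; zero (bytes 0 and 8, per the [0,255,...] blend mask) are taken from %y
; unchanged.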
2268define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; SSE2-LABEL: constant_funnnel_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm4
; SSE2-NEXT: psrlw $8, %xmm4
; SSE2-NEXT: packuswb %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_funnnel_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT: pand %xmm3, %xmm2
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm3
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm4
; SSE41-NEXT: psrlw $8, %xmm4
; SSE41-NEXT: packuswb %xmm3, %xmm4
; SSE41-NEXT: por %xmm2, %xmm4
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_funnnel_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_funnnel_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: constant_funnnel_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [8,7,6,5,4,3,2,1,8,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT: movw $257, %ax # imm = 0x101
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; XOP-LABEL: constant_funnnel_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm2
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpor %xmm2, %xmm0, %xmm0
; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_funnnel_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm3
; X32-SSE-NEXT: psrlw $8, %xmm3
; X32-SSE-NEXT: movdqa %xmm1, %xmm4
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: psrlw $8, %xmm4
; X32-SSE-NEXT: packuswb %xmm3, %xmm4
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X32-SSE-NEXT: pand %xmm3, %xmm2
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: packuswb %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: pandn %xmm1, %xmm2
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
  ret <16 x i8> %res
}
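
; With per-element constant amounts the i8 lowering above avoids variable byte
; shifts entirely: the SSE/AVX paths widen bytes to i16 and use pmullw, turning
; each shift-by-C into a multiply by a constant-pool vector of powers of two
; (the operands themselves are elided by the {{.*}}(%rip) patterns). Since i8
; amounts are taken modulo 8, lanes 0 and 8 (amounts 0 and 8) must return %y
; unchanged, hence the final blend in every path (pand/pandn on SSE2, pblendvb
; on SSE4.1/AVX, a 0x101 kmask vmovdqu8 merge on AVX512VLBW) with a mask that
; selects %y in exactly those two lanes.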

;
; Uniform Constant Shifts
;

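; With a uniform (splat) constant amount, fshr lowers to two immediate-count
; shifts plus an OR: res = (y lshr C) | (x shl (BitWidth - C)). The amount is
; a known nonzero constant, so no zero-amount select is needed. For the v2i64
; case below, C = 14 gives psrlq $14 and psllq $50 (64 - 14 = 50); v4i32 and
; v8i16 follow the same pattern with pslld $28 (32 - 4) and psllw $9 (16 - 7).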
define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE-LABEL: splatconstant_funnnel_v2i64:
; SSE: # %bb.0:
; SSE-NEXT: psrlq $14, %xmm1
; SSE-NEXT: psllq $50, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_funnnel_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $14, %xmm1, %xmm1
; AVX-NEXT: vpsllq $50, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: splatconstant_funnnel_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq $14, %xmm1, %xmm1
; AVX512-NEXT: vpsllq $50, %xmm0, %xmm0
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vpsrlq $14, %xmm1, %xmm1
; XOP-NEXT: vpsllq $50, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_funnnel_v2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlq $14, %xmm1
; X32-SSE-NEXT: psllq $50, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
  ret <2 x i64> %res
}

define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE-LABEL: splatconstant_funnnel_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: psrld $4, %xmm1
; SSE-NEXT: pslld $28, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_funnnel_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $4, %xmm1, %xmm1
; AVX-NEXT: vpslld $28, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: splatconstant_funnnel_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrld $4, %xmm1, %xmm1
; AVX512-NEXT: vpslld $28, %xmm0, %xmm0
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vpsrld $4, %xmm1, %xmm1
; XOP-NEXT: vpslld $28, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_funnnel_v4i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrld $4, %xmm1
; X32-SSE-NEXT: pslld $28, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
  ret <4 x i32> %res
}

define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE-LABEL: splatconstant_funnnel_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: psrlw $7, %xmm1
; SSE-NEXT: psllw $9, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_funnnel_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX-NEXT: vpsllw $9, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: splatconstant_funnnel_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $7, %xmm1, %xmm1
; AVX512-NEXT: vpsllw $9, %xmm0, %xmm0
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpsrlw $7, %xmm1, %xmm1
; XOP-NEXT: vpsllw $9, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_funnnel_v8i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlw $7, %xmm1
; X32-SSE-NEXT: psllw $9, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
  ret <8 x i16> %res
}

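; The splat i8 case needs extra masking because x86 has no per-byte shift
; instructions: psrlw/psllw shift whole 16-bit words, so the results are
; pand'ed with constant-pool byte masks (0x0f/0xf0 style) to clear the bits
; that crossed byte boundaries. XOP targets can instead use vpshlb directly
; with splat constant amount vectors (a negative count shifts right).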
define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; SSE-LABEL: splatconstant_funnnel_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psrlw $4, %xmm1
; SSE-NEXT: pand {{.*}}(%rip), %xmm1
; SSE-NEXT: psllw $4, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_funnnel_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: splatconstant_funnnel_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; XOP-LABEL: splatconstant_funnnel_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_funnnel_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: psllw $4, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
  ret <16 x i8> %res
}