; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2

; Just one 32-bit run to make sure we do reasonable things for i64 cases.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2

16declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
17declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
18declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
19declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
20
21;
22; Variable Shifts
23;
24
25define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
26; SSE2-LABEL: var_funnnel_v2i64:
27; SSE2: # %bb.0:
28; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
29; SSE2-NEXT: movdqa %xmm1, %xmm3
30; SSE2-NEXT: psrlq %xmm2, %xmm3
31; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
32; SSE2-NEXT: movdqa %xmm1, %xmm5
33; SSE2-NEXT: psrlq %xmm4, %xmm5
34; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
35; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [64,64]
36; SSE2-NEXT: psubq %xmm2, %xmm3
37; SSE2-NEXT: movdqa %xmm0, %xmm4
38; SSE2-NEXT: psllq %xmm3, %xmm4
39; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
40; SSE2-NEXT: psllq %xmm3, %xmm0
41; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
42; SSE2-NEXT: orpd %xmm5, %xmm0
43; SSE2-NEXT: pxor %xmm3, %xmm3
44; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
45; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
46; SSE2-NEXT: pand %xmm3, %xmm2
47; SSE2-NEXT: pand %xmm2, %xmm1
48; SSE2-NEXT: pandn %xmm0, %xmm2
49; SSE2-NEXT: por %xmm1, %xmm2
50; SSE2-NEXT: movdqa %xmm2, %xmm0
51; SSE2-NEXT: retq
52;
53; SSE41-LABEL: var_funnnel_v2i64:
54; SSE41: # %bb.0:
55; SSE41-NEXT: movdqa %xmm0, %xmm3
56; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
57; SSE41-NEXT: movdqa %xmm1, %xmm0
58; SSE41-NEXT: psrlq %xmm2, %xmm0
59; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
60; SSE41-NEXT: movdqa %xmm1, %xmm5
61; SSE41-NEXT: psrlq %xmm4, %xmm5
62; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0,1,2,3],xmm5[4,5,6,7]
63; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [64,64]
64; SSE41-NEXT: psubq %xmm2, %xmm0
65; SSE41-NEXT: movdqa %xmm3, %xmm4
66; SSE41-NEXT: psllq %xmm0, %xmm4
67; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
68; SSE41-NEXT: psllq %xmm0, %xmm3
69; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
70; SSE41-NEXT: por %xmm5, %xmm3
71; SSE41-NEXT: pxor %xmm0, %xmm0
72; SSE41-NEXT: pcmpeqq %xmm2, %xmm0
73; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
74; SSE41-NEXT: movapd %xmm3, %xmm0
75; SSE41-NEXT: retq
76;
77; AVX1-LABEL: var_funnnel_v2i64:
78; AVX1: # %bb.0:
79; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
80; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
81; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
82; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm4
83; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
84; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
85; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
86; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm5
87; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
88; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
89; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
90; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
91; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
92; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
93; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
94; AVX1-NEXT: retq
95;
96; AVX2-LABEL: var_funnnel_v2i64:
97; AVX2: # %bb.0:
98; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
99; AVX2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
100; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
101; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
102; AVX2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
103; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
104; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
105; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
106; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
107; AVX2-NEXT: retq
108;
109; AVX512F-LABEL: var_funnnel_v2i64:
110; AVX512F: # %bb.0:
111; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
112; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
113; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
114; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
115; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
116; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
117; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
118; AVX512F-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
119; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
120; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
121; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
122; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
123; AVX512F-NEXT: vzeroupper
124; AVX512F-NEXT: retq
125;
126; AVX512VL-LABEL: var_funnnel_v2i64:
127; AVX512VL: # %bb.0:
128; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
129; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
130; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
131; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
132; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
133; AVX512VL-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
134; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
135; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1
136; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
137; AVX512VL-NEXT: retq
138;
139; AVX512BW-LABEL: var_funnnel_v2i64:
140; AVX512BW: # %bb.0:
141; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
142; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
143; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
144; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
145; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
146; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
147; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
148; AVX512BW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
149; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
150; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
151; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
152; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
153; AVX512BW-NEXT: vzeroupper
154; AVX512BW-NEXT: retq
155;
156; AVX512VLBW-LABEL: var_funnnel_v2i64:
157; AVX512VLBW: # %bb.0:
158; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
159; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
160; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
161; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
162; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
163; AVX512VLBW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
164; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
165; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1
166; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
167; AVX512VLBW-NEXT: retq
168;
169; XOPAVX1-LABEL: var_funnnel_v2i64:
170; XOPAVX1: # %bb.0:
171; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
172; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
173; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm4
174; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm4
175; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [64,64]
176; XOPAVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm5
177; XOPAVX1-NEXT: vpshlq %xmm5, %xmm0, %xmm0
178; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
179; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
180; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
181; XOPAVX1-NEXT: retq
182;
183; XOPAVX2-LABEL: var_funnnel_v2i64:
184; XOPAVX2: # %bb.0:
185; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
186; XOPAVX2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
187; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
188; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
189; XOPAVX2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
190; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
191; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
192; XOPAVX2-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
193; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
194; XOPAVX2-NEXT: retq
195;
196; X32-SSE-LABEL: var_funnnel_v2i64:
197; X32-SSE: # %bb.0:
198; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
199; X32-SSE-NEXT: movdqa %xmm1, %xmm3
200; X32-SSE-NEXT: psrlq %xmm2, %xmm3
201; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
202; X32-SSE-NEXT: movdqa %xmm1, %xmm5
203; X32-SSE-NEXT: psrlq %xmm4, %xmm5
204; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
205; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0]
206; X32-SSE-NEXT: psubq %xmm2, %xmm3
207; X32-SSE-NEXT: movdqa %xmm0, %xmm4
208; X32-SSE-NEXT: psllq %xmm3, %xmm4
209; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
210; X32-SSE-NEXT: psllq %xmm3, %xmm0
211; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
212; X32-SSE-NEXT: orpd %xmm5, %xmm0
213; X32-SSE-NEXT: pxor %xmm3, %xmm3
214; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm3
215; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
216; X32-SSE-NEXT: pand %xmm3, %xmm2
217; X32-SSE-NEXT: pand %xmm2, %xmm1
218; X32-SSE-NEXT: pandn %xmm0, %xmm2
219; X32-SSE-NEXT: por %xmm1, %xmm2
220; X32-SSE-NEXT: movdqa %xmm2, %xmm0
221; X32-SSE-NEXT: retl
222 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
223 ret <2 x i64> %res
224}
225
; Variable-amount funnel-shift-right on <4 x i32> lanes.
; Per the per-target checks below, the lowering is:
;   amt &= 31;  res = (y lshr amt) | (x shl (32 - amt))
; with a compare-with-zero + blend selecting y unchanged when
; (amt mod 32) == 0. Pre-AVX2 targets have no variable per-lane i32
; shifts, so SSE2/SSE41/AVX1 emit four scalar-amount psrld ops (checks
; show per-element extraction via pshufd/pshuflw), and the variable left
; shift is emulated as a multiply by 2^(32-amt) built with pslld $23 /
; paddd / cvttps2dq.
; NOTE(review): the CHECK lines are autogenerated — regenerate them with
; utils/update_llc_test_checks.py instead of editing by hand.
226define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
227; SSE2-LABEL: var_funnnel_v4i32:
228; SSE2: # %bb.0:
229; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
230; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
231; SSE2-NEXT: movdqa %xmm1, %xmm4
232; SSE2-NEXT: psrld %xmm3, %xmm4
233; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,1,1,4,5,6,7]
234; SSE2-NEXT: movdqa %xmm1, %xmm3
235; SSE2-NEXT: psrld %xmm5, %xmm3
236; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
237; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
238; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
239; SSE2-NEXT: movdqa %xmm1, %xmm6
240; SSE2-NEXT: psrld %xmm5, %xmm6
241; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
242; SSE2-NEXT: movdqa %xmm1, %xmm5
243; SSE2-NEXT: psrld %xmm4, %xmm5
244; SSE2-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
245; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[0,3]
246; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32,32,32,32]
247; SSE2-NEXT: psubd %xmm2, %xmm4
248; SSE2-NEXT: pslld $23, %xmm4
249; SSE2-NEXT: paddd {{.*}}(%rip), %xmm4
250; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
251; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
252; SSE2-NEXT: pmuludq %xmm4, %xmm0
253; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
254; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
255; SSE2-NEXT: pmuludq %xmm5, %xmm0
256; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
257; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
258; SSE2-NEXT: por %xmm3, %xmm6
259; SSE2-NEXT: pxor %xmm0, %xmm0
260; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
261; SSE2-NEXT: pand %xmm0, %xmm1
262; SSE2-NEXT: pandn %xmm6, %xmm0
263; SSE2-NEXT: por %xmm1, %xmm0
264; SSE2-NEXT: retq
265;
266; SSE41-LABEL: var_funnnel_v4i32:
267; SSE41: # %bb.0:
268; SSE41-NEXT: movdqa %xmm0, %xmm3
269; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
270; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,3,3,3,4,5,6,7]
271; SSE41-NEXT: movdqa %xmm1, %xmm4
272; SSE41-NEXT: psrld %xmm0, %xmm4
273; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
274; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,3,3,3,4,5,6,7]
275; SSE41-NEXT: movdqa %xmm1, %xmm6
276; SSE41-NEXT: psrld %xmm5, %xmm6
277; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
278; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
279; SSE41-NEXT: movdqa %xmm1, %xmm5
280; SSE41-NEXT: psrld %xmm4, %xmm5
281; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
282; SSE41-NEXT: movdqa %xmm1, %xmm4
283; SSE41-NEXT: psrld %xmm0, %xmm4
284; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
285; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
286; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
287; SSE41-NEXT: psubd %xmm2, %xmm0
288; SSE41-NEXT: pslld $23, %xmm0
289; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
290; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
291; SSE41-NEXT: pmulld %xmm0, %xmm3
292; SSE41-NEXT: por %xmm4, %xmm3
293; SSE41-NEXT: pxor %xmm0, %xmm0
294; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
295; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
296; SSE41-NEXT: movaps %xmm3, %xmm0
297; SSE41-NEXT: retq
298;
299; AVX1-LABEL: var_funnnel_v4i32:
300; AVX1: # %bb.0:
301; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
302; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
303; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
304; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
305; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm4
306; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
307; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
308; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
309; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
310; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero
311; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
312; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
313; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7]
314; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
315; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5
316; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
317; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm5, %xmm5
318; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
319; AVX1-NEXT: vpmulld %xmm5, %xmm0, %xmm0
320; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
321; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
322; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
323; AVX1-NEXT: retq
324;
325; AVX2-LABEL: var_funnnel_v4i32:
326; AVX2: # %bb.0:
327; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
328; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
329; AVX2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
330; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
331; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
332; AVX2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
333; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
334; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
335; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
336; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
337; AVX2-NEXT: retq
338;
339; AVX512F-LABEL: var_funnnel_v4i32:
340; AVX512F: # %bb.0:
341; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
342; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
343; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
344; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
345; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
346; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
347; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
348; AVX512F-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
349; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
350; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
351; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
352; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
353; AVX512F-NEXT: vzeroupper
354; AVX512F-NEXT: retq
355;
356; AVX512VL-LABEL: var_funnnel_v4i32:
357; AVX512VL: # %bb.0:
358; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
359; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
360; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
361; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
362; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
363; AVX512VL-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
364; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
365; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1
366; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
367; AVX512VL-NEXT: retq
368;
369; AVX512BW-LABEL: var_funnnel_v4i32:
370; AVX512BW: # %bb.0:
371; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
372; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
373; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
374; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
375; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
376; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
377; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
378; AVX512BW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
379; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
380; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
381; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
382; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
383; AVX512BW-NEXT: vzeroupper
384; AVX512BW-NEXT: retq
385;
386; AVX512VLBW-LABEL: var_funnnel_v4i32:
387; AVX512VLBW: # %bb.0:
388; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
389; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
390; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
391; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
392; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
393; AVX512VLBW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
394; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
395; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1
396; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
397; AVX512VLBW-NEXT: retq
398;
399; XOPAVX1-LABEL: var_funnnel_v4i32:
400; XOPAVX1: # %bb.0:
401; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
402; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
403; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm4
404; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm4
405; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
406; XOPAVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5
407; XOPAVX1-NEXT: vpshld %xmm5, %xmm0, %xmm0
408; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
409; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
410; XOPAVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
411; XOPAVX1-NEXT: retq
412;
413; XOPAVX2-LABEL: var_funnnel_v4i32:
414; XOPAVX2: # %bb.0:
415; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
416; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
417; XOPAVX2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
418; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
419; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
420; XOPAVX2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
421; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
422; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
423; XOPAVX2-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
424; XOPAVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
425; XOPAVX2-NEXT: retq
426;
427; X32-SSE-LABEL: var_funnnel_v4i32:
428; X32-SSE: # %bb.0:
429; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
430; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
431; X32-SSE-NEXT: movdqa %xmm1, %xmm4
432; X32-SSE-NEXT: psrld %xmm3, %xmm4
433; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,1,1,4,5,6,7]
434; X32-SSE-NEXT: movdqa %xmm1, %xmm3
435; X32-SSE-NEXT: psrld %xmm5, %xmm3
436; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
437; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
438; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
439; X32-SSE-NEXT: movdqa %xmm1, %xmm6
440; X32-SSE-NEXT: psrld %xmm5, %xmm6
441; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
442; X32-SSE-NEXT: movdqa %xmm1, %xmm5
443; X32-SSE-NEXT: psrld %xmm4, %xmm5
444; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
445; X32-SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[0,3]
446; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [32,32,32,32]
447; X32-SSE-NEXT: psubd %xmm2, %xmm4
448; X32-SSE-NEXT: pslld $23, %xmm4
449; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm4
450; X32-SSE-NEXT: cvttps2dq %xmm4, %xmm4
451; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
452; X32-SSE-NEXT: pmuludq %xmm4, %xmm0
453; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
454; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
455; X32-SSE-NEXT: pmuludq %xmm5, %xmm0
456; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
457; X32-SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
458; X32-SSE-NEXT: por %xmm3, %xmm6
459; X32-SSE-NEXT: pxor %xmm0, %xmm0
460; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm0
461; X32-SSE-NEXT: pand %xmm0, %xmm1
462; X32-SSE-NEXT: pandn %xmm6, %xmm0
463; X32-SSE-NEXT: por %xmm1, %xmm0
464; X32-SSE-NEXT: retl
; IR under test: a plain call of the target-independent fshr intrinsic.
465 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
466 ret <4 x i32> %res
467}
468
469define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
470; SSE2-LABEL: var_funnnel_v8i16:
471; SSE2: # %bb.0:
472; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
473; SSE2-NEXT: movdqa %xmm2, %xmm4
474; SSE2-NEXT: psllw $12, %xmm4
475; SSE2-NEXT: movdqa %xmm4, %xmm3
476; SSE2-NEXT: psraw $15, %xmm3
477; SSE2-NEXT: movdqa %xmm1, %xmm5
478; SSE2-NEXT: psrlw $8, %xmm5
479; SSE2-NEXT: pand %xmm3, %xmm5
480; SSE2-NEXT: pandn %xmm1, %xmm3
481; SSE2-NEXT: por %xmm5, %xmm3
482; SSE2-NEXT: paddw %xmm4, %xmm4
483; SSE2-NEXT: movdqa %xmm4, %xmm5
484; SSE2-NEXT: psraw $15, %xmm5
485; SSE2-NEXT: movdqa %xmm5, %xmm6
486; SSE2-NEXT: pandn %xmm3, %xmm6
487; SSE2-NEXT: psrlw $4, %xmm3
488; SSE2-NEXT: pand %xmm5, %xmm3
489; SSE2-NEXT: por %xmm6, %xmm3
490; SSE2-NEXT: paddw %xmm4, %xmm4
491; SSE2-NEXT: movdqa %xmm4, %xmm5
492; SSE2-NEXT: psraw $15, %xmm5
493; SSE2-NEXT: movdqa %xmm5, %xmm6
494; SSE2-NEXT: pandn %xmm3, %xmm6
495; SSE2-NEXT: psrlw $2, %xmm3
496; SSE2-NEXT: pand %xmm5, %xmm3
497; SSE2-NEXT: por %xmm6, %xmm3
498; SSE2-NEXT: paddw %xmm4, %xmm4
499; SSE2-NEXT: psraw $15, %xmm4
500; SSE2-NEXT: movdqa %xmm4, %xmm5
501; SSE2-NEXT: pandn %xmm3, %xmm5
502; SSE2-NEXT: psrlw $1, %xmm3
503; SSE2-NEXT: pand %xmm4, %xmm3
504; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
505; SSE2-NEXT: psubw %xmm2, %xmm4
506; SSE2-NEXT: pxor %xmm8, %xmm8
507; SSE2-NEXT: movdqa %xmm4, %xmm7
508; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
509; SSE2-NEXT: pslld $23, %xmm7
510; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
511; SSE2-NEXT: paddd %xmm6, %xmm7
512; SSE2-NEXT: cvttps2dq %xmm7, %xmm7
513; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7]
514; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
515; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
516; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
517; SSE2-NEXT: pslld $23, %xmm4
518; SSE2-NEXT: paddd %xmm6, %xmm4
519; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
520; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
521; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
522; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
523; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
524; SSE2-NEXT: pmullw %xmm0, %xmm4
525; SSE2-NEXT: por %xmm5, %xmm4
526; SSE2-NEXT: por %xmm3, %xmm4
527; SSE2-NEXT: pcmpeqw %xmm8, %xmm2
528; SSE2-NEXT: pand %xmm2, %xmm1
529; SSE2-NEXT: pandn %xmm4, %xmm2
530; SSE2-NEXT: por %xmm1, %xmm2
531; SSE2-NEXT: movdqa %xmm2, %xmm0
532; SSE2-NEXT: retq
533;
534; SSE41-LABEL: var_funnnel_v8i16:
535; SSE41: # %bb.0:
536; SSE41-NEXT: movdqa %xmm0, %xmm8
537; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
538; SSE41-NEXT: movdqa %xmm2, %xmm0
539; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
540; SSE41-NEXT: psubw %xmm2, %xmm5
541; SSE41-NEXT: pxor %xmm4, %xmm4
542; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
543; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
544; SSE41-NEXT: pcmpeqw %xmm2, %xmm4
545; SSE41-NEXT: psllw $12, %xmm2
546; SSE41-NEXT: psllw $4, %xmm0
547; SSE41-NEXT: por %xmm2, %xmm0
548; SSE41-NEXT: movdqa %xmm0, %xmm2
549; SSE41-NEXT: paddw %xmm0, %xmm2
550; SSE41-NEXT: movdqa %xmm1, %xmm7
551; SSE41-NEXT: psrlw $8, %xmm7
552; SSE41-NEXT: movdqa %xmm1, %xmm3
553; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
554; SSE41-NEXT: movdqa %xmm3, %xmm7
555; SSE41-NEXT: psrlw $4, %xmm7
556; SSE41-NEXT: movdqa %xmm2, %xmm0
557; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
558; SSE41-NEXT: movdqa %xmm3, %xmm7
559; SSE41-NEXT: psrlw $2, %xmm7
560; SSE41-NEXT: paddw %xmm2, %xmm2
561; SSE41-NEXT: movdqa %xmm2, %xmm0
562; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
563; SSE41-NEXT: movdqa %xmm3, %xmm7
564; SSE41-NEXT: psrlw $1, %xmm7
565; SSE41-NEXT: paddw %xmm2, %xmm2
566; SSE41-NEXT: movdqa %xmm2, %xmm0
567; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
568; SSE41-NEXT: pslld $23, %xmm5
569; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1065353216,1065353216,1065353216,1065353216]
570; SSE41-NEXT: paddd %xmm0, %xmm5
571; SSE41-NEXT: cvttps2dq %xmm5, %xmm2
572; SSE41-NEXT: pslld $23, %xmm6
573; SSE41-NEXT: paddd %xmm0, %xmm6
574; SSE41-NEXT: cvttps2dq %xmm6, %xmm0
575; SSE41-NEXT: packusdw %xmm2, %xmm0
576; SSE41-NEXT: pmullw %xmm0, %xmm8
577; SSE41-NEXT: por %xmm3, %xmm8
578; SSE41-NEXT: movdqa %xmm4, %xmm0
579; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm8
580; SSE41-NEXT: movdqa %xmm8, %xmm0
581; SSE41-NEXT: retq
582;
583; AVX1-LABEL: var_funnnel_v8i16:
584; AVX1: # %bb.0:
585; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
586; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
587; AVX1-NEXT: vpsllw $4, %xmm2, %xmm4
588; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
589; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm4
590; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm5
591; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm1, %xmm3
592; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm5
593; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
594; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm5
595; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
596; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
597; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm5
598; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
599; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
600; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
601; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
602; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
603; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
604; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
605; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
606; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
607; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
608; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
609; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
610; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
611; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
612; AVX1-NEXT: vpackusdw %xmm6, %xmm4, %xmm4
613; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
614; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
615; AVX1-NEXT: vpcmpeqw %xmm5, %xmm2, %xmm2
616; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
617; AVX1-NEXT: retq
618;
619; AVX2-LABEL: var_funnnel_v8i16:
620; AVX2: # %bb.0:
621; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
622; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
623; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
624; AVX2-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
625; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
626; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
627; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
628; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
629; AVX2-NEXT: vpsubw %xmm2, %xmm5, %xmm5
630; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
631; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
632; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm0
633; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
634; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
635; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
636; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
637; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
638; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
639; AVX2-NEXT: vzeroupper
640; AVX2-NEXT: retq
641;
642; AVX512F-LABEL: var_funnnel_v8i16:
643; AVX512F: # %bb.0:
644; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
645; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
646; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
647; AVX512F-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
648; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
649; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
650; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
651; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
652; AVX512F-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
653; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
654; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
655; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
656; AVX512F-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
657; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
658; AVX512F-NEXT: vzeroupper
659; AVX512F-NEXT: retq
660;
661; AVX512VL-LABEL: var_funnnel_v8i16:
662; AVX512VL: # %bb.0:
663; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
664; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
665; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
666; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
667; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
668; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
669; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
670; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
671; AVX512VL-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
672; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
673; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
674; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
675; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
676; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
677; AVX512VL-NEXT: vzeroupper
678; AVX512VL-NEXT: retq
679;
680; AVX512BW-LABEL: var_funnnel_v8i16:
681; AVX512BW: # %bb.0:
682; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
683; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
684; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
685; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
686; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
687; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
688; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
689; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
690; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
691; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
692; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
693; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
694; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
695; AVX512BW-NEXT: vzeroupper
696; AVX512BW-NEXT: retq
697;
698; AVX512VLBW-LABEL: var_funnnel_v8i16:
699; AVX512VLBW: # %bb.0:
700; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
701; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
702; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm5
703; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
704; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
705; AVX512VLBW-NEXT: vpsllvw %xmm4, %xmm0, %xmm0
706; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
707; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1
708; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
709; AVX512VLBW-NEXT: retq
710;
711; XOP-LABEL: var_funnnel_v8i16:
712; XOP: # %bb.0:
713; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
714; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
715; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm4
716; XOP-NEXT: vpshlw %xmm4, %xmm1, %xmm4
717; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
718; XOP-NEXT: vpsubw %xmm2, %xmm5, %xmm5
719; XOP-NEXT: vpshlw %xmm5, %xmm0, %xmm0
720; XOP-NEXT: vpor %xmm4, %xmm0, %xmm0
721; XOP-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
722; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
723; XOP-NEXT: retq
724;
725; X32-SSE-LABEL: var_funnnel_v8i16:
726; X32-SSE: # %bb.0:
727; X32-SSE-NEXT: subl $28, %esp
728; X32-SSE-NEXT: movups %xmm0, (%esp) # 16-byte Spill
729; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
730; X32-SSE-NEXT: movdqa %xmm2, %xmm4
731; X32-SSE-NEXT: psllw $12, %xmm4
732; X32-SSE-NEXT: movdqa %xmm4, %xmm3
733; X32-SSE-NEXT: psraw $15, %xmm3
734; X32-SSE-NEXT: movdqa %xmm1, %xmm5
735; X32-SSE-NEXT: psrlw $8, %xmm5
736; X32-SSE-NEXT: pand %xmm3, %xmm5
737; X32-SSE-NEXT: pandn %xmm1, %xmm3
738; X32-SSE-NEXT: por %xmm5, %xmm3
739; X32-SSE-NEXT: paddw %xmm4, %xmm4
740; X32-SSE-NEXT: movdqa %xmm4, %xmm5
741; X32-SSE-NEXT: psraw $15, %xmm5
742; X32-SSE-NEXT: movdqa %xmm5, %xmm6
743; X32-SSE-NEXT: pandn %xmm3, %xmm6
744; X32-SSE-NEXT: psrlw $4, %xmm3
745; X32-SSE-NEXT: pand %xmm5, %xmm3
746; X32-SSE-NEXT: por %xmm6, %xmm3
747; X32-SSE-NEXT: paddw %xmm4, %xmm4
748; X32-SSE-NEXT: movdqa %xmm4, %xmm5
749; X32-SSE-NEXT: psraw $15, %xmm5
750; X32-SSE-NEXT: movdqa %xmm5, %xmm6
751; X32-SSE-NEXT: pandn %xmm3, %xmm6
752; X32-SSE-NEXT: psrlw $2, %xmm3
753; X32-SSE-NEXT: pand %xmm5, %xmm3
754; X32-SSE-NEXT: por %xmm6, %xmm3
755; X32-SSE-NEXT: paddw %xmm4, %xmm4
756; X32-SSE-NEXT: psraw $15, %xmm4
757; X32-SSE-NEXT: movdqa %xmm4, %xmm5
758; X32-SSE-NEXT: pandn %xmm3, %xmm5
759; X32-SSE-NEXT: psrlw $1, %xmm3
760; X32-SSE-NEXT: pand %xmm4, %xmm3
761; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
762; X32-SSE-NEXT: psubw %xmm2, %xmm4
763; X32-SSE-NEXT: pxor %xmm6, %xmm6
764; X32-SSE-NEXT: movdqa %xmm4, %xmm7
765; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
766; X32-SSE-NEXT: pslld $23, %xmm7
767; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [1065353216,1065353216,1065353216,1065353216]
768; X32-SSE-NEXT: paddd %xmm0, %xmm7
769; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
770; X32-SSE-NEXT: pslld $23, %xmm4
771; X32-SSE-NEXT: paddd %xmm0, %xmm4
772; X32-SSE-NEXT: cvttps2dq %xmm7, %xmm0
773; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
774; X32-SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
775; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
776; X32-SSE-NEXT: cvttps2dq %xmm4, %xmm4
777; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
778; X32-SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
779; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
780; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
781; X32-SSE-NEXT: movdqu (%esp), %xmm0 # 16-byte Reload
782; X32-SSE-NEXT: pmullw %xmm0, %xmm4
783; X32-SSE-NEXT: por %xmm5, %xmm4
784; X32-SSE-NEXT: por %xmm3, %xmm4
785; X32-SSE-NEXT: pcmpeqw %xmm6, %xmm2
786; X32-SSE-NEXT: pand %xmm2, %xmm1
787; X32-SSE-NEXT: pandn %xmm4, %xmm2
788; X32-SSE-NEXT: por %xmm1, %xmm2
789; X32-SSE-NEXT: movdqa %xmm2, %xmm0
790; X32-SSE-NEXT: addl $28, %esp
791; X32-SSE-NEXT: retl
792 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
793 ret <8 x i16> %res
794}
795
796define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
; Exercises lowering of @llvm.fshr.v16i8 -- a per-byte funnel shift right:
; each lane yields the low 8 bits of (concat(x, y) >> (amt mod 8)).
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py (see the note at the top of the file);
; regenerate them with that script instead of editing by hand.
797; SSE2-LABEL: var_funnnel_v16i8:
798; SSE2: # %bb.0:
799; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
800; SSE2-NEXT: movdqa %xmm2, %xmm5
801; SSE2-NEXT: psllw $5, %xmm5
802; SSE2-NEXT: pxor %xmm3, %xmm3
803; SSE2-NEXT: pxor %xmm6, %xmm6
804; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
805; SSE2-NEXT: movdqa %xmm1, %xmm4
806; SSE2-NEXT: psrlw $4, %xmm4
807; SSE2-NEXT: pand %xmm6, %xmm4
808; SSE2-NEXT: pandn %xmm1, %xmm6
809; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
810; SSE2-NEXT: por %xmm6, %xmm4
811; SSE2-NEXT: paddb %xmm5, %xmm5
812; SSE2-NEXT: pxor %xmm6, %xmm6
813; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
814; SSE2-NEXT: movdqa %xmm6, %xmm7
815; SSE2-NEXT: pandn %xmm4, %xmm7
816; SSE2-NEXT: psrlw $2, %xmm4
817; SSE2-NEXT: pand %xmm6, %xmm4
818; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
819; SSE2-NEXT: por %xmm7, %xmm4
820; SSE2-NEXT: paddb %xmm5, %xmm5
821; SSE2-NEXT: pxor %xmm6, %xmm6
822; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
823; SSE2-NEXT: movdqa %xmm6, %xmm5
824; SSE2-NEXT: pandn %xmm4, %xmm5
825; SSE2-NEXT: psrlw $1, %xmm4
826; SSE2-NEXT: pand %xmm6, %xmm4
827; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
828; SSE2-NEXT: por %xmm5, %xmm4
829; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
830; SSE2-NEXT: psubb %xmm2, %xmm5
831; SSE2-NEXT: psllw $5, %xmm5
832; SSE2-NEXT: pxor %xmm6, %xmm6
833; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
834; SSE2-NEXT: movdqa %xmm6, %xmm7
835; SSE2-NEXT: pandn %xmm0, %xmm7
836; SSE2-NEXT: psllw $4, %xmm0
837; SSE2-NEXT: pand %xmm6, %xmm0
838; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
839; SSE2-NEXT: por %xmm7, %xmm0
840; SSE2-NEXT: paddb %xmm5, %xmm5
841; SSE2-NEXT: pxor %xmm6, %xmm6
842; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
843; SSE2-NEXT: movdqa %xmm6, %xmm7
844; SSE2-NEXT: pandn %xmm0, %xmm7
845; SSE2-NEXT: psllw $2, %xmm0
846; SSE2-NEXT: pand %xmm6, %xmm0
847; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
848; SSE2-NEXT: por %xmm7, %xmm0
849; SSE2-NEXT: paddb %xmm5, %xmm5
850; SSE2-NEXT: pcmpeqb %xmm3, %xmm2
851; SSE2-NEXT: pcmpgtb %xmm5, %xmm3
852; SSE2-NEXT: movdqa %xmm3, %xmm5
853; SSE2-NEXT: pandn %xmm0, %xmm5
854; SSE2-NEXT: por %xmm4, %xmm5
855; SSE2-NEXT: paddb %xmm0, %xmm0
856; SSE2-NEXT: pand %xmm3, %xmm0
857; SSE2-NEXT: por %xmm5, %xmm0
858; SSE2-NEXT: pand %xmm2, %xmm1
859; SSE2-NEXT: pandn %xmm0, %xmm2
860; SSE2-NEXT: por %xmm1, %xmm2
861; SSE2-NEXT: movdqa %xmm2, %xmm0
862; SSE2-NEXT: retq
863;
864; SSE41-LABEL: var_funnnel_v16i8:
865; SSE41: # %bb.0:
866; SSE41-NEXT: movdqa %xmm0, %xmm3
867; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
868; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
869; SSE41-NEXT: psubb %xmm2, %xmm4
870; SSE41-NEXT: pxor %xmm5, %xmm5
871; SSE41-NEXT: pcmpeqb %xmm2, %xmm5
872; SSE41-NEXT: movdqa %xmm2, %xmm0
873; SSE41-NEXT: psllw $5, %xmm0
874; SSE41-NEXT: movdqa %xmm1, %xmm2
875; SSE41-NEXT: psrlw $4, %xmm2
876; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
877; SSE41-NEXT: movdqa %xmm1, %xmm6
878; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
879; SSE41-NEXT: movdqa %xmm6, %xmm2
880; SSE41-NEXT: psrlw $2, %xmm2
881; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
882; SSE41-NEXT: paddb %xmm0, %xmm0
883; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
884; SSE41-NEXT: movdqa %xmm6, %xmm2
885; SSE41-NEXT: psrlw $1, %xmm2
886; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
887; SSE41-NEXT: paddb %xmm0, %xmm0
888; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
889; SSE41-NEXT: psllw $5, %xmm4
890; SSE41-NEXT: movdqa %xmm4, %xmm2
891; SSE41-NEXT: paddb %xmm4, %xmm2
892; SSE41-NEXT: movdqa %xmm3, %xmm7
893; SSE41-NEXT: psllw $4, %xmm7
894; SSE41-NEXT: pand {{.*}}(%rip), %xmm7
895; SSE41-NEXT: movdqa %xmm4, %xmm0
896; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
897; SSE41-NEXT: movdqa %xmm3, %xmm4
898; SSE41-NEXT: psllw $2, %xmm4
899; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
900; SSE41-NEXT: movdqa %xmm2, %xmm0
901; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
902; SSE41-NEXT: movdqa %xmm3, %xmm4
903; SSE41-NEXT: paddb %xmm3, %xmm4
904; SSE41-NEXT: paddb %xmm2, %xmm2
905; SSE41-NEXT: movdqa %xmm2, %xmm0
906; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
907; SSE41-NEXT: por %xmm6, %xmm3
908; SSE41-NEXT: movdqa %xmm5, %xmm0
909; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
910; SSE41-NEXT: movdqa %xmm3, %xmm0
911; SSE41-NEXT: retq
912;
913; AVX-LABEL: var_funnnel_v16i8:
914; AVX: # %bb.0:
915; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
916; AVX-NEXT: vpsllw $5, %xmm2, %xmm3
917; AVX-NEXT: vpsrlw $4, %xmm1, %xmm4
918; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
919; AVX-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm4
920; AVX-NEXT: vpsrlw $2, %xmm4, %xmm5
921; AVX-NEXT: vpand {{.*}}(%rip), %xmm5, %xmm5
922; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
923; AVX-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
924; AVX-NEXT: vpsrlw $1, %xmm4, %xmm5
925; AVX-NEXT: vpand {{.*}}(%rip), %xmm5, %xmm5
926; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
927; AVX-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
928; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
929; AVX-NEXT: vpsubb %xmm2, %xmm4, %xmm4
930; AVX-NEXT: vpsllw $5, %xmm4, %xmm4
931; AVX-NEXT: vpaddb %xmm4, %xmm4, %xmm5
932; AVX-NEXT: vpsllw $4, %xmm0, %xmm6
933; AVX-NEXT: vpand {{.*}}(%rip), %xmm6, %xmm6
934; AVX-NEXT: vpblendvb %xmm4, %xmm6, %xmm0, %xmm0
935; AVX-NEXT: vpsllw $2, %xmm0, %xmm4
936; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
937; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
938; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm4
939; AVX-NEXT: vpaddb %xmm5, %xmm5, %xmm5
940; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
941; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
942; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
943; AVX-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
944; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
945; AVX-NEXT: retq
946;
947; AVX512F-LABEL: var_funnnel_v16i8:
948; AVX512F: # %bb.0:
949; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
950; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
951; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
952; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
953; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
954; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
955; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
956; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
957; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
958; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
959; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
960; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
961; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
962; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
963; AVX512F-NEXT: vzeroupper
964; AVX512F-NEXT: retq
965;
966; AVX512VL-LABEL: var_funnnel_v16i8:
967; AVX512VL: # %bb.0:
968; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
969; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
970; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
971; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
972; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
973; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
974; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
975; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
976; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
977; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
978; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
979; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
980; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
981; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
982; AVX512VL-NEXT: vzeroupper
983; AVX512VL-NEXT: retq
984;
985; AVX512BW-LABEL: var_funnnel_v16i8:
986; AVX512BW: # %bb.0:
987; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
988; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
989; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
990; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
991; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5
992; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
993; AVX512BW-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
994; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
995; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
996; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5
997; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
998; AVX512BW-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
999; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
1000; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1001; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1
1002; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
1003; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1004; AVX512BW-NEXT: vzeroupper
1005; AVX512BW-NEXT: retq
1006;
1007; AVX512VLBW-LABEL: var_funnnel_v16i8:
1008; AVX512VLBW: # %bb.0:
1009; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1010; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1011; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
1012; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1013; AVX512VLBW-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
1014; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1015; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
1016; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
1017; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1018; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
1019; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
1020; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
1021; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1
1022; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
1023; AVX512VLBW-NEXT: vzeroupper
1024; AVX512VLBW-NEXT: retq
1025;
1026; XOP-LABEL: var_funnnel_v16i8:
1027; XOP: # %bb.0:
1028; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1029; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
1030; XOP-NEXT: vpsubb %xmm2, %xmm3, %xmm4
1031; XOP-NEXT: vpshlb %xmm4, %xmm1, %xmm4
1032; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1033; XOP-NEXT: vpsubb %xmm2, %xmm5, %xmm5
1034; XOP-NEXT: vpshlb %xmm5, %xmm0, %xmm0
1035; XOP-NEXT: vpor %xmm4, %xmm0, %xmm0
1036; XOP-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
1037; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1038; XOP-NEXT: retq
1039;
1040; X32-SSE-LABEL: var_funnnel_v16i8:
1041; X32-SSE: # %bb.0:
1042; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
1043; X32-SSE-NEXT: movdqa %xmm2, %xmm5
1044; X32-SSE-NEXT: psllw $5, %xmm5
1045; X32-SSE-NEXT: pxor %xmm3, %xmm3
1046; X32-SSE-NEXT: pxor %xmm6, %xmm6
1047; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
1048; X32-SSE-NEXT: movdqa %xmm1, %xmm4
1049; X32-SSE-NEXT: psrlw $4, %xmm4
1050; X32-SSE-NEXT: pand %xmm6, %xmm4
1051; X32-SSE-NEXT: pandn %xmm1, %xmm6
1052; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
1053; X32-SSE-NEXT: por %xmm6, %xmm4
1054; X32-SSE-NEXT: paddb %xmm5, %xmm5
1055; X32-SSE-NEXT: pxor %xmm6, %xmm6
1056; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
1057; X32-SSE-NEXT: movdqa %xmm6, %xmm7
1058; X32-SSE-NEXT: pandn %xmm4, %xmm7
1059; X32-SSE-NEXT: psrlw $2, %xmm4
1060; X32-SSE-NEXT: pand %xmm6, %xmm4
1061; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
1062; X32-SSE-NEXT: por %xmm7, %xmm4
1063; X32-SSE-NEXT: paddb %xmm5, %xmm5
1064; X32-SSE-NEXT: pxor %xmm6, %xmm6
1065; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
1066; X32-SSE-NEXT: movdqa %xmm6, %xmm5
1067; X32-SSE-NEXT: pandn %xmm4, %xmm5
1068; X32-SSE-NEXT: psrlw $1, %xmm4
1069; X32-SSE-NEXT: pand %xmm6, %xmm4
1070; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
1071; X32-SSE-NEXT: por %xmm5, %xmm4
1072; X32-SSE-NEXT: movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1073; X32-SSE-NEXT: psubb %xmm2, %xmm5
1074; X32-SSE-NEXT: psllw $5, %xmm5
1075; X32-SSE-NEXT: pxor %xmm6, %xmm6
1076; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
1077; X32-SSE-NEXT: movdqa %xmm6, %xmm7
1078; X32-SSE-NEXT: pandn %xmm0, %xmm7
1079; X32-SSE-NEXT: psllw $4, %xmm0
1080; X32-SSE-NEXT: pand %xmm6, %xmm0
1081; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
1082; X32-SSE-NEXT: por %xmm7, %xmm0
1083; X32-SSE-NEXT: paddb %xmm5, %xmm5
1084; X32-SSE-NEXT: pxor %xmm6, %xmm6
1085; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
1086; X32-SSE-NEXT: movdqa %xmm6, %xmm7
1087; X32-SSE-NEXT: pandn %xmm0, %xmm7
1088; X32-SSE-NEXT: psllw $2, %xmm0
1089; X32-SSE-NEXT: pand %xmm6, %xmm0
1090; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
1091; X32-SSE-NEXT: por %xmm7, %xmm0
1092; X32-SSE-NEXT: paddb %xmm5, %xmm5
1093; X32-SSE-NEXT: pcmpeqb %xmm3, %xmm2
1094; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm3
1095; X32-SSE-NEXT: movdqa %xmm3, %xmm5
1096; X32-SSE-NEXT: pandn %xmm0, %xmm5
1097; X32-SSE-NEXT: por %xmm4, %xmm5
1098; X32-SSE-NEXT: paddb %xmm0, %xmm0
1099; X32-SSE-NEXT: pand %xmm3, %xmm0
1100; X32-SSE-NEXT: por %xmm5, %xmm0
1101; X32-SSE-NEXT: pand %xmm2, %xmm1
1102; X32-SSE-NEXT: pandn %xmm0, %xmm2
1103; X32-SSE-NEXT: por %xmm1, %xmm2
1104; X32-SSE-NEXT: movdqa %xmm2, %xmm0
1105; X32-SSE-NEXT: retl
1106 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
1107 ret <16 x i8> %res
1108}
1109
1110;
1111; Uniform Variable Shifts
1112;
1113
1114define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
; Same intrinsic as var_funnnel_v2i64, but the shift amount is first splatted
; from element 0 (see the shufflevector with a zeroinitializer mask below),
; so the uniform-count shift forms (psrlq/psllq with an xmm count) apply.
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them, do not edit by hand.
1115; SSE2-LABEL: splatvar_funnnel_v2i64:
1116; SSE2: # %bb.0:
1117; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1118; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
1119; SSE2-NEXT: movdqa %xmm1, %xmm3
1120; SSE2-NEXT: psrlq %xmm2, %xmm3
1121; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [64,64]
1122; SSE2-NEXT: psubq %xmm2, %xmm4
1123; SSE2-NEXT: psllq %xmm4, %xmm0
1124; SSE2-NEXT: por %xmm3, %xmm0
1125; SSE2-NEXT: pxor %xmm3, %xmm3
1126; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
1127; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
1128; SSE2-NEXT: pand %xmm3, %xmm2
1129; SSE2-NEXT: pand %xmm2, %xmm1
1130; SSE2-NEXT: pandn %xmm0, %xmm2
1131; SSE2-NEXT: por %xmm1, %xmm2
1132; SSE2-NEXT: movdqa %xmm2, %xmm0
1133; SSE2-NEXT: retq
1134;
1135; SSE41-LABEL: splatvar_funnnel_v2i64:
1136; SSE41: # %bb.0:
1137; SSE41-NEXT: movdqa %xmm0, %xmm3
1138; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1139; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
1140; SSE41-NEXT: movdqa %xmm1, %xmm0
1141; SSE41-NEXT: psrlq %xmm2, %xmm0
1142; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [64,64]
1143; SSE41-NEXT: psubq %xmm2, %xmm4
1144; SSE41-NEXT: psllq %xmm4, %xmm3
1145; SSE41-NEXT: por %xmm0, %xmm3
1146; SSE41-NEXT: pxor %xmm0, %xmm0
1147; SSE41-NEXT: pcmpeqq %xmm2, %xmm0
1148; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
1149; SSE41-NEXT: movapd %xmm3, %xmm0
1150; SSE41-NEXT: retq
1151;
1152; AVX1-LABEL: splatvar_funnnel_v2i64:
1153; AVX1: # %bb.0:
1154; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1155; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1156; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1157; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1158; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1159; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1160; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1161; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1162; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
1163; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1164; AVX1-NEXT: retq
1165;
1166; AVX2-LABEL: splatvar_funnnel_v2i64:
1167; AVX2: # %bb.0:
1168; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2
1169; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1170; AVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1171; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1172; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1173; AVX2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1174; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1175; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1176; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
1177; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1178; AVX2-NEXT: retq
1179;
1180; AVX512F-LABEL: splatvar_funnnel_v2i64:
1181; AVX512F: # %bb.0:
1182; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1183; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
1184; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
1185; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
1186; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
1187; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
1188; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
1189; AVX512F-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1190; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
1191; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
1192; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
1193; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1194; AVX512F-NEXT: vzeroupper
1195; AVX512F-NEXT: retq
1196;
1197; AVX512VL-LABEL: splatvar_funnnel_v2i64:
1198; AVX512VL: # %bb.0:
1199; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2
1200; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
1201; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
1202; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
1203; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
1204; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
1205; AVX512VL-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1206; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
1207; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1
1208; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
1209; AVX512VL-NEXT: retq
1210;
1211; AVX512BW-LABEL: splatvar_funnnel_v2i64:
1212; AVX512BW: # %bb.0:
1213; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1214; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2
1215; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
1216; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1217; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
1218; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
1219; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
1220; AVX512BW-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1221; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
1222; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
1223; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
1224; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1225; AVX512BW-NEXT: vzeroupper
1226; AVX512BW-NEXT: retq
1227;
1228; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
1229; AVX512VLBW: # %bb.0:
1230; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2
1231; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
1232; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1233; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
1234; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
1235; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
1236; AVX512VLBW-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1237; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
1238; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1
1239; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
1240; AVX512VLBW-NEXT: retq
1241;
1242; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
1243; XOPAVX1: # %bb.0:
1244; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1245; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1246; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1247; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1248; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1249; XOPAVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1250; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1251; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1252; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
1253; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1254; XOPAVX1-NEXT: retq
1255;
1256; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
1257; XOPAVX2: # %bb.0:
1258; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm2
1259; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1260; XOPAVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1261; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1262; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1263; XOPAVX2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1264; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1265; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1266; XOPAVX2-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
1267; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1268; XOPAVX2-NEXT: retq
1269;
1270; X32-SSE-LABEL: splatvar_funnnel_v2i64:
1271; X32-SSE: # %bb.0:
1272; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1273; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
1274; X32-SSE-NEXT: movdqa %xmm1, %xmm3
1275; X32-SSE-NEXT: psrlq %xmm2, %xmm3
1276; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
1277; X32-SSE-NEXT: movdqa %xmm1, %xmm5
1278; X32-SSE-NEXT: psrlq %xmm4, %xmm5
1279; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
1280; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0]
1281; X32-SSE-NEXT: psubq %xmm2, %xmm3
1282; X32-SSE-NEXT: movdqa %xmm0, %xmm4
1283; X32-SSE-NEXT: psllq %xmm3, %xmm4
1284; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
1285; X32-SSE-NEXT: psllq %xmm3, %xmm0
1286; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1287; X32-SSE-NEXT: orpd %xmm5, %xmm0
1288; X32-SSE-NEXT: pxor %xmm3, %xmm3
1289; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm3
1290; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
1291; X32-SSE-NEXT: pand %xmm3, %xmm2
1292; X32-SSE-NEXT: pand %xmm2, %xmm1
1293; X32-SSE-NEXT: pandn %xmm0, %xmm2
1294; X32-SSE-NEXT: por %xmm1, %xmm2
1295; X32-SSE-NEXT: movdqa %xmm2, %xmm0
1296; X32-SSE-NEXT: retl
1297 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
1298 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
1299 ret <2 x i64> %res
1300}
1301
1302define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
1303; SSE2-LABEL: splatvar_funnnel_v4i32:
1304; SSE2: # %bb.0:
1305; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1306; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
1307; SSE2-NEXT: pxor %xmm3, %xmm3
1308; SSE2-NEXT: xorps %xmm4, %xmm4
1309; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
1310; SSE2-NEXT: movdqa %xmm1, %xmm5
1311; SSE2-NEXT: psrld %xmm4, %xmm5
1312; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32,32,32,32]
1313; SSE2-NEXT: psubd %xmm2, %xmm4
1314; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
1315; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
1316; SSE2-NEXT: pslld %xmm3, %xmm0
1317; SSE2-NEXT: por %xmm5, %xmm0
1318; SSE2-NEXT: pand %xmm2, %xmm1
1319; SSE2-NEXT: pandn %xmm0, %xmm2
1320; SSE2-NEXT: por %xmm1, %xmm2
1321; SSE2-NEXT: movdqa %xmm2, %xmm0
1322; SSE2-NEXT: retq
1323;
1324; SSE41-LABEL: splatvar_funnnel_v4i32:
1325; SSE41: # %bb.0:
1326; SSE41-NEXT: movdqa %xmm0, %xmm3
1327; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1328; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
1329; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
1330; SSE41-NEXT: movdqa %xmm1, %xmm4
1331; SSE41-NEXT: psrld %xmm0, %xmm4
1332; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
1333; SSE41-NEXT: psubd %xmm2, %xmm0
1334; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1335; SSE41-NEXT: pslld %xmm0, %xmm3
1336; SSE41-NEXT: por %xmm4, %xmm3
1337; SSE41-NEXT: pxor %xmm0, %xmm0
1338; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
1339; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
1340; SSE41-NEXT: movaps %xmm3, %xmm0
1341; SSE41-NEXT: retq
1342;
1343; AVX1-LABEL: splatvar_funnnel_v4i32:
1344; AVX1: # %bb.0:
1345; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1346; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1347; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1348; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1349; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
1350; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1351; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1352; AVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0
1353; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1354; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1355; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
1356; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1357; AVX1-NEXT: retq
1358;
1359; AVX2-LABEL: splatvar_funnnel_v4i32:
1360; AVX2: # %bb.0:
1361; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2
1362; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1363; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1364; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1365; AVX2-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1366; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
1367; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1368; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1369; AVX2-NEXT: vpslld %xmm4, %xmm0, %xmm0
1370; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1371; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1372; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
1373; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1374; AVX2-NEXT: retq
1375;
1376; AVX512F-LABEL: splatvar_funnnel_v4i32:
1377; AVX512F: # %bb.0:
1378; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1379; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2
1380; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1381; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
1382; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
1383; AVX512F-NEXT: vpsrld %xmm5, %xmm1, %xmm5
1384; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
1385; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
1386; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1387; AVX512F-NEXT: vpslld %xmm4, %xmm0, %xmm0
1388; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
1389; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
1390; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1391; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1392; AVX512F-NEXT: vzeroupper
1393; AVX512F-NEXT: retq
1394;
1395; AVX512VL-LABEL: splatvar_funnnel_v4i32:
1396; AVX512VL: # %bb.0:
1397; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2
1398; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1399; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
1400; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
1401; AVX512VL-NEXT: vpsrld %xmm5, %xmm1, %xmm5
1402; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
1403; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
1404; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1405; AVX512VL-NEXT: vpslld %xmm4, %xmm0, %xmm0
1406; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
1407; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1
1408; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
1409; AVX512VL-NEXT: retq
1410;
1411; AVX512BW-LABEL: splatvar_funnnel_v4i32:
1412; AVX512BW: # %bb.0:
1413; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1414; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2
1415; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1416; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1417; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
1418; AVX512BW-NEXT: vpsrld %xmm5, %xmm1, %xmm5
1419; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
1420; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
1421; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1422; AVX512BW-NEXT: vpslld %xmm4, %xmm0, %xmm0
1423; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
1424; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
1425; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1426; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1427; AVX512BW-NEXT: vzeroupper
1428; AVX512BW-NEXT: retq
1429;
1430; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
1431; AVX512VLBW: # %bb.0:
1432; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2
1433; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1434; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1435; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
1436; AVX512VLBW-NEXT: vpsrld %xmm5, %xmm1, %xmm5
1437; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
1438; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
1439; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1440; AVX512VLBW-NEXT: vpslld %xmm4, %xmm0, %xmm0
1441; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
1442; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1
1443; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
1444; AVX512VLBW-NEXT: retq
1445;
1446; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
1447; XOPAVX1: # %bb.0:
1448; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1449; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1450; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1451; XOPAVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1452; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
1453; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1454; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1455; XOPAVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0
1456; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1457; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1458; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
1459; XOPAVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1460; XOPAVX1-NEXT: retq
1461;
1462; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
1463; XOPAVX2: # %bb.0:
1464; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm2
1465; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1466; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1467; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1468; XOPAVX2-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1469; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
1470; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1471; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1472; XOPAVX2-NEXT: vpslld %xmm4, %xmm0, %xmm0
1473; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1474; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1475; XOPAVX2-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
1476; XOPAVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1477; XOPAVX2-NEXT: retq
1478;
1479; X32-SSE-LABEL: splatvar_funnnel_v4i32:
1480; X32-SSE: # %bb.0:
1481; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1482; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
1483; X32-SSE-NEXT: pxor %xmm3, %xmm3
1484; X32-SSE-NEXT: xorps %xmm4, %xmm4
1485; X32-SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
1486; X32-SSE-NEXT: movdqa %xmm1, %xmm5
1487; X32-SSE-NEXT: psrld %xmm4, %xmm5
1488; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [32,32,32,32]
1489; X32-SSE-NEXT: psubd %xmm2, %xmm4
1490; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm2
1491; X32-SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
1492; X32-SSE-NEXT: pslld %xmm3, %xmm0
1493; X32-SSE-NEXT: por %xmm5, %xmm0
1494; X32-SSE-NEXT: pand %xmm2, %xmm1
1495; X32-SSE-NEXT: pandn %xmm0, %xmm2
1496; X32-SSE-NEXT: por %xmm1, %xmm2
1497; X32-SSE-NEXT: movdqa %xmm2, %xmm0
1498; X32-SSE-NEXT: retl
1499 %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
1500 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat)
1501 ret <4 x i32> %res
1502}
1503
1504define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
1505; SSE2-LABEL: splatvar_funnnel_v8i16:
1506; SSE2: # %bb.0:
1507; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1508; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
1509; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1510; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1511; SSE2-NEXT: psubw %xmm3, %xmm4
1512; SSE2-NEXT: pxor %xmm2, %xmm2
1513; SSE2-NEXT: pcmpeqw %xmm3, %xmm2
1514; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
1515; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1516; SSE2-NEXT: movdqa %xmm1, %xmm5
1517; SSE2-NEXT: psrlw %xmm3, %xmm5
1518; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
1519; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1520; SSE2-NEXT: psllw %xmm4, %xmm0
1521; SSE2-NEXT: por %xmm5, %xmm0
1522; SSE2-NEXT: pand %xmm2, %xmm1
1523; SSE2-NEXT: pandn %xmm0, %xmm2
1524; SSE2-NEXT: por %xmm1, %xmm2
1525; SSE2-NEXT: movdqa %xmm2, %xmm0
1526; SSE2-NEXT: retq
1527;
1528; SSE41-LABEL: splatvar_funnnel_v8i16:
1529; SSE41: # %bb.0:
1530; SSE41-NEXT: movdqa %xmm0, %xmm3
1531; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7]
1532; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
1533; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
1534; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1535; SSE41-NEXT: movdqa %xmm1, %xmm4
1536; SSE41-NEXT: psrlw %xmm0, %xmm4
1537; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16]
1538; SSE41-NEXT: psubw %xmm2, %xmm0
1539; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1540; SSE41-NEXT: psllw %xmm0, %xmm3
1541; SSE41-NEXT: por %xmm4, %xmm3
1542; SSE41-NEXT: pxor %xmm0, %xmm0
1543; SSE41-NEXT: pcmpeqw %xmm2, %xmm0
1544; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
1545; SSE41-NEXT: movdqa %xmm3, %xmm0
1546; SSE41-NEXT: retq
1547;
1548; AVX1-LABEL: splatvar_funnnel_v8i16:
1549; AVX1: # %bb.0:
1550; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1551; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1552; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1553; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1554; AVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1555; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1556; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1557; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1558; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1559; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1560; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1561; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
1562; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1563; AVX1-NEXT: retq
1564;
1565; AVX2-LABEL: splatvar_funnnel_v8i16:
1566; AVX2: # %bb.0:
1567; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2
1568; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1569; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1570; AVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1571; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1572; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1573; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1574; AVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1575; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1576; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1577; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
1578; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1579; AVX2-NEXT: retq
1580;
1581; AVX512F-LABEL: splatvar_funnnel_v8i16:
1582; AVX512F: # %bb.0:
1583; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
1584; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1585; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1586; AVX512F-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1587; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1588; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1589; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1590; AVX512F-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1591; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
1592; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
1593; AVX512F-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
1594; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1595; AVX512F-NEXT: retq
1596;
1597; AVX512VL-LABEL: splatvar_funnnel_v8i16:
1598; AVX512VL: # %bb.0:
1599; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2
1600; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1601; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1602; AVX512VL-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1603; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1604; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1605; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1606; AVX512VL-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1607; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
1608; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
1609; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
1610; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1611; AVX512VL-NEXT: retq
1612;
1613; AVX512BW-LABEL: splatvar_funnnel_v8i16:
1614; AVX512BW: # %bb.0:
1615; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1616; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2
1617; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1618; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1619; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1620; AVX512BW-NEXT: vpsrlw %xmm5, %xmm1, %xmm5
1621; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
1622; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
1623; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1624; AVX512BW-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1625; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
1626; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
1627; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
1628; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1629; AVX512BW-NEXT: vzeroupper
1630; AVX512BW-NEXT: retq
1631;
1632; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
1633; AVX512VLBW: # %bb.0:
1634; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2
1635; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1636; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1637; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1638; AVX512VLBW-NEXT: vpsrlw %xmm5, %xmm1, %xmm5
1639; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
1640; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
1641; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1642; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1643; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
1644; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1
1645; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
1646; AVX512VLBW-NEXT: retq
1647;
1648; XOPAVX1-LABEL: splatvar_funnnel_v8i16:
1649; XOPAVX1: # %bb.0:
1650; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1651; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1652; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1653; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1654; XOPAVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1655; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1656; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1657; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1658; XOPAVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1659; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1660; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1661; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
1662; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1663; XOPAVX1-NEXT: retq
1664;
1665; XOPAVX2-LABEL: splatvar_funnnel_v8i16:
1666; XOPAVX2: # %bb.0:
1667; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2
1668; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1669; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1670; XOPAVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1671; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1672; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1673; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1674; XOPAVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1675; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1676; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1677; XOPAVX2-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
1678; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1679; XOPAVX2-NEXT: retq
1680;
1681; X32-SSE-LABEL: splatvar_funnnel_v8i16:
1682; X32-SSE: # %bb.0:
1683; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1684; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
1685; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3
1686; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1687; X32-SSE-NEXT: psubw %xmm3, %xmm4
1688; X32-SSE-NEXT: pxor %xmm2, %xmm2
1689; X32-SSE-NEXT: pcmpeqw %xmm3, %xmm2
1690; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
1691; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1692; X32-SSE-NEXT: movdqa %xmm1, %xmm5
1693; X32-SSE-NEXT: psrlw %xmm3, %xmm5
1694; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
1695; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1696; X32-SSE-NEXT: psllw %xmm4, %xmm0
1697; X32-SSE-NEXT: por %xmm5, %xmm0
1698; X32-SSE-NEXT: pand %xmm2, %xmm1
1699; X32-SSE-NEXT: pandn %xmm0, %xmm2
1700; X32-SSE-NEXT: por %xmm1, %xmm2
1701; X32-SSE-NEXT: movdqa %xmm2, %xmm0
1702; X32-SSE-NEXT: retl
1703 %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
1704 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat)
1705 ret <8 x i16> %res
1706}
1707
; Funnel-shift-right of the pair (%x, %y) by a uniform per-element amount:
; lane 0 of %amt is splatted to all 16 lanes (see the shufflevector with a
; zeroinitializer mask below) before calling llvm.fshr.v16i8.
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate with that script rather than
; editing them by hand.
define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
; SSE2-LABEL: splatvar_funnnel_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE2-NEXT: psubb %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpeqb %xmm3, %xmm2
; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrlw %xmm3, %xmm5
; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
; SSE2-NEXT: psrlw %xmm3, %xmm6
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: psrlw $8, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: psllw %xmm4, %xmm0
; SSE2-NEXT: psllw %xmm4, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm3, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_funnnel_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm2
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrlw %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
; SSE41-NEXT: pcmpeqd %xmm7, %xmm7
; SSE41-NEXT: psrlw %xmm4, %xmm7
; SSE41-NEXT: pshufb {{.*#+}} xmm7 = xmm7[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE41-NEXT: pand %xmm5, %xmm7
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE41-NEXT: psubb %xmm2, %xmm4
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: psllw %xmm4, %xmm3
; SSE41-NEXT: psllw %xmm4, %xmm6
; SSE41-NEXT: pshufb %xmm0, %xmm6
; SSE41-NEXT: pand %xmm6, %xmm3
; SSE41-NEXT: por %xmm7, %xmm3
; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_funnnel_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
; AVX1-NEXT: vpsrlw %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsllw %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsllw %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_funnnel_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm4
; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
; AVX2-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX2-NEXT: vpbroadcastb %xmm3, %xmm3
; AVX2-NEXT: vpand %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
; AVX2-NEXT: vpbroadcastb %xmm4, %xmm4
; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512BW-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm4
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; XOPAVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5
; XOPAVX1-NEXT: vpshlb %xmm5, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm4
; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm4
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; XOPAVX2-NEXT: vpsubb %xmm2, %xmm5, %xmm5
; XOPAVX2-NEXT: vpshlb %xmm5, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: splatvar_funnnel_v16i8:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3
; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; X32-SSE-NEXT: psubb %xmm3, %xmm4
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: pcmpeqb %xmm3, %xmm2
; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrlw %xmm3, %xmm5
; X32-SSE-NEXT: pcmpeqd %xmm6, %xmm6
; X32-SSE-NEXT: psrlw %xmm3, %xmm6
; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm3
; X32-SSE-NEXT: psrlw $8, %xmm6
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
; X32-SSE-NEXT: pand %xmm5, %xmm6
; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT: psllw %xmm4, %xmm0
; X32-SSE-NEXT: psllw %xmm4, %xmm3
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
; X32-SSE-NEXT: pand %xmm0, %xmm3
; X32-SSE-NEXT: por %xmm6, %xmm3
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm3, %xmm2
; X32-SSE-NEXT: por %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm0
; X32-SSE-NEXT: retl
 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer ; broadcast lane 0 of %amt to all lanes
 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
 ret <16 x i8> %res
}
1972
1973;
1974; Constant Shifts
1975;
1976
1977define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
1978; SSE2-LABEL: constant_funnnel_v2i64:
1979; SSE2: # %bb.0:
1980; SSE2-NEXT: movdqa %xmm1, %xmm2
1981; SSE2-NEXT: psrlq $4, %xmm2
1982; SSE2-NEXT: psrlq $14, %xmm1
1983; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1984; SSE2-NEXT: movdqa %xmm0, %xmm2
1985; SSE2-NEXT: psllq $60, %xmm2
1986; SSE2-NEXT: psllq $50, %xmm0
1987; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
1988; SSE2-NEXT: orpd %xmm1, %xmm0
1989; SSE2-NEXT: retq
1990;
1991; SSE41-LABEL: constant_funnnel_v2i64:
1992; SSE41: # %bb.0:
1993; SSE41-NEXT: movdqa %xmm1, %xmm2
1994; SSE41-NEXT: psrlq $14, %xmm2
1995; SSE41-NEXT: psrlq $4, %xmm1
1996; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1997; SSE41-NEXT: movdqa %xmm0, %xmm2
1998; SSE41-NEXT: psllq $50, %xmm2
1999; SSE41-NEXT: psllq $60, %xmm0
2000; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
2001; SSE41-NEXT: por %xmm1, %xmm0
2002; SSE41-NEXT: retq
2003;
2004; AVX1-LABEL: constant_funnnel_v2i64:
2005; AVX1: # %bb.0:
2006; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm2
2007; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm1
2008; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
2009; AVX1-NEXT: vpsllq $50, %xmm0, %xmm2
2010; AVX1-NEXT: vpsllq $60, %xmm0, %xmm0
2011; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
2012; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2013; AVX1-NEXT: retq
2014;
2015; AVX2-LABEL: constant_funnnel_v2i64:
2016; AVX2: # %bb.0:
2017; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2018; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2019; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2020; AVX2-NEXT: retq
2021;
2022; AVX512-LABEL: constant_funnnel_v2i64:
2023; AVX512: # %bb.0:
2024; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2025; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2026; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
2027; AVX512-NEXT: retq
2028;
2029; XOPAVX1-LABEL: constant_funnnel_v2i64:
2030; XOPAVX1: # %bb.0:
2031; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm1, %xmm1
2032; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
2033; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2034; XOPAVX1-NEXT: retq
2035;
2036; XOPAVX2-LABEL: constant_funnnel_v2i64:
2037; XOPAVX2: # %bb.0:
2038; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2039; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2040; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2041; XOPAVX2-NEXT: retq
2042;
2043; X32-SSE-LABEL: constant_funnnel_v2i64:
2044; X32-SSE: # %bb.0:
2045; X32-SSE-NEXT: movdqa %xmm1, %xmm2
2046; X32-SSE-NEXT: psrlq $4, %xmm2
2047; X32-SSE-NEXT: psrlq $14, %xmm1
2048; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2049; X32-SSE-NEXT: movdqa %xmm0, %xmm2
2050; X32-SSE-NEXT: psllq $60, %xmm2
2051; X32-SSE-NEXT: psllq $50, %xmm0
2052; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
2053; X32-SSE-NEXT: orpd %xmm1, %xmm0
2054; X32-SSE-NEXT: retl
2055 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>)
2056 ret <2 x i64> %res
2057}
2058
; fshr of <4 x i32> with per-lane constant amounts <4, 5, 6, 7>.
; On SSE2/SSE41 the left shifts of x by (32 - amt) become a multiply by
; powers of two ([2^28, 2^27, 2^26, 2^25] = [268435456, ...]) via
; pmuludq/pmulld, while the right shifts of y are four fixed psrld's blended.
2059define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
2060; SSE2-LABEL: constant_funnnel_v4i32:
2061; SSE2: # %bb.0:
2062; SSE2-NEXT: movdqa %xmm1, %xmm2
2063; SSE2-NEXT: psrld $7, %xmm2
2064; SSE2-NEXT: movdqa %xmm1, %xmm3
2065; SSE2-NEXT: psrld $6, %xmm3
2066; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
2067; SSE2-NEXT: movdqa %xmm1, %xmm2
2068; SSE2-NEXT: psrld $5, %xmm2
2069; SSE2-NEXT: psrld $4, %xmm1
2070; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2071; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
2072; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [268435456,134217728,67108864,33554432]
2073; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2074; SSE2-NEXT: pmuludq %xmm2, %xmm0
2075; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2076; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2077; SSE2-NEXT: pmuludq %xmm3, %xmm2
2078; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2079; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2080; SSE2-NEXT: por %xmm1, %xmm0
2081; SSE2-NEXT: retq
2082;
2083; SSE41-LABEL: constant_funnnel_v4i32:
2084; SSE41: # %bb.0:
2085; SSE41-NEXT: movdqa %xmm1, %xmm2
2086; SSE41-NEXT: psrld $7, %xmm2
2087; SSE41-NEXT: movdqa %xmm1, %xmm3
2088; SSE41-NEXT: psrld $5, %xmm3
2089; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2090; SSE41-NEXT: movdqa %xmm1, %xmm2
2091; SSE41-NEXT: psrld $6, %xmm2
2092; SSE41-NEXT: psrld $4, %xmm1
2093; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
2094; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
2095; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
2096; SSE41-NEXT: por %xmm1, %xmm0
2097; SSE41-NEXT: retq
2098;
2099; AVX1-LABEL: constant_funnnel_v4i32:
2100; AVX1: # %bb.0:
2101; AVX1-NEXT: vpsrld $7, %xmm1, %xmm2
2102; AVX1-NEXT: vpsrld $5, %xmm1, %xmm3
2103; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2104; AVX1-NEXT: vpsrld $6, %xmm1, %xmm3
2105; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
2106; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
2107; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
2108; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2109; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2110; AVX1-NEXT: retq
2111;
2112; AVX2-LABEL: constant_funnnel_v4i32:
2113; AVX2: # %bb.0:
2114; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2115; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2116; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2117; AVX2-NEXT: retq
2118;
2119; AVX512-LABEL: constant_funnnel_v4i32:
2120; AVX512: # %bb.0:
2121; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2122; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2123; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
2124; AVX512-NEXT: retq
2125;
2126; XOPAVX1-LABEL: constant_funnnel_v4i32:
2127; XOPAVX1: # %bb.0:
2128; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm1, %xmm1
2129; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
2130; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2131; XOPAVX1-NEXT: retq
2132;
2133; XOPAVX2-LABEL: constant_funnnel_v4i32:
2134; XOPAVX2: # %bb.0:
2135; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2136; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2137; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2138; XOPAVX2-NEXT: retq
2139;
2140; X32-SSE-LABEL: constant_funnnel_v4i32:
2141; X32-SSE: # %bb.0:
2142; X32-SSE-NEXT: movdqa %xmm1, %xmm2
2143; X32-SSE-NEXT: psrld $7, %xmm2
2144; X32-SSE-NEXT: movdqa %xmm1, %xmm3
2145; X32-SSE-NEXT: psrld $6, %xmm3
2146; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
2147; X32-SSE-NEXT: movdqa %xmm1, %xmm2
2148; X32-SSE-NEXT: psrld $5, %xmm2
2149; X32-SSE-NEXT: psrld $4, %xmm1
2150; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2151; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
2152; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [268435456,134217728,67108864,33554432]
2153; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2154; X32-SSE-NEXT: pmuludq %xmm2, %xmm0
2155; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2156; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2157; X32-SSE-NEXT: pmuludq %xmm3, %xmm2
2158; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2159; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2160; X32-SSE-NEXT: por %xmm1, %xmm0
2161; X32-SSE-NEXT: retl
2162 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
2163 ret <4 x i32> %res
2164}
2165
; fshr of <8 x i16> with per-lane constant amounts <0..7>. Amount 0 means
; lane 0 must pass through y unchanged: SSE2 does this with a
; [0,65535,...] pand/pandn merge, SSE41/AVX with a pblendw of element 0.
; The shifts themselves use pmulhuw (logical right) / pmullw (left) by
; power-of-two multipliers <u,32768,16384,...>.
2166define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
2167; SSE2-LABEL: constant_funnnel_v8i16:
2168; SSE2: # %bb.0:
2169; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
2170; SSE2-NEXT: movdqa %xmm2, %xmm3
2171; SSE2-NEXT: pandn %xmm1, %xmm3
2172; SSE2-NEXT: movdqa {{.*#+}} xmm4 = <u,32768,16384,8192,4096,2048,1024,512>
2173; SSE2-NEXT: pmulhuw %xmm4, %xmm1
2174; SSE2-NEXT: pand %xmm2, %xmm1
2175; SSE2-NEXT: pmullw %xmm4, %xmm0
2176; SSE2-NEXT: por %xmm3, %xmm0
2177; SSE2-NEXT: por %xmm1, %xmm0
2178; SSE2-NEXT: pand %xmm2, %xmm0
2179; SSE2-NEXT: por %xmm3, %xmm0
2180; SSE2-NEXT: retq
2181;
2182; SSE41-LABEL: constant_funnnel_v8i16:
2183; SSE41: # %bb.0:
2184; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2185; SSE41-NEXT: movdqa %xmm1, %xmm3
2186; SSE41-NEXT: pmulhuw %xmm2, %xmm3
2187; SSE41-NEXT: pmullw %xmm2, %xmm0
2188; SSE41-NEXT: por %xmm3, %xmm0
2189; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2190; SSE41-NEXT: retq
2191;
2192; AVX-LABEL: constant_funnnel_v8i16:
2193; AVX: # %bb.0:
2194; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2195; AVX-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
2196; AVX-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2197; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
2198; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2199; AVX-NEXT: retq
2200;
2201; AVX512F-LABEL: constant_funnnel_v8i16:
2202; AVX512F: # %bb.0:
2203; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2204; AVX512F-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
2205; AVX512F-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2206; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
2207; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2208; AVX512F-NEXT: retq
2209;
2210; AVX512VL-LABEL: constant_funnnel_v8i16:
2211; AVX512VL: # %bb.0:
2212; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2213; AVX512VL-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
2214; AVX512VL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2215; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
2216; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2217; AVX512VL-NEXT: retq
2218;
2219; AVX512BW-LABEL: constant_funnnel_v8i16:
2220; AVX512BW: # %bb.0:
2221; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2222; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2223; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
2224; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm2
2225; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,15,14,13,12,11,10,9]
2226; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
2227; AVX512BW-NEXT: vpor %xmm2, %xmm0, %xmm0
2228; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2229; AVX512BW-NEXT: vzeroupper
2230; AVX512BW-NEXT: retq
2231;
2232; AVX512VLBW-LABEL: constant_funnnel_v8i16:
2233; AVX512VLBW: # %bb.0:
2234; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %xmm1, %xmm2
2235; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
2236; AVX512VLBW-NEXT: vpor %xmm2, %xmm0, %xmm0
2237; AVX512VLBW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2238; AVX512VLBW-NEXT: retq
2239;
2240; XOP-LABEL: constant_funnnel_v8i16:
2241; XOP: # %bb.0:
2242; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm2
2243; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
2244; XOP-NEXT: vpor %xmm2, %xmm0, %xmm0
2245; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2246; XOP-NEXT: retq
2247;
2248; X32-SSE-LABEL: constant_funnnel_v8i16:
2249; X32-SSE: # %bb.0:
2250; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
2251; X32-SSE-NEXT: movdqa %xmm2, %xmm3
2252; X32-SSE-NEXT: pandn %xmm1, %xmm3
2253; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = <u,32768,16384,8192,4096,2048,1024,512>
2254; X32-SSE-NEXT: pmulhuw %xmm4, %xmm1
2255; X32-SSE-NEXT: pand %xmm2, %xmm1
2256; X32-SSE-NEXT: pmullw %xmm4, %xmm0
2257; X32-SSE-NEXT: por %xmm3, %xmm0
2258; X32-SSE-NEXT: por %xmm1, %xmm0
2259; X32-SSE-NEXT: pand %xmm2, %xmm0
2260; X32-SSE-NEXT: por %xmm3, %xmm0
2261; X32-SSE-NEXT: retl
2262 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
2263 ret <8 x i16> %res
2264}
2265
; fshr of <16 x i8> with mixed constant amounts <0,1,...,7,8,7,...,1>.
; i8 shifts are widened to i16 (unpack low/high halves), multiplied by
; power-of-two constants, then repacked with packuswb. Lanes with amount 0
; (elements 0 and 8) must select y unchanged, hence the final merge with
; the [0,255,255,...,0,255,...] byte mask (pand/pandn or pblendvb).
2266define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
2267; SSE2-LABEL: constant_funnnel_v16i8:
2268; SSE2: # %bb.0:
2269; SSE2-NEXT: pxor %xmm2, %xmm2
2270; SSE2-NEXT: movdqa %xmm1, %xmm3
2271; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2272; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3
2273; SSE2-NEXT: psrlw $8, %xmm3
2274; SSE2-NEXT: movdqa %xmm1, %xmm4
2275; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
2276; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm4
2277; SSE2-NEXT: psrlw $8, %xmm4
2278; SSE2-NEXT: packuswb %xmm3, %xmm4
2279; SSE2-NEXT: movdqa %xmm0, %xmm2
2280; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2281; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
2282; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2283; SSE2-NEXT: pand %xmm3, %xmm2
2284; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2285; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
2286; SSE2-NEXT: pand %xmm3, %xmm0
2287; SSE2-NEXT: packuswb %xmm2, %xmm0
2288; SSE2-NEXT: por %xmm4, %xmm0
2289; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2290; SSE2-NEXT: pand %xmm2, %xmm0
2291; SSE2-NEXT: pandn %xmm1, %xmm2
2292; SSE2-NEXT: por %xmm2, %xmm0
2293; SSE2-NEXT: retq
2294;
2295; SSE41-LABEL: constant_funnnel_v16i8:
2296; SSE41: # %bb.0:
2297; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2298; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2299; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
2300; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2301; SSE41-NEXT: pand %xmm3, %xmm0
2302; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
2303; SSE41-NEXT: pand %xmm3, %xmm2
2304; SSE41-NEXT: packuswb %xmm0, %xmm2
2305; SSE41-NEXT: pxor %xmm0, %xmm0
2306; SSE41-NEXT: movdqa %xmm1, %xmm3
2307; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2308; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm3
2309; SSE41-NEXT: psrlw $8, %xmm3
2310; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2311; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm4
2312; SSE41-NEXT: psrlw $8, %xmm4
2313; SSE41-NEXT: packuswb %xmm3, %xmm4
2314; SSE41-NEXT: por %xmm2, %xmm4
2315; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2316; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
2317; SSE41-NEXT: movdqa %xmm1, %xmm0
2318; SSE41-NEXT: retq
2319;
2320; AVX1-LABEL: constant_funnnel_v16i8:
2321; AVX1: # %bb.0:
2322; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2323; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
2324; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2325; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2326; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2327; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2328; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2329; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2330; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
2331; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2332; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
2333; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2334; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2335; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3
2336; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
2337; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
2338; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
2339; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2340; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2341; AVX1-NEXT: retq
2342;
2343; AVX2-LABEL: constant_funnnel_v16i8:
2344; AVX2: # %bb.0:
2345; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2346; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
2347; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
2348; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2349; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
2350; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2351; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
2352; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
2353; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
2354; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
2355; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0
2356; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2357; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2358; AVX2-NEXT: vzeroupper
2359; AVX2-NEXT: retq
2360;
2361; AVX512F-LABEL: constant_funnnel_v16i8:
2362; AVX512F: # %bb.0:
2363; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2364; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
2365; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2366; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
2367; AVX512F-NEXT: vpord %zmm2, %zmm0, %zmm0
2368; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2369; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2370; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2371; AVX512F-NEXT: vzeroupper
2372; AVX512F-NEXT: retq
2373;
2374; AVX512VL-LABEL: constant_funnnel_v16i8:
2375; AVX512VL: # %bb.0:
2376; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2377; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
2378; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2379; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
2380; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0
2381; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
2382; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2383; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2384; AVX512VL-NEXT: vzeroupper
2385; AVX512VL-NEXT: retq
2386;
2387; AVX512BW-LABEL: constant_funnnel_v16i8:
2388; AVX512BW: # %bb.0:
2389; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2390; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2391; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm3, %zmm2
2392; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [8,7,6,5,4,3,2,1,8,1,2,3,4,5,6,7]
2393; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2394; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
2395; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0
2396; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2397; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2398; AVX512BW-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2399; AVX512BW-NEXT: vzeroupper
2400; AVX512BW-NEXT: retq
2401;
2402; AVX512VLBW-LABEL: constant_funnnel_v16i8:
2403; AVX512VLBW: # %bb.0:
2404; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2405; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
2406; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2407; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
2408; AVX512VLBW-NEXT: vpor %ymm2, %ymm0, %ymm0
2409; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
2410; AVX512VLBW-NEXT: movw $257, %ax # imm = 0x101
2411; AVX512VLBW-NEXT: kmovd %eax, %k1
2412; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
2413; AVX512VLBW-NEXT: vzeroupper
2414; AVX512VLBW-NEXT: retq
2415;
2416; XOP-LABEL: constant_funnnel_v16i8:
2417; XOP: # %bb.0:
2418; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm2
2419; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
2420; XOP-NEXT: vpor %xmm2, %xmm0, %xmm0
2421; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2422; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2423; XOP-NEXT: retq
2424;
2425; X32-SSE-LABEL: constant_funnnel_v16i8:
2426; X32-SSE: # %bb.0:
2427; X32-SSE-NEXT: pxor %xmm2, %xmm2
2428; X32-SSE-NEXT: movdqa %xmm1, %xmm3
2429; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2430; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm3
2431; X32-SSE-NEXT: psrlw $8, %xmm3
2432; X32-SSE-NEXT: movdqa %xmm1, %xmm4
2433; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
2434; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm4
2435; X32-SSE-NEXT: psrlw $8, %xmm4
2436; X32-SSE-NEXT: packuswb %xmm3, %xmm4
2437; X32-SSE-NEXT: movdqa %xmm0, %xmm2
2438; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2439; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm2
2440; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2441; X32-SSE-NEXT: pand %xmm3, %xmm2
2442; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2443; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
2444; X32-SSE-NEXT: pand %xmm3, %xmm0
2445; X32-SSE-NEXT: packuswb %xmm2, %xmm0
2446; X32-SSE-NEXT: por %xmm4, %xmm0
2447; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2448; X32-SSE-NEXT: pand %xmm2, %xmm0
2449; X32-SSE-NEXT: pandn %xmm1, %xmm2
2450; X32-SSE-NEXT: por %xmm2, %xmm0
2451; X32-SSE-NEXT: retl
2452 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
2453 ret <16 x i8> %res
2454}
2455
2456;
2457; Uniform Constant Shifts
2458;
2459
; Uniform (splat) constant fshr of <2 x i64> by 14: lowers to a single
; psrlq $14 of y, psllq $50 (= 64 - 14) of x, and an or — no blending needed.
2460define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
2461; SSE-LABEL: splatconstant_funnnel_v2i64:
2462; SSE: # %bb.0:
2463; SSE-NEXT: psrlq $14, %xmm1
2464; SSE-NEXT: psllq $50, %xmm0
2465; SSE-NEXT: por %xmm1, %xmm0
2466; SSE-NEXT: retq
2467;
2468; AVX-LABEL: splatconstant_funnnel_v2i64:
2469; AVX: # %bb.0:
2470; AVX-NEXT: vpsrlq $14, %xmm1, %xmm1
2471; AVX-NEXT: vpsllq $50, %xmm0, %xmm0
2472; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2473; AVX-NEXT: retq
2474;
2475; AVX512-LABEL: splatconstant_funnnel_v2i64:
2476; AVX512: # %bb.0:
2477; AVX512-NEXT: vpsrlq $14, %xmm1, %xmm1
2478; AVX512-NEXT: vpsllq $50, %xmm0, %xmm0
2479; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
2480; AVX512-NEXT: retq
2481;
2482; XOP-LABEL: splatconstant_funnnel_v2i64:
2483; XOP: # %bb.0:
2484; XOP-NEXT: vpsrlq $14, %xmm1, %xmm1
2485; XOP-NEXT: vpsllq $50, %xmm0, %xmm0
2486; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2487; XOP-NEXT: retq
2488;
2489; X32-SSE-LABEL: splatconstant_funnnel_v2i64:
2490; X32-SSE: # %bb.0:
2491; X32-SSE-NEXT: psrlq $14, %xmm1
2492; X32-SSE-NEXT: psllq $50, %xmm0
2493; X32-SSE-NEXT: por %xmm1, %xmm0
2494; X32-SSE-NEXT: retl
2495 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
2496 ret <2 x i64> %res
2497}
2498
; Uniform constant fshr of <4 x i32> by 4: psrld $4 of y, pslld $28
; (= 32 - 4) of x, then por — identical three-instruction shape everywhere.
2499define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
2500; SSE-LABEL: splatconstant_funnnel_v4i32:
2501; SSE: # %bb.0:
2502; SSE-NEXT: psrld $4, %xmm1
2503; SSE-NEXT: pslld $28, %xmm0
2504; SSE-NEXT: por %xmm1, %xmm0
2505; SSE-NEXT: retq
2506;
2507; AVX-LABEL: splatconstant_funnnel_v4i32:
2508; AVX: # %bb.0:
2509; AVX-NEXT: vpsrld $4, %xmm1, %xmm1
2510; AVX-NEXT: vpslld $28, %xmm0, %xmm0
2511; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2512; AVX-NEXT: retq
2513;
2514; AVX512-LABEL: splatconstant_funnnel_v4i32:
2515; AVX512: # %bb.0:
2516; AVX512-NEXT: vpsrld $4, %xmm1, %xmm1
2517; AVX512-NEXT: vpslld $28, %xmm0, %xmm0
2518; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
2519; AVX512-NEXT: retq
2520;
2521; XOP-LABEL: splatconstant_funnnel_v4i32:
2522; XOP: # %bb.0:
2523; XOP-NEXT: vpsrld $4, %xmm1, %xmm1
2524; XOP-NEXT: vpslld $28, %xmm0, %xmm0
2525; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2526; XOP-NEXT: retq
2527;
2528; X32-SSE-LABEL: splatconstant_funnnel_v4i32:
2529; X32-SSE: # %bb.0:
2530; X32-SSE-NEXT: psrld $4, %xmm1
2531; X32-SSE-NEXT: pslld $28, %xmm0
2532; X32-SSE-NEXT: por %xmm1, %xmm0
2533; X32-SSE-NEXT: retl
2534 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
2535 ret <4 x i32> %res
2536}
2537
; Uniform constant fshr of <8 x i16> by 7: psrlw $7 of y, psllw $9
; (= 16 - 7) of x, then por.
2538define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
2539; SSE-LABEL: splatconstant_funnnel_v8i16:
2540; SSE: # %bb.0:
2541; SSE-NEXT: psrlw $7, %xmm1
2542; SSE-NEXT: psllw $9, %xmm0
2543; SSE-NEXT: por %xmm1, %xmm0
2544; SSE-NEXT: retq
2545;
2546; AVX-LABEL: splatconstant_funnnel_v8i16:
2547; AVX: # %bb.0:
2548; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1
2549; AVX-NEXT: vpsllw $9, %xmm0, %xmm0
2550; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2551; AVX-NEXT: retq
2552;
2553; AVX512-LABEL: splatconstant_funnnel_v8i16:
2554; AVX512: # %bb.0:
2555; AVX512-NEXT: vpsrlw $7, %xmm1, %xmm1
2556; AVX512-NEXT: vpsllw $9, %xmm0, %xmm0
2557; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
2558; AVX512-NEXT: retq
2559;
2560; XOP-LABEL: splatconstant_funnnel_v8i16:
2561; XOP: # %bb.0:
2562; XOP-NEXT: vpsrlw $7, %xmm1, %xmm1
2563; XOP-NEXT: vpsllw $9, %xmm0, %xmm0
2564; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2565; XOP-NEXT: retq
2566;
2567; X32-SSE-LABEL: splatconstant_funnnel_v8i16:
2568; X32-SSE: # %bb.0:
2569; X32-SSE-NEXT: psrlw $7, %xmm1
2570; X32-SSE-NEXT: psllw $9, %xmm0
2571; X32-SSE-NEXT: por %xmm1, %xmm0
2572; X32-SSE-NEXT: retl
2573 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
2574 ret <8 x i16> %res
2575}
2576
; Uniform constant fshr of <16 x i8> by 4: x86 has no i8 shifts, so the
; lowering uses word shifts (psrlw $4 / psllw $4) plus pand masks to clear
; the bits that crossed byte boundaries, then ors the halves together.
; XOP uses its native per-byte vpshlb with constant shift vectors instead.
2577define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
2578; SSE-LABEL: splatconstant_funnnel_v16i8:
2579; SSE: # %bb.0:
2580; SSE-NEXT: psrlw $4, %xmm1
2581; SSE-NEXT: pand {{.*}}(%rip), %xmm1
2582; SSE-NEXT: psllw $4, %xmm0
2583; SSE-NEXT: pand {{.*}}(%rip), %xmm0
2584; SSE-NEXT: por %xmm1, %xmm0
2585; SSE-NEXT: retq
2586;
2587; AVX-LABEL: splatconstant_funnnel_v16i8:
2588; AVX: # %bb.0:
2589; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1
2590; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
2591; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
2592; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
2593; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2594; AVX-NEXT: retq
2595;
2596; AVX512-LABEL: splatconstant_funnnel_v16i8:
2597; AVX512: # %bb.0:
2598; AVX512-NEXT: vpsrlw $4, %xmm1, %xmm1
2599; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
2600; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0
2601; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
2602; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
2603; AVX512-NEXT: retq
2604;
2605; XOP-LABEL: splatconstant_funnnel_v16i8:
2606; XOP: # %bb.0:
2607; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm1
2608; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
2609; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2610; XOP-NEXT: retq
2611;
2612; X32-SSE-LABEL: splatconstant_funnnel_v16i8:
2613; X32-SSE: # %bb.0:
2614; X32-SSE-NEXT: psrlw $4, %xmm1
2615; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
2616; X32-SSE-NEXT: psllw $4, %xmm0
2617; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
2618; X32-SSE-NEXT: por %xmm1, %xmm0
2619; X32-SSE-NEXT: retl
2620 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
2621 ret <16 x i8> %res
2622}