; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2

; Just one 32-bit run to make sure we do reasonable things for i64 cases.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2

declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
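; Semantics note (per the LLVM LangRef): fshr concatenates its first two
; operands and shifts the double-width value right by the shift amount
; modulo the element bit width, returning the low half; for example,
; fshr(i8 0x12, i8 0x34, 4) == 0x23. A shift amount of 0 returns %y
; unchanged, which is why the non-VBMI2 lowerings below end by selecting
; %y wherever the masked amount compares equal to zero.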
;
; Variable Shifts
;

define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrlq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrlq %xmm4, %xmm5
; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [64,64]
; SSE2-NEXT: psubq %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psllq %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE2-NEXT: psllq %xmm3, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT: orpd %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psrlq %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrlq %xmm4, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm0[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [64,64]
; SSE41-NEXT: psubq %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: psllq %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: psllq %xmm0, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT: por %xmm5, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpeqq %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v2i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512F-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512VL-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v2i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512BW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v2i64:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VBMI2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512VBMI2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v2i64:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshrdvq %xmm2, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: var_funnnel_v2i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm4
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [64,64]
; XOPAVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm5
; XOPAVX1-NEXT: vpshlq %xmm5, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v2i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; XOPAVX2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v2i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: psrlq %xmm2, %xmm3
; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrlq %xmm4, %xmm5
; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0]
; X32-SSE-NEXT: psubq %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psllq %xmm3, %xmm4
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; X32-SSE-NEXT: psllq %xmm3, %xmm0
; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; X32-SSE-NEXT: orpd %xmm5, %xmm0
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm3
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
; X32-SSE-NEXT: pand %xmm3, %xmm2
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: por %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm0
; X32-SSE-NEXT: retl
  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
  ret <2 x i64> %res
}

define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrld %xmm5, %xmm3
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: psrld %xmm5, %xmm6
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrld %xmm4, %xmm5
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[0,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32,32,32,32]
; SSE2-NEXT: psubd %xmm2, %xmm4
; SSE2-NEXT: pslld $23, %xmm4
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm4
; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm5, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
; SSE2-NEXT: por %xmm3, %xmm6
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pandn %xmm6, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psrld %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,3,3,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm6
; SSE41-NEXT: psrld %xmm5, %xmm6
; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: psrld %xmm4, %xmm5
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psrld %xmm0, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
; SSE41-NEXT: psubd %xmm2, %xmm0
; SSE41-NEXT: pslld $23, %xmm0
; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
; SSE41-NEXT: pmulld %xmm0, %xmm3
; SSE41-NEXT: por %xmm4, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm5, %xmm5
; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
; AVX1-NEXT: vpmulld %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v4i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512F-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512VL-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v4i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512BW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v4i32:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VBMI2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512VBMI2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v4i32:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: var_funnnel_v4i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm4
; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm4
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
; XOPAVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm5
; XOPAVX1-NEXT: vpshld %xmm5, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_funnnel_v4i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; XOPAVX2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v4i32:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm1, %xmm4
; X32-SSE-NEXT: psrld %xmm3, %xmm4
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm1, %xmm3
; X32-SSE-NEXT: psrld %xmm5, %xmm3
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm1, %xmm6
; X32-SSE-NEXT: psrld %xmm5, %xmm6
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrld %xmm4, %xmm5
; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
; X32-SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[0,3]
; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [32,32,32,32]
; X32-SSE-NEXT: psubd %xmm2, %xmm4
; X32-SSE-NEXT: pslld $23, %xmm4
; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: cvttps2dq %xmm4, %xmm4
; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm4, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm5, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
; X32-SSE-NEXT: por %xmm3, %xmm6
; X32-SSE-NEXT: pxor %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm0
; X32-SSE-NEXT: pand %xmm0, %xmm1
; X32-SSE-NEXT: pandn %xmm6, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
  ret <4 x i32> %res
}

define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psllw $12, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: psraw $15, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrlw $8, %xmm5
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm5, %xmm3
; SSE2-NEXT: paddw %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: psraw $15, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pandn %xmm3, %xmm6
; SSE2-NEXT: psrlw $4, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: paddw %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: psraw $15, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pandn %xmm3, %xmm6
; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: por %xmm6, %xmm3
; SSE2-NEXT: paddw %xmm4, %xmm4
; SSE2-NEXT: psraw $15, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pandn %xmm3, %xmm5
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; SSE2-NEXT: psubw %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: movdqa %xmm4, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; SSE2-NEXT: pslld $23, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
; SSE2-NEXT: paddd %xmm6, %xmm7
; SSE2-NEXT: cvttps2dq %xmm7, %xmm7
; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
; SSE2-NEXT: pslld $23, %xmm4
; SSE2-NEXT: paddd %xmm6, %xmm4
; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
; SSE2-NEXT: pmullw %xmm0, %xmm4
; SSE2-NEXT: por %xmm5, %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: pcmpeqw %xmm8, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm4, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; SSE41-NEXT: psubw %xmm2, %xmm5
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; SSE41-NEXT: pcmpeqw %xmm2, %xmm4
; SSE41-NEXT: psllw $12, %xmm2
; SSE41-NEXT: psllw $4, %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: paddw %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm7
; SSE41-NEXT: psrlw $8, %xmm7
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm7
; SSE41-NEXT: psrlw $4, %xmm7
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm7
; SSE41-NEXT: psrlw $2, %xmm7
; SSE41-NEXT: paddw %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm7
; SSE41-NEXT: psrlw $1, %xmm7
; SSE41-NEXT: paddw %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
; SSE41-NEXT: pslld $23, %xmm5
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1065353216,1065353216,1065353216,1065353216]
; SSE41-NEXT: paddd %xmm0, %xmm5
; SSE41-NEXT: cvttps2dq %xmm5, %xmm2
; SSE41-NEXT: pslld $23, %xmm6
; SSE41-NEXT: paddd %xmm0, %xmm6
; SSE41-NEXT: cvttps2dq %xmm6, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: pmullw %xmm0, %xmm8
; SSE41-NEXT: por %xmm3, %xmm8
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm8
; SSE41-NEXT: movdqa %xmm8, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_funnnel_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm4
; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm5
; AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm1, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm5
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm5
; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm5
; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT: vpackusdw %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_funnnel_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpsubw %xmm2, %xmm5, %xmm5
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: var_funnnel_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512F-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512F-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VL-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v8i16:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm5
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
; AVX512VLBW-NEXT: vpsllvw %xmm4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v8i16:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpshrdvw %xmm2, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOP-LABEL: var_funnnel_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm4
; XOP-NEXT: vpshlw %xmm4, %xmm1, %xmm4
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; XOP-NEXT: vpsubw %xmm2, %xmm5, %xmm5
; XOP-NEXT: vpshlw %xmm5, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm4, %xmm0, %xmm0
; XOP-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_funnnel_v8i16:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: subl $28, %esp
; X32-SSE-NEXT: movups %xmm0, (%esp) # 16-byte Spill
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm4
; X32-SSE-NEXT: psllw $12, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm3
; X32-SSE-NEXT: psraw $15, %xmm3
; X32-SSE-NEXT: movdqa %xmm1, %xmm5
; X32-SSE-NEXT: psrlw $8, %xmm5
; X32-SSE-NEXT: pand %xmm3, %xmm5
; X32-SSE-NEXT: pandn %xmm1, %xmm3
; X32-SSE-NEXT: por %xmm5, %xmm3
; X32-SSE-NEXT: paddw %xmm4, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: psraw $15, %xmm5
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm3, %xmm6
; X32-SSE-NEXT: psrlw $4, %xmm3
; X32-SSE-NEXT: pand %xmm5, %xmm3
; X32-SSE-NEXT: por %xmm6, %xmm3
; X32-SSE-NEXT: paddw %xmm4, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: psraw $15, %xmm5
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm3, %xmm6
; X32-SSE-NEXT: psrlw $2, %xmm3
; X32-SSE-NEXT: pand %xmm5, %xmm3
; X32-SSE-NEXT: por %xmm6, %xmm3
; X32-SSE-NEXT: paddw %xmm4, %xmm4
; X32-SSE-NEXT: psraw $15, %xmm4
; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: pandn %xmm3, %xmm5
; X32-SSE-NEXT: psrlw $1, %xmm3
; X32-SSE-NEXT: pand %xmm4, %xmm3
; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
; X32-SSE-NEXT: psubw %xmm2, %xmm4
; X32-SSE-NEXT: pxor %xmm6, %xmm6
; X32-SSE-NEXT: movdqa %xmm4, %xmm7
; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; X32-SSE-NEXT: pslld $23, %xmm7
; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [1065353216,1065353216,1065353216,1065353216]
; X32-SSE-NEXT: paddd %xmm0, %xmm7
; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
; X32-SSE-NEXT: pslld $23, %xmm4
; X32-SSE-NEXT: paddd %xmm0, %xmm4
; X32-SSE-NEXT: cvttps2dq %xmm7, %xmm0
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: cvttps2dq %xmm4, %xmm4
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; X32-SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; X32-SSE-NEXT: movdqu (%esp), %xmm0 # 16-byte Reload
; X32-SSE-NEXT: pmullw %xmm0, %xmm4
; X32-SSE-NEXT: por %xmm5, %xmm4
; X32-SSE-NEXT: por %xmm3, %xmm4
; X32-SSE-NEXT: pcmpeqw %xmm6, %xmm2
; X32-SSE-NEXT: pand %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm4, %xmm2
; X32-SSE-NEXT: por %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm0
; X32-SSE-NEXT: addl $28, %esp
; X32-SSE-NEXT: retl
  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
  ret <8 x i16> %res
}
867
868define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
869; SSE2-LABEL: var_funnnel_v16i8:
870; SSE2: # %bb.0:
871; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
872; SSE2-NEXT: movdqa %xmm2, %xmm5
873; SSE2-NEXT: psllw $5, %xmm5
874; SSE2-NEXT: pxor %xmm3, %xmm3
875; SSE2-NEXT: pxor %xmm6, %xmm6
876; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
877; SSE2-NEXT: movdqa %xmm1, %xmm4
878; SSE2-NEXT: psrlw $4, %xmm4
879; SSE2-NEXT: pand %xmm6, %xmm4
880; SSE2-NEXT: pandn %xmm1, %xmm6
881; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
882; SSE2-NEXT: por %xmm6, %xmm4
883; SSE2-NEXT: paddb %xmm5, %xmm5
884; SSE2-NEXT: pxor %xmm6, %xmm6
885; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
886; SSE2-NEXT: movdqa %xmm6, %xmm7
887; SSE2-NEXT: pandn %xmm4, %xmm7
888; SSE2-NEXT: psrlw $2, %xmm4
889; SSE2-NEXT: pand %xmm6, %xmm4
890; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
891; SSE2-NEXT: por %xmm7, %xmm4
892; SSE2-NEXT: paddb %xmm5, %xmm5
893; SSE2-NEXT: pxor %xmm6, %xmm6
894; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
895; SSE2-NEXT: movdqa %xmm6, %xmm5
896; SSE2-NEXT: pandn %xmm4, %xmm5
897; SSE2-NEXT: psrlw $1, %xmm4
898; SSE2-NEXT: pand %xmm6, %xmm4
899; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
900; SSE2-NEXT: por %xmm5, %xmm4
901; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
902; SSE2-NEXT: psubb %xmm2, %xmm5
903; SSE2-NEXT: psllw $5, %xmm5
904; SSE2-NEXT: pxor %xmm6, %xmm6
905; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
906; SSE2-NEXT: movdqa %xmm6, %xmm7
907; SSE2-NEXT: pandn %xmm0, %xmm7
908; SSE2-NEXT: psllw $4, %xmm0
909; SSE2-NEXT: pand %xmm6, %xmm0
910; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
911; SSE2-NEXT: por %xmm7, %xmm0
912; SSE2-NEXT: paddb %xmm5, %xmm5
913; SSE2-NEXT: pxor %xmm6, %xmm6
914; SSE2-NEXT: pcmpgtb %xmm5, %xmm6
915; SSE2-NEXT: movdqa %xmm6, %xmm7
916; SSE2-NEXT: pandn %xmm0, %xmm7
917; SSE2-NEXT: psllw $2, %xmm0
918; SSE2-NEXT: pand %xmm6, %xmm0
919; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
920; SSE2-NEXT: por %xmm7, %xmm0
921; SSE2-NEXT: paddb %xmm5, %xmm5
922; SSE2-NEXT: pcmpeqb %xmm3, %xmm2
923; SSE2-NEXT: pcmpgtb %xmm5, %xmm3
924; SSE2-NEXT: movdqa %xmm3, %xmm5
925; SSE2-NEXT: pandn %xmm0, %xmm5
926; SSE2-NEXT: por %xmm4, %xmm5
927; SSE2-NEXT: paddb %xmm0, %xmm0
928; SSE2-NEXT: pand %xmm3, %xmm0
929; SSE2-NEXT: por %xmm5, %xmm0
930; SSE2-NEXT: pand %xmm2, %xmm1
931; SSE2-NEXT: pandn %xmm0, %xmm2
932; SSE2-NEXT: por %xmm1, %xmm2
933; SSE2-NEXT: movdqa %xmm2, %xmm0
934; SSE2-NEXT: retq
935;
936; SSE41-LABEL: var_funnnel_v16i8:
937; SSE41: # %bb.0:
938; SSE41-NEXT: movdqa %xmm0, %xmm3
939; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
940; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
941; SSE41-NEXT: psubb %xmm2, %xmm4
942; SSE41-NEXT: pxor %xmm5, %xmm5
943; SSE41-NEXT: pcmpeqb %xmm2, %xmm5
944; SSE41-NEXT: movdqa %xmm2, %xmm0
945; SSE41-NEXT: psllw $5, %xmm0
946; SSE41-NEXT: movdqa %xmm1, %xmm2
947; SSE41-NEXT: psrlw $4, %xmm2
948; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
949; SSE41-NEXT: movdqa %xmm1, %xmm6
950; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
951; SSE41-NEXT: movdqa %xmm6, %xmm2
952; SSE41-NEXT: psrlw $2, %xmm2
953; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
954; SSE41-NEXT: paddb %xmm0, %xmm0
955; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
956; SSE41-NEXT: movdqa %xmm6, %xmm2
957; SSE41-NEXT: psrlw $1, %xmm2
958; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
959; SSE41-NEXT: paddb %xmm0, %xmm0
960; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm6
961; SSE41-NEXT: psllw $5, %xmm4
962; SSE41-NEXT: movdqa %xmm4, %xmm2
963; SSE41-NEXT: paddb %xmm4, %xmm2
964; SSE41-NEXT: movdqa %xmm3, %xmm7
965; SSE41-NEXT: psllw $4, %xmm7
966; SSE41-NEXT: pand {{.*}}(%rip), %xmm7
967; SSE41-NEXT: movdqa %xmm4, %xmm0
968; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
969; SSE41-NEXT: movdqa %xmm3, %xmm4
970; SSE41-NEXT: psllw $2, %xmm4
971; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
972; SSE41-NEXT: movdqa %xmm2, %xmm0
973; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
974; SSE41-NEXT: movdqa %xmm3, %xmm4
975; SSE41-NEXT: paddb %xmm3, %xmm4
976; SSE41-NEXT: paddb %xmm2, %xmm2
977; SSE41-NEXT: movdqa %xmm2, %xmm0
978; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
979; SSE41-NEXT: por %xmm6, %xmm3
980; SSE41-NEXT: movdqa %xmm5, %xmm0
981; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
982; SSE41-NEXT: movdqa %xmm3, %xmm0
983; SSE41-NEXT: retq
984;
985; AVX-LABEL: var_funnnel_v16i8:
986; AVX: # %bb.0:
987; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
988; AVX-NEXT: vpsllw $5, %xmm2, %xmm3
989; AVX-NEXT: vpsrlw $4, %xmm1, %xmm4
990; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
991; AVX-NEXT: vpblendvb %xmm3, %xmm4, %xmm1, %xmm4
992; AVX-NEXT: vpsrlw $2, %xmm4, %xmm5
993; AVX-NEXT: vpand {{.*}}(%rip), %xmm5, %xmm5
994; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
995; AVX-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
996; AVX-NEXT: vpsrlw $1, %xmm4, %xmm5
997; AVX-NEXT: vpand {{.*}}(%rip), %xmm5, %xmm5
998; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3
999; AVX-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
1000; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1001; AVX-NEXT: vpsubb %xmm2, %xmm4, %xmm4
1002; AVX-NEXT: vpsllw $5, %xmm4, %xmm4
1003; AVX-NEXT: vpaddb %xmm4, %xmm4, %xmm5
1004; AVX-NEXT: vpsllw $4, %xmm0, %xmm6
1005; AVX-NEXT: vpand {{.*}}(%rip), %xmm6, %xmm6
1006; AVX-NEXT: vpblendvb %xmm4, %xmm6, %xmm0, %xmm0
1007; AVX-NEXT: vpsllw $2, %xmm0, %xmm4
1008; AVX-NEXT: vpand {{.*}}(%rip), %xmm4, %xmm4
1009; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
1010; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm4
1011; AVX-NEXT: vpaddb %xmm5, %xmm5, %xmm5
1012; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm0, %xmm0
1013; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
1014; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
1015; AVX-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
1016; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1017; AVX-NEXT: retq
1018;
1019; AVX512F-LABEL: var_funnnel_v16i8:
1020; AVX512F: # %bb.0:
1021; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1022; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1023; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1024; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
1025; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1026; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
1027; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
1028; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1029; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
1030; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
1031; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
1032; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
1033; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
1034; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1035; AVX512F-NEXT: vzeroupper
1036; AVX512F-NEXT: retq
1037;
1038; AVX512VL-LABEL: var_funnnel_v16i8:
1039; AVX512VL: # %bb.0:
1040; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1041; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1042; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1043; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
1044; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1045; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
1046; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
1047; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1048; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
1049; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
1050; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
1051; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
1052; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
1053; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1054; AVX512VL-NEXT: vzeroupper
1055; AVX512VL-NEXT: retq
1056;
1057; AVX512BW-LABEL: var_funnnel_v16i8:
1058; AVX512BW: # %bb.0:
1059; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1060; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1061; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1062; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1063; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5
1064; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
1065; AVX512BW-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
1066; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1067; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1068; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5
1069; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
1070; AVX512BW-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
1071; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
1072; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1073; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1
1074; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
1075; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1076; AVX512BW-NEXT: vzeroupper
1077; AVX512BW-NEXT: retq
1078;
Craig Topper6ffeeb72019-01-06 18:10:18 +00001079; AVX512VBMI2-LABEL: var_funnnel_v16i8:
1080; AVX512VBMI2: # %bb.0:
1081; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1082; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1083; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1084; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1085; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5
1086; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
1087; AVX512VBMI2-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
1088; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1089; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1090; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5
1091; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
1092; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
1093; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
1094; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
1095; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1
1096; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
1097; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1098; AVX512VBMI2-NEXT: vzeroupper
1099; AVX512VBMI2-NEXT: retq
1100;
Simon Pilgrim46b90e82018-12-18 10:08:23 +00001101; AVX512VLBW-LABEL: var_funnnel_v16i8:
1102; AVX512VLBW: # %bb.0:
1103; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1104; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1105; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
1106; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1107; AVX512VLBW-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
1108; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1109; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
1110; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
1111; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1112; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
1113; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
1114; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
1115; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1
1116; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
1117; AVX512VLBW-NEXT: vzeroupper
1118; AVX512VLBW-NEXT: retq
1119;
Craig Topper6ffeeb72019-01-06 18:10:18 +00001120; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
1121; AVX512VLVBMI2: # %bb.0:
1122; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1123; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
1124; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
1125; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1126; AVX512VLVBMI2-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
1127; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1128; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
1129; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
1130; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1131; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
1132; AVX512VLVBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
1133; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
1134; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1
1135; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
1136; AVX512VLVBMI2-NEXT: vzeroupper
1137; AVX512VLVBMI2-NEXT: retq
1138;
1139; XOP-LABEL: var_funnnel_v16i8:
1140; XOP: # %bb.0:
1141; XOP-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1142; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
1143; XOP-NEXT: vpsubb %xmm2, %xmm3, %xmm4
1144; XOP-NEXT: vpshlb %xmm4, %xmm1, %xmm4
1145; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1146; XOP-NEXT: vpsubb %xmm2, %xmm5, %xmm5
1147; XOP-NEXT: vpshlb %xmm5, %xmm0, %xmm0
1148; XOP-NEXT: vpor %xmm4, %xmm0, %xmm0
1149; XOP-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
1150; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1151; XOP-NEXT: retq
1152;
1153; X32-SSE-LABEL: var_funnnel_v16i8:
1154; X32-SSE: # %bb.0:
1155; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
1156; X32-SSE-NEXT: movdqa %xmm2, %xmm5
1157; X32-SSE-NEXT: psllw $5, %xmm5
1158; X32-SSE-NEXT: pxor %xmm3, %xmm3
1159; X32-SSE-NEXT: pxor %xmm6, %xmm6
1160; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
1161; X32-SSE-NEXT: movdqa %xmm1, %xmm4
1162; X32-SSE-NEXT: psrlw $4, %xmm4
1163; X32-SSE-NEXT: pand %xmm6, %xmm4
1164; X32-SSE-NEXT: pandn %xmm1, %xmm6
1165; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
1166; X32-SSE-NEXT: por %xmm6, %xmm4
1167; X32-SSE-NEXT: paddb %xmm5, %xmm5
1168; X32-SSE-NEXT: pxor %xmm6, %xmm6
1169; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
1170; X32-SSE-NEXT: movdqa %xmm6, %xmm7
1171; X32-SSE-NEXT: pandn %xmm4, %xmm7
1172; X32-SSE-NEXT: psrlw $2, %xmm4
1173; X32-SSE-NEXT: pand %xmm6, %xmm4
1174; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
1175; X32-SSE-NEXT: por %xmm7, %xmm4
1176; X32-SSE-NEXT: paddb %xmm5, %xmm5
1177; X32-SSE-NEXT: pxor %xmm6, %xmm6
1178; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
1179; X32-SSE-NEXT: movdqa %xmm6, %xmm5
1180; X32-SSE-NEXT: pandn %xmm4, %xmm5
1181; X32-SSE-NEXT: psrlw $1, %xmm4
1182; X32-SSE-NEXT: pand %xmm6, %xmm4
1183; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
1184; X32-SSE-NEXT: por %xmm5, %xmm4
1185; X32-SSE-NEXT: movdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1186; X32-SSE-NEXT: psubb %xmm2, %xmm5
1187; X32-SSE-NEXT: psllw $5, %xmm5
1188; X32-SSE-NEXT: pxor %xmm6, %xmm6
1189; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
1190; X32-SSE-NEXT: movdqa %xmm6, %xmm7
1191; X32-SSE-NEXT: pandn %xmm0, %xmm7
1192; X32-SSE-NEXT: psllw $4, %xmm0
1193; X32-SSE-NEXT: pand %xmm6, %xmm0
1194; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
1195; X32-SSE-NEXT: por %xmm7, %xmm0
1196; X32-SSE-NEXT: paddb %xmm5, %xmm5
1197; X32-SSE-NEXT: pxor %xmm6, %xmm6
1198; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm6
1199; X32-SSE-NEXT: movdqa %xmm6, %xmm7
1200; X32-SSE-NEXT: pandn %xmm0, %xmm7
1201; X32-SSE-NEXT: psllw $2, %xmm0
1202; X32-SSE-NEXT: pand %xmm6, %xmm0
1203; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
1204; X32-SSE-NEXT: por %xmm7, %xmm0
1205; X32-SSE-NEXT: paddb %xmm5, %xmm5
1206; X32-SSE-NEXT: pcmpeqb %xmm3, %xmm2
1207; X32-SSE-NEXT: pcmpgtb %xmm5, %xmm3
1208; X32-SSE-NEXT: movdqa %xmm3, %xmm5
1209; X32-SSE-NEXT: pandn %xmm0, %xmm5
1210; X32-SSE-NEXT: por %xmm4, %xmm5
1211; X32-SSE-NEXT: paddb %xmm0, %xmm0
1212; X32-SSE-NEXT: pand %xmm3, %xmm0
1213; X32-SSE-NEXT: por %xmm5, %xmm0
1214; X32-SSE-NEXT: pand %xmm2, %xmm1
1215; X32-SSE-NEXT: pandn %xmm0, %xmm2
1216; X32-SSE-NEXT: por %xmm1, %xmm2
1217; X32-SSE-NEXT: movdqa %xmm2, %xmm0
1218; X32-SSE-NEXT: retl
1219 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
1220 ret <16 x i8> %res
1221}
1222
1223;
1224; Uniform Variable Shifts
1225;
1226
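; The "splatvar" tests below broadcast lane 0 of %amt before calling the
; intrinsic, so every lane shifts by the same (still runtime-variable)
; amount. As a reference for the semantics being checked, here is a sketch
; of the generic scalar expansion of fshr (illustrative only; the function
; name is made up for this sketch and no RUN line exercises it):

define i32 @fshr_i32_reference(i32 %x, i32 %y, i32 %amt) nounwind {
  %c = and i32 %amt, 31                   ; amount is taken modulo the bit width
  %inv = sub i32 32, %c                   ; 32 - 0 == 32 would be an out-of-range shift...
  %lo = lshr i32 %y, %c
  %hi = shl i32 %x, %inv
  %r = or i32 %hi, %lo
  %zero = icmp eq i32 %c, 0
  %res = select i1 %zero, i32 %y, i32 %r  ; ...so a zero amount re-selects %y
  ret i32 %res
}
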
1227define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
1228; SSE2-LABEL: splatvar_funnnel_v2i64:
1229; SSE2: # %bb.0:
1230; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1231; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
1232; SSE2-NEXT: movdqa %xmm1, %xmm3
1233; SSE2-NEXT: psrlq %xmm2, %xmm3
1234; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [64,64]
1235; SSE2-NEXT: psubq %xmm2, %xmm4
1236; SSE2-NEXT: psllq %xmm4, %xmm0
1237; SSE2-NEXT: por %xmm3, %xmm0
1238; SSE2-NEXT: pxor %xmm3, %xmm3
1239; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
1240; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
1241; SSE2-NEXT: pand %xmm3, %xmm2
1242; SSE2-NEXT: pand %xmm2, %xmm1
1243; SSE2-NEXT: pandn %xmm0, %xmm2
1244; SSE2-NEXT: por %xmm1, %xmm2
1245; SSE2-NEXT: movdqa %xmm2, %xmm0
1246; SSE2-NEXT: retq
1247;
1248; SSE41-LABEL: splatvar_funnnel_v2i64:
1249; SSE41: # %bb.0:
1250; SSE41-NEXT: movdqa %xmm0, %xmm3
1251; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1252; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
1253; SSE41-NEXT: movdqa %xmm1, %xmm0
1254; SSE41-NEXT: psrlq %xmm2, %xmm0
1255; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [64,64]
1256; SSE41-NEXT: psubq %xmm2, %xmm4
1257; SSE41-NEXT: psllq %xmm4, %xmm3
1258; SSE41-NEXT: por %xmm0, %xmm3
1259; SSE41-NEXT: pxor %xmm0, %xmm0
1260; SSE41-NEXT: pcmpeqq %xmm2, %xmm0
1261; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
1262; SSE41-NEXT: movapd %xmm3, %xmm0
1263; SSE41-NEXT: retq
1264;
1265; AVX1-LABEL: splatvar_funnnel_v2i64:
1266; AVX1: # %bb.0:
1267; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1268; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1269; AVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1270; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1271; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1272; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1273; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1274; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1275; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
1276; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1277; AVX1-NEXT: retq
1278;
1279; AVX2-LABEL: splatvar_funnnel_v2i64:
1280; AVX2: # %bb.0:
1281; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2
1282; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1283; AVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1284; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1285; AVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1286; AVX2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1287; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1288; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1289; AVX2-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
1290; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1291; AVX2-NEXT: retq
1292;
1293; AVX512F-LABEL: splatvar_funnnel_v2i64:
1294; AVX512F: # %bb.0:
1295; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1296; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
1297; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
1298; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
1299; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
1300; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
1301; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
1302; AVX512F-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1303; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
1304; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
1305; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
1306; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1307; AVX512F-NEXT: vzeroupper
1308; AVX512F-NEXT: retq
1309;
1310; AVX512VL-LABEL: splatvar_funnnel_v2i64:
1311; AVX512VL: # %bb.0:
1312; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2
1313; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
1314; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
1315; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
1316; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
1317; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
1318; AVX512VL-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1319; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
1320; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1
1321; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
1322; AVX512VL-NEXT: retq
1323;
1324; AVX512BW-LABEL: splatvar_funnnel_v2i64:
1325; AVX512BW: # %bb.0:
1326; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1327; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2
1328; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
1329; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1330; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
1331; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
1332; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
1333; AVX512BW-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1334; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
1335; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
1336; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
1337; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1338; AVX512BW-NEXT: vzeroupper
1339; AVX512BW-NEXT: retq
1340;
1341; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
1342; AVX512VBMI2: # %bb.0:
1343; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1344; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
1345; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
1346; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
1347; AVX512VBMI2-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
1348; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
1349; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4
1350; AVX512VBMI2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1351; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
1352; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
1353; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
1354; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1355; AVX512VBMI2-NEXT: vzeroupper
1356; AVX512VBMI2-NEXT: retq
1357;
1358; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
1359; AVX512VLBW: # %bb.0:
1360; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2
1361; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
1362; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1363; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
1364; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
1365; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
1366; AVX512VLBW-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1367; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
1368; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1
1369; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
1370; AVX512VLBW-NEXT: retq
1371;
1372; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64:
1373; AVX512VLVBMI2: # %bb.0:
1374; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
1375; AVX512VLVBMI2-NEXT: vpshrdvq %xmm2, %xmm0, %xmm1
1376; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1377; AVX512VLVBMI2-NEXT: retq
1378;
1379; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
1380; XOPAVX1: # %bb.0:
1381; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1382; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1383; XOPAVX1-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1384; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1385; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1386; XOPAVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1387; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1388; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1389; XOPAVX1-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
1390; XOPAVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1391; XOPAVX1-NEXT: retq
1392;
1393; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
1394; XOPAVX2: # %bb.0:
1395; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm2
1396; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1397; XOPAVX2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
1398; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
1399; XOPAVX2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
1400; XOPAVX2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
1401; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1402; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1403; XOPAVX2-NEXT: vpcomeqq %xmm3, %xmm2, %xmm2
1404; XOPAVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1405; XOPAVX2-NEXT: retq
1406;
1407; X32-SSE-LABEL: splatvar_funnnel_v2i64:
1408; X32-SSE: # %bb.0:
1409; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1410; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
1411; X32-SSE-NEXT: movdqa %xmm1, %xmm3
1412; X32-SSE-NEXT: psrlq %xmm2, %xmm3
1413; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
1414; X32-SSE-NEXT: movdqa %xmm1, %xmm5
1415; X32-SSE-NEXT: psrlq %xmm4, %xmm5
1416; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
1417; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0]
1418; X32-SSE-NEXT: psubq %xmm2, %xmm3
1419; X32-SSE-NEXT: movdqa %xmm0, %xmm4
1420; X32-SSE-NEXT: psllq %xmm3, %xmm4
1421; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
1422; X32-SSE-NEXT: psllq %xmm3, %xmm0
1423; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1424; X32-SSE-NEXT: orpd %xmm5, %xmm0
1425; X32-SSE-NEXT: pxor %xmm3, %xmm3
1426; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm3
1427; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
1428; X32-SSE-NEXT: pand %xmm3, %xmm2
1429; X32-SSE-NEXT: pand %xmm2, %xmm1
1430; X32-SSE-NEXT: pandn %xmm0, %xmm2
1431; X32-SSE-NEXT: por %xmm1, %xmm2
1432; X32-SSE-NEXT: movdqa %xmm2, %xmm0
1433; X32-SSE-NEXT: retl
1434 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
1435 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
1436 ret <2 x i64> %res
1437}
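; Per lane, the v2i64 expansion above computes
;   res = (c == 0) ? y : (y >> c) | (x << (64 - c)),  with c = amt & 63,
; and the trailing pcmpeq/blend (or AVX512 masked move) exists purely to
; guard the out-of-range "x << 64" shift when c is zero. Note how
; AVX512VBMI2+VL folds the whole sequence into a single vpshrdvq.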
1438
1439define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
1440; SSE2-LABEL: splatvar_funnnel_v4i32:
1441; SSE2: # %bb.0:
1442; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1443; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
1444; SSE2-NEXT: pxor %xmm3, %xmm3
1445; SSE2-NEXT: xorps %xmm4, %xmm4
1446; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
1447; SSE2-NEXT: movdqa %xmm1, %xmm5
1448; SSE2-NEXT: psrld %xmm4, %xmm5
1449; SSE2-NEXT: movd %xmm2, %eax
1450; SSE2-NEXT: movl $32, %ecx
1451; SSE2-NEXT: subl %eax, %ecx
1452; SSE2-NEXT: movd %ecx, %xmm4
1453; SSE2-NEXT: pslld %xmm4, %xmm0
1454; SSE2-NEXT: por %xmm5, %xmm0
1455; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
1456; SSE2-NEXT: pand %xmm2, %xmm1
1457; SSE2-NEXT: pandn %xmm0, %xmm2
1458; SSE2-NEXT: por %xmm1, %xmm2
1459; SSE2-NEXT: movdqa %xmm2, %xmm0
1460; SSE2-NEXT: retq
1461;
1462; SSE41-LABEL: splatvar_funnnel_v4i32:
1463; SSE41: # %bb.0:
1464; SSE41-NEXT: movdqa %xmm0, %xmm3
1465; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1466; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
1467; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
1468; SSE41-NEXT: movdqa %xmm1, %xmm4
1469; SSE41-NEXT: psrld %xmm0, %xmm4
1470; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
1471; SSE41-NEXT: psubd %xmm2, %xmm0
1472; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1473; SSE41-NEXT: pslld %xmm0, %xmm3
1474; SSE41-NEXT: por %xmm4, %xmm3
1475; SSE41-NEXT: pxor %xmm0, %xmm0
1476; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
1477; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
1478; SSE41-NEXT: movaps %xmm3, %xmm0
1479; SSE41-NEXT: retq
1480;
1481; AVX1-LABEL: splatvar_funnnel_v4i32:
1482; AVX1: # %bb.0:
1483; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1484; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1485; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1486; AVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1487; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
1488; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1489; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1490; AVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0
1491; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1492; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1493; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
1494; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1495; AVX1-NEXT: retq
1496;
1497; AVX2-LABEL: splatvar_funnnel_v4i32:
1498; AVX2: # %bb.0:
1499; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2
1500; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1501; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1502; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1503; AVX2-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1504; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
1505; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1506; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1507; AVX2-NEXT: vpslld %xmm4, %xmm0, %xmm0
1508; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1509; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1510; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
1511; AVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1512; AVX2-NEXT: retq
1513;
1514; AVX512F-LABEL: splatvar_funnnel_v4i32:
1515; AVX512F: # %bb.0:
1516; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1517; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2
1518; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1519; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
1520; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
1521; AVX512F-NEXT: vpsrld %xmm5, %xmm1, %xmm5
1522; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
1523; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
1524; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1525; AVX512F-NEXT: vpslld %xmm4, %xmm0, %xmm0
1526; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
1527; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
1528; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1529; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1530; AVX512F-NEXT: vzeroupper
1531; AVX512F-NEXT: retq
1532;
1533; AVX512VL-LABEL: splatvar_funnnel_v4i32:
1534; AVX512VL: # %bb.0:
1535; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2
1536; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1537; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
1538; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
1539; AVX512VL-NEXT: vpsrld %xmm5, %xmm1, %xmm5
1540; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
1541; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
1542; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1543; AVX512VL-NEXT: vpslld %xmm4, %xmm0, %xmm0
1544; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
1545; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1
1546; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
1547; AVX512VL-NEXT: retq
1548;
1549; AVX512BW-LABEL: splatvar_funnnel_v4i32:
1550; AVX512BW: # %bb.0:
1551; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1552; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2
1553; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1554; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1555; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
1556; AVX512BW-NEXT: vpsrld %xmm5, %xmm1, %xmm5
1557; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
1558; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
1559; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1560; AVX512BW-NEXT: vpslld %xmm4, %xmm0, %xmm0
1561; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
1562; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
1563; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1564; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1565; AVX512BW-NEXT: vzeroupper
1566; AVX512BW-NEXT: retq
1567;
1568; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
1569; AVX512VBMI2: # %bb.0:
1570; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1571; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
1572; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1573; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
1574; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
1575; AVX512VBMI2-NEXT: vpsrld %xmm5, %xmm1, %xmm5
1576; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
1577; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4
1578; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1579; AVX512VBMI2-NEXT: vpslld %xmm4, %xmm0, %xmm0
1580; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
1581; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
1582; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1583; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1584; AVX512VBMI2-NEXT: vzeroupper
1585; AVX512VBMI2-NEXT: retq
1586;
1587; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
1588; AVX512VLBW: # %bb.0:
1589; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2
1590; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1591; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1592; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
1593; AVX512VLBW-NEXT: vpsrld %xmm5, %xmm1, %xmm5
1594; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
1595; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
1596; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1597; AVX512VLBW-NEXT: vpslld %xmm4, %xmm0, %xmm0
1598; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
1599; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1
1600; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
1601; AVX512VLBW-NEXT: retq
1602;
1603; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32:
1604; AVX512VLVBMI2: # %bb.0:
1605; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
1606; AVX512VLVBMI2-NEXT: vpshrdvd %xmm2, %xmm0, %xmm1
1607; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1608; AVX512VLVBMI2-NEXT: retq
1609;
1610; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
1611; XOPAVX1: # %bb.0:
1612; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1613; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1614; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1615; XOPAVX1-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1616; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32,32,32,32]
1617; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1618; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1619; XOPAVX1-NEXT: vpslld %xmm4, %xmm0, %xmm0
1620; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1621; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1622; XOPAVX1-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
1623; XOPAVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1624; XOPAVX1-NEXT: retq
1625;
1626; XOPAVX2-LABEL: splatvar_funnnel_v4i32:
1627; XOPAVX2: # %bb.0:
1628; XOPAVX2-NEXT: vpbroadcastd %xmm2, %xmm2
1629; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
1630; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
1631; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
1632; XOPAVX2-NEXT: vpsrld %xmm3, %xmm1, %xmm3
1633; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
1634; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
1635; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
1636; XOPAVX2-NEXT: vpslld %xmm4, %xmm0, %xmm0
1637; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1638; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1639; XOPAVX2-NEXT: vpcomeqd %xmm3, %xmm2, %xmm2
1640; XOPAVX2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1641; XOPAVX2-NEXT: retq
1642;
1643; X32-SSE-LABEL: splatvar_funnnel_v4i32:
1644; X32-SSE: # %bb.0:
1645; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1646; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
1647; X32-SSE-NEXT: pxor %xmm3, %xmm3
1648; X32-SSE-NEXT: xorps %xmm4, %xmm4
1649; X32-SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
1650; X32-SSE-NEXT: movdqa %xmm1, %xmm5
1651; X32-SSE-NEXT: psrld %xmm4, %xmm5
1652; X32-SSE-NEXT: movd %xmm2, %eax
1653; X32-SSE-NEXT: movl $32, %ecx
1654; X32-SSE-NEXT: subl %eax, %ecx
1655; X32-SSE-NEXT: movd %ecx, %xmm4
1656; X32-SSE-NEXT: pslld %xmm4, %xmm0
1657; X32-SSE-NEXT: por %xmm5, %xmm0
1658; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm2
1659; X32-SSE-NEXT: pand %xmm2, %xmm1
1660; X32-SSE-NEXT: pandn %xmm0, %xmm2
1661; X32-SSE-NEXT: por %xmm1, %xmm2
1662; X32-SSE-NEXT: movdqa %xmm2, %xmm0
1663; X32-SSE-NEXT: retl
1664 %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
1665 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat)
1666 ret <4 x i32> %res
1667}
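; The SSE2 v4i32 code routes the amount through a GPR (movd/subl/movd) to
; form 32-amt: psrld/pslld take their count from the low 64 bits of the xmm
; count operand, so once the amount has been splatted only a zero-extended
; copy of lane 0 is needed (hence also the movss into a zeroed register for
; the right-shift count).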
1668
1669define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
1670; SSE2-LABEL: splatvar_funnnel_v8i16:
1671; SSE2: # %bb.0:
1672; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1673; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
1674; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1675; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1676; SSE2-NEXT: psubw %xmm3, %xmm4
1677; SSE2-NEXT: pxor %xmm2, %xmm2
1678; SSE2-NEXT: pcmpeqw %xmm3, %xmm2
1679; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
1680; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1681; SSE2-NEXT: movdqa %xmm1, %xmm5
1682; SSE2-NEXT: psrlw %xmm3, %xmm5
1683; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
1684; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1685; SSE2-NEXT: psllw %xmm4, %xmm0
1686; SSE2-NEXT: por %xmm5, %xmm0
1687; SSE2-NEXT: pand %xmm2, %xmm1
1688; SSE2-NEXT: pandn %xmm0, %xmm2
1689; SSE2-NEXT: por %xmm1, %xmm2
1690; SSE2-NEXT: movdqa %xmm2, %xmm0
1691; SSE2-NEXT: retq
1692;
1693; SSE41-LABEL: splatvar_funnnel_v8i16:
1694; SSE41: # %bb.0:
1695; SSE41-NEXT: movdqa %xmm0, %xmm3
1696; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7]
1697; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
1698; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
1699; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1700; SSE41-NEXT: movdqa %xmm1, %xmm4
1701; SSE41-NEXT: psrlw %xmm0, %xmm4
1702; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16]
1703; SSE41-NEXT: psubw %xmm2, %xmm0
1704; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1705; SSE41-NEXT: psllw %xmm0, %xmm3
1706; SSE41-NEXT: por %xmm4, %xmm3
1707; SSE41-NEXT: pxor %xmm0, %xmm0
1708; SSE41-NEXT: pcmpeqw %xmm2, %xmm0
1709; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
1710; SSE41-NEXT: movdqa %xmm3, %xmm0
1711; SSE41-NEXT: retq
1712;
1713; AVX1-LABEL: splatvar_funnnel_v8i16:
1714; AVX1: # %bb.0:
1715; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1716; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1717; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1718; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1719; AVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1720; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1721; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1722; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1723; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1724; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1725; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1726; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
1727; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1728; AVX1-NEXT: retq
1729;
1730; AVX2-LABEL: splatvar_funnnel_v8i16:
1731; AVX2: # %bb.0:
1732; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2
1733; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1734; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1735; AVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1736; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1737; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1738; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1739; AVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1740; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1741; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1742; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
1743; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1744; AVX2-NEXT: retq
1745;
1746; AVX512F-LABEL: splatvar_funnnel_v8i16:
1747; AVX512F: # %bb.0:
1748; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
1749; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1750; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1751; AVX512F-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1752; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1753; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1754; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1755; AVX512F-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1756; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
1757; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
1758; AVX512F-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
1759; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1760; AVX512F-NEXT: retq
1761;
1762; AVX512VL-LABEL: splatvar_funnnel_v8i16:
1763; AVX512VL: # %bb.0:
1764; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2
1765; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1766; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1767; AVX512VL-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1768; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1769; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1770; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1771; AVX512VL-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1772; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
1773; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
1774; AVX512VL-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
1775; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1776; AVX512VL-NEXT: retq
1777;
1778; AVX512BW-LABEL: splatvar_funnnel_v8i16:
1779; AVX512BW: # %bb.0:
1780; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1781; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2
1782; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1783; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
1784; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1785; AVX512BW-NEXT: vpsrlw %xmm5, %xmm1, %xmm5
1786; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
1787; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
1788; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1789; AVX512BW-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1790; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
1791; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
1792; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
1793; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1794; AVX512BW-NEXT: vzeroupper
1795; AVX512BW-NEXT: retq
1796;
1797; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16:
1798; AVX512VBMI2: # %bb.0:
1799; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1800; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
1801; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1802; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
1803; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1804; AVX512VBMI2-NEXT: vpsrlw %xmm5, %xmm1, %xmm5
1805; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
1806; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
1807; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1808; AVX512VBMI2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1809; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
1810; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
1811; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
1812; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1813; AVX512VBMI2-NEXT: vzeroupper
1814; AVX512VBMI2-NEXT: retq
1815;
1816; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
1817; AVX512VLBW: # %bb.0:
1818; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2
1819; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
1820; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
1821; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1822; AVX512VLBW-NEXT: vpsrlw %xmm5, %xmm1, %xmm5
1823; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
1824; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
1825; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1826; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1827; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
1828; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1
1829; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
1830; AVX512VLBW-NEXT: retq
1831;
1832; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16:
1833; AVX512VLVBMI2: # %bb.0:
1834; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
1835; AVX512VLVBMI2-NEXT: vpshrdvw %xmm2, %xmm0, %xmm1
1836; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
1837; AVX512VLVBMI2-NEXT: retq
1838;
1839; XOPAVX1-LABEL: splatvar_funnnel_v8i16:
1840; XOPAVX1: # %bb.0:
1841; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1842; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1843; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1844; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1845; XOPAVX1-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1846; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1847; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1848; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1849; XOPAVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1850; XOPAVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
1851; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1852; XOPAVX1-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
1853; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1854; XOPAVX1-NEXT: retq
1855;
1856; XOPAVX2-LABEL: splatvar_funnnel_v8i16:
1857; XOPAVX2: # %bb.0:
1858; XOPAVX2-NEXT: vpbroadcastw %xmm2, %xmm2
1859; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1860; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1861; XOPAVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
1862; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1863; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
1864; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
1865; XOPAVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
1866; XOPAVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
1867; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
1868; XOPAVX2-NEXT: vpcomeqw %xmm3, %xmm2, %xmm2
1869; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1870; XOPAVX2-NEXT: retq
1871;
1872; X32-SSE-LABEL: splatvar_funnnel_v8i16:
1873; X32-SSE: # %bb.0:
1874; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1875; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
1876; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3
1877; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
1878; X32-SSE-NEXT: psubw %xmm3, %xmm4
1879; X32-SSE-NEXT: pxor %xmm2, %xmm2
1880; X32-SSE-NEXT: pcmpeqw %xmm3, %xmm2
1881; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
1882; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1883; X32-SSE-NEXT: movdqa %xmm1, %xmm5
1884; X32-SSE-NEXT: psrlw %xmm3, %xmm5
1885; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1]
1886; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1887; X32-SSE-NEXT: psllw %xmm4, %xmm0
1888; X32-SSE-NEXT: por %xmm5, %xmm0
1889; X32-SSE-NEXT: pand %xmm2, %xmm1
1890; X32-SSE-NEXT: pandn %xmm0, %xmm2
1891; X32-SSE-NEXT: por %xmm1, %xmm2
1892; X32-SSE-NEXT: movdqa %xmm2, %xmm0
1893; X32-SSE-NEXT: retl
1894 %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
1895 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat)
1896 ret <8 x i16> %res
1897}
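; Lacking pmovzx before SSE4.1, the v8i16 variant zero-extends the 16-bit
; amount into a shift count with a pslldq/psrldq pair (shift the low word to
; the top of the register and back down, clearing everything else); psrlw
; and psllw read the whole low 64 bits of the count register, so leaving
; garbage in bits [63:16] would corrupt the shift.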
1898
1899define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
1900; SSE2-LABEL: splatvar_funnnel_v16i8:
1901; SSE2: # %bb.0:
1902; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1903; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
1904; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
1905; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
1906; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1907; SSE2-NEXT: psubb %xmm3, %xmm4
1908; SSE2-NEXT: pxor %xmm2, %xmm2
1909; SSE2-NEXT: pcmpeqb %xmm3, %xmm2
1910; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
1911; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1912; SSE2-NEXT: movdqa %xmm1, %xmm5
1913; SSE2-NEXT: psrlw %xmm3, %xmm5
1914; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
1915; SSE2-NEXT: psrlw %xmm3, %xmm6
1916; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
1917; SSE2-NEXT: psrlw $8, %xmm6
1918; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1919; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
1920; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
1921; SSE2-NEXT: pand %xmm5, %xmm6
1922; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
1923; SSE2-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1924; SSE2-NEXT: psllw %xmm4, %xmm0
1925; SSE2-NEXT: psllw %xmm4, %xmm3
1926; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1927; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7]
1928; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
1929; SSE2-NEXT: pand %xmm0, %xmm3
1930; SSE2-NEXT: por %xmm6, %xmm3
1931; SSE2-NEXT: pand %xmm2, %xmm1
1932; SSE2-NEXT: pandn %xmm3, %xmm2
1933; SSE2-NEXT: por %xmm1, %xmm2
1934; SSE2-NEXT: movdqa %xmm2, %xmm0
1935; SSE2-NEXT: retq
1936;
1937; SSE41-LABEL: splatvar_funnnel_v16i8:
1938; SSE41: # %bb.0:
1939; SSE41-NEXT: movdqa %xmm0, %xmm3
1940; SSE41-NEXT: pxor %xmm0, %xmm0
1941; SSE41-NEXT: pshufb %xmm0, %xmm2
1942; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
1943; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1944; SSE41-NEXT: movdqa %xmm1, %xmm5
1945; SSE41-NEXT: psrlw %xmm4, %xmm5
1946; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
1947; SSE41-NEXT: pcmpeqd %xmm7, %xmm7
1948; SSE41-NEXT: psrlw %xmm4, %xmm7
1949; SSE41-NEXT: pshufb {{.*#+}} xmm7 = xmm7[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1950; SSE41-NEXT: pand %xmm5, %xmm7
1951; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1952; SSE41-NEXT: psubb %xmm2, %xmm4
1953; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
1954; SSE41-NEXT: psllw %xmm4, %xmm3
1955; SSE41-NEXT: psllw %xmm4, %xmm6
1956; SSE41-NEXT: pshufb %xmm0, %xmm6
1957; SSE41-NEXT: pand %xmm6, %xmm3
1958; SSE41-NEXT: por %xmm7, %xmm3
1959; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
1960; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
1961; SSE41-NEXT: movdqa %xmm3, %xmm0
1962; SSE41-NEXT: retq
1963;
1964; AVX1-LABEL: splatvar_funnnel_v16i8:
1965; AVX1: # %bb.0:
1966; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1967; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1968; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1969; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1970; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm5
1971; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
1972; AVX1-NEXT: vpsrlw %xmm4, %xmm6, %xmm4
1973; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1974; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4
1975; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1976; AVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5
1977; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero
1978; AVX1-NEXT: vpsllw %xmm5, %xmm0, %xmm0
1979; AVX1-NEXT: vpsllw %xmm5, %xmm6, %xmm5
1980; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm5
1981; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
1982; AVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
1983; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
1984; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1985; AVX1-NEXT: retq
1986;
1987; AVX2-LABEL: splatvar_funnnel_v16i8:
1988; AVX2: # %bb.0:
1989; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
1990; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
1991; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
1992; AVX2-NEXT: vpsrlw %xmm3, %xmm1, %xmm4
1993; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
1994; AVX2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
1995; AVX2-NEXT: vpsrlw $8, %xmm3, %xmm3
1996; AVX2-NEXT: vpbroadcastb %xmm3, %xmm3
1997; AVX2-NEXT: vpand %xmm3, %xmm4, %xmm3
1998; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1999; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
2000; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
2001; AVX2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
2002; AVX2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
2003; AVX2-NEXT: vpbroadcastb %xmm4, %xmm4
2004; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
2005; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
2006; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
2007; AVX2-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
2008; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2009; AVX2-NEXT: retq
2010;
2011; AVX512F-LABEL: splatvar_funnnel_v16i8:
2012; AVX512F: # %bb.0:
2013; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
2014; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2015; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
2016; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2017; AVX512F-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
2018; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2019; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
2020; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
2021; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2022; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
2023; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
2024; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2025; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
2026; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
2027; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2028; AVX512F-NEXT: vzeroupper
2029; AVX512F-NEXT: retq
2030;
2031; AVX512VL-LABEL: splatvar_funnnel_v16i8:
2032; AVX512VL: # %bb.0:
2033; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
2034; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2035; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
2036; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2037; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm3, %zmm3
2038; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2039; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
2040; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
2041; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2042; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
2043; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
2044; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
2045; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
2046; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
2047; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2048; AVX512VL-NEXT: vzeroupper
2049; AVX512VL-NEXT: retq
2050;
2051; AVX512BW-LABEL: splatvar_funnnel_v16i8:
2052; AVX512BW: # %bb.0:
2053; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2054; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
2055; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2056; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
2057; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5
2058; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
2059; AVX512BW-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
2060; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2061; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2062; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5
2063; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
2064; AVX512BW-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
2065; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
2066; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2067; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1
2068; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
2069; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2070; AVX512BW-NEXT: vzeroupper
2071; AVX512BW-NEXT: retq
2072;
2073; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
2074; AVX512VBMI2: # %bb.0:
2075; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2076; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
2077; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2078; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
2079; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5
2080; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
2081; AVX512VBMI2-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
2082; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2083; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2084; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5
2085; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
2086; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
2087; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
2088; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
2089; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1
2090; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
2091; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2092; AVX512VBMI2-NEXT: vzeroupper
2093; AVX512VBMI2-NEXT: retq
2094;
2095; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
2096; AVX512VLBW: # %bb.0:
2097; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
2098; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
2099; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
2100; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
2101; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2102; AVX512VLBW-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
2103; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2104; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
2105; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
2106; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2107; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
2108; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
2109; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
2110; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1
2111; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
2112; AVX512VLBW-NEXT: vzeroupper
2113; AVX512VLBW-NEXT: retq
2114;
2115; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
2116; AVX512VLVBMI2: # %bb.0:
2117; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
2118; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
2119; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
2120; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
2121; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2122; AVX512VLVBMI2-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
2123; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2124; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
2125; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
2126; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2127; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
2128; AVX512VLVBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
2129; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
2130; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1
2131; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
2132; AVX512VLVBMI2-NEXT: vzeroupper
2133; AVX512VLVBMI2-NEXT: retq
2134;
2135; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
2136; XOPAVX1: # %bb.0:
2137; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
2138; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2139; XOPAVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
2140; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm4
2141; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm4
2142; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2143; XOPAVX1-NEXT: vpsubb %xmm2, %xmm5, %xmm5
2144; XOPAVX1-NEXT: vpshlb %xmm5, %xmm0, %xmm0
2145; XOPAVX1-NEXT: vpor %xmm4, %xmm0, %xmm0
2146; XOPAVX1-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
2147; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2148; XOPAVX1-NEXT: retq
2149;
2150; XOPAVX2-LABEL: splatvar_funnnel_v16i8:
2151; XOPAVX2: # %bb.0:
2152; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2
2153; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
2154; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
2155; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm4
2156; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm4
2157; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2158; XOPAVX2-NEXT: vpsubb %xmm2, %xmm5, %xmm5
2159; XOPAVX2-NEXT: vpshlb %xmm5, %xmm0, %xmm0
2160; XOPAVX2-NEXT: vpor %xmm4, %xmm0, %xmm0
2161; XOPAVX2-NEXT: vpcomeqb %xmm3, %xmm2, %xmm2
2162; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2163; XOPAVX2-NEXT: retq
2164;
2165; X32-SSE-LABEL: splatvar_funnnel_v16i8:
2166; X32-SSE: # %bb.0:
2167; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2168; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
2169; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
2170; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3
2171; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
2172; X32-SSE-NEXT: psubb %xmm3, %xmm4
2173; X32-SSE-NEXT: pxor %xmm2, %xmm2
2174; X32-SSE-NEXT: pcmpeqb %xmm3, %xmm2
2175; X32-SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
2176; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2177; X32-SSE-NEXT: movdqa %xmm1, %xmm5
2178; X32-SSE-NEXT: psrlw %xmm3, %xmm5
2179; X32-SSE-NEXT: pcmpeqd %xmm6, %xmm6
2180; X32-SSE-NEXT: psrlw %xmm3, %xmm6
2181; X32-SSE-NEXT: pcmpeqd %xmm3, %xmm3
2182; X32-SSE-NEXT: psrlw $8, %xmm6
2183; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2184; X32-SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7]
2185; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
2186; X32-SSE-NEXT: pand %xmm5, %xmm6
2187; X32-SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
2188; X32-SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2189; X32-SSE-NEXT: psllw %xmm4, %xmm0
2190; X32-SSE-NEXT: psllw %xmm4, %xmm3
2191; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2192; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7]
2193; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
2194; X32-SSE-NEXT: pand %xmm0, %xmm3
2195; X32-SSE-NEXT: por %xmm6, %xmm3
2196; X32-SSE-NEXT: pand %xmm2, %xmm1
2197; X32-SSE-NEXT: pandn %xmm3, %xmm2
2198; X32-SSE-NEXT: por %xmm1, %xmm2
2199; X32-SSE-NEXT: movdqa %xmm2, %xmm0
2200; X32-SSE-NEXT: retl
2201 %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
2202 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
2203 ret <16 x i8> %res
2204}
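;
; NOTE: For the splatvar tests, the amount is splatted from lane 0, but each
; lane still follows the fshr contract that an amount of 0 (mod 8) returns %y
; unchanged, which is why every lowering above ends by selecting %y back in
; (pcmpeqb+pandn/por, vpblendvb, or vptestnmb+vmovdqu8). A minimal per-lane
; sketch of that contract as plain IR (an illustration only, not part of the
; checked output; the shl is poison for n == 0, hence the guarding select):
;   define i8 @fshr_i8_sketch(i8 %x, i8 %y, i8 %amt) {
;     %n = and i8 %amt, 7                     ; amount is taken modulo 8
;     %iszero = icmp eq i8 %n, 0
;     %inv = sub i8 8, %n
;     %hi = shl i8 %x, %inv                   ; high part comes from %x
;     %lo = lshr i8 %y, %n                    ; low part comes from %y
;     %fsh = or i8 %lo, %hi
;     %r = select i1 %iszero, i8 %y, i8 %fsh  ; amount 0 must return %y
;     ret i8 %r
;   }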
2205
2206;
2207; Constant Shifts
2208;
2209
2210define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
2211; SSE2-LABEL: constant_funnnel_v2i64:
2212; SSE2: # %bb.0:
2213; SSE2-NEXT: movdqa %xmm1, %xmm2
2214; SSE2-NEXT: psrlq $4, %xmm2
2215; SSE2-NEXT: psrlq $14, %xmm1
2216; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2217; SSE2-NEXT: movdqa %xmm0, %xmm2
2218; SSE2-NEXT: psllq $60, %xmm2
2219; SSE2-NEXT: psllq $50, %xmm0
2220; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
2221; SSE2-NEXT: orpd %xmm1, %xmm0
2222; SSE2-NEXT: retq
2223;
2224; SSE41-LABEL: constant_funnnel_v2i64:
2225; SSE41: # %bb.0:
2226; SSE41-NEXT: movdqa %xmm1, %xmm2
2227; SSE41-NEXT: psrlq $14, %xmm2
2228; SSE41-NEXT: psrlq $4, %xmm1
2229; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
2230; SSE41-NEXT: movdqa %xmm0, %xmm2
2231; SSE41-NEXT: psllq $50, %xmm2
2232; SSE41-NEXT: psllq $60, %xmm0
2233; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
2234; SSE41-NEXT: por %xmm1, %xmm0
2235; SSE41-NEXT: retq
2236;
2237; AVX1-LABEL: constant_funnnel_v2i64:
2238; AVX1: # %bb.0:
2239; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm2
2240; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm1
2241; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
2242; AVX1-NEXT: vpsllq $50, %xmm0, %xmm2
2243; AVX1-NEXT: vpsllq $60, %xmm0, %xmm0
2244; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
2245; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2246; AVX1-NEXT: retq
2247;
2248; AVX2-LABEL: constant_funnnel_v2i64:
2249; AVX2: # %bb.0:
2250; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2251; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2252; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2253; AVX2-NEXT: retq
2254;
2255; AVX512F-LABEL: constant_funnnel_v2i64:
2256; AVX512F: # %bb.0:
2257; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2258; AVX512F-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2259; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2260; AVX512F-NEXT: retq
2261;
2262; AVX512VL-LABEL: constant_funnnel_v2i64:
2263; AVX512VL: # %bb.0:
2264; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2265; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2266; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2267; AVX512VL-NEXT: retq
2268;
2269; AVX512BW-LABEL: constant_funnnel_v2i64:
2270; AVX512BW: # %bb.0:
2271; AVX512BW-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2272; AVX512BW-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2273; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2274; AVX512BW-NEXT: retq
2275;
2276; AVX512VBMI2-LABEL: constant_funnnel_v2i64:
2277; AVX512VBMI2: # %bb.0:
2278; AVX512VBMI2-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2279; AVX512VBMI2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2280; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
2281; AVX512VBMI2-NEXT: retq
2282;
2283; AVX512VLBW-LABEL: constant_funnnel_v2i64:
2284; AVX512VLBW: # %bb.0:
2285; AVX512VLBW-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2286; AVX512VLBW-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2287; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2288; AVX512VLBW-NEXT: retq
2289;
2290; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64:
2291; AVX512VLVBMI2: # %bb.0:
2292; AVX512VLVBMI2-NEXT: vpshrdvq {{.*}}(%rip), %xmm0, %xmm1
2293; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
2294; AVX512VLVBMI2-NEXT: retq
2295;
2296; XOPAVX1-LABEL: constant_funnnel_v2i64:
2297; XOPAVX1: # %bb.0:
2298; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm1, %xmm1
2299; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
2300; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2301; XOPAVX1-NEXT: retq
2302;
2303; XOPAVX2-LABEL: constant_funnnel_v2i64:
2304; XOPAVX2: # %bb.0:
2305; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm1, %xmm1
2306; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
2307; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2308; XOPAVX2-NEXT: retq
2309;
2310; X32-SSE-LABEL: constant_funnnel_v2i64:
2311; X32-SSE: # %bb.0:
2312; X32-SSE-NEXT: movdqa %xmm1, %xmm2
2313; X32-SSE-NEXT: psrlq $4, %xmm2
2314; X32-SSE-NEXT: psrlq $14, %xmm1
2315; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2316; X32-SSE-NEXT: movdqa %xmm0, %xmm2
2317; X32-SSE-NEXT: psllq $60, %xmm2
2318; X32-SSE-NEXT: psllq $50, %xmm0
2319; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
2320; X32-SSE-NEXT: orpd %xmm1, %xmm0
2321; X32-SSE-NEXT: retl
2322 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>)
2323 ret <2 x i64> %res
2324}
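;
; NOTE: With the constant amounts <4, 14> the funnel shift folds to a fixed
; shift pair per lane, exactly the psrlq/psllq immediates used above:
;   lane 0: (y0 >> 4)  | (x0 << 60)
;   lane 1: (y1 >> 14) | (x1 << 50)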
2325
2326define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
2327; SSE2-LABEL: constant_funnnel_v4i32:
2328; SSE2: # %bb.0:
2329; SSE2-NEXT: movdqa %xmm1, %xmm2
2330; SSE2-NEXT: psrld $7, %xmm2
2331; SSE2-NEXT: movdqa %xmm1, %xmm3
2332; SSE2-NEXT: psrld $6, %xmm3
2333; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
2334; SSE2-NEXT: movdqa %xmm1, %xmm2
2335; SSE2-NEXT: psrld $5, %xmm2
2336; SSE2-NEXT: psrld $4, %xmm1
2337; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2338; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
2339; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [268435456,134217728,67108864,33554432]
2340; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2341; SSE2-NEXT: pmuludq %xmm2, %xmm0
2342; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2343; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2344; SSE2-NEXT: pmuludq %xmm3, %xmm2
2345; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2346; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2347; SSE2-NEXT: por %xmm1, %xmm0
2348; SSE2-NEXT: retq
2349;
2350; SSE41-LABEL: constant_funnnel_v4i32:
2351; SSE41: # %bb.0:
2352; SSE41-NEXT: movdqa %xmm1, %xmm2
2353; SSE41-NEXT: psrld $7, %xmm2
2354; SSE41-NEXT: movdqa %xmm1, %xmm3
2355; SSE41-NEXT: psrld $5, %xmm3
2356; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2357; SSE41-NEXT: movdqa %xmm1, %xmm2
2358; SSE41-NEXT: psrld $6, %xmm2
2359; SSE41-NEXT: psrld $4, %xmm1
2360; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
2361; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
2362; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
2363; SSE41-NEXT: por %xmm1, %xmm0
2364; SSE41-NEXT: retq
2365;
2366; AVX1-LABEL: constant_funnnel_v4i32:
2367; AVX1: # %bb.0:
2368; AVX1-NEXT: vpsrld $7, %xmm1, %xmm2
2369; AVX1-NEXT: vpsrld $5, %xmm1, %xmm3
2370; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2371; AVX1-NEXT: vpsrld $6, %xmm1, %xmm3
2372; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
2373; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
2374; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
2375; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2376; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2377; AVX1-NEXT: retq
2378;
2379; AVX2-LABEL: constant_funnnel_v4i32:
2380; AVX2: # %bb.0:
2381; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2382; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2383; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2384; AVX2-NEXT: retq
2385;
2386; AVX512F-LABEL: constant_funnnel_v4i32:
2387; AVX512F: # %bb.0:
2388; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2389; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2390; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2391; AVX512F-NEXT: retq
2392;
2393; AVX512VL-LABEL: constant_funnnel_v4i32:
2394; AVX512VL: # %bb.0:
2395; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2396; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2397; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2398; AVX512VL-NEXT: retq
2399;
2400; AVX512BW-LABEL: constant_funnnel_v4i32:
2401; AVX512BW: # %bb.0:
2402; AVX512BW-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2403; AVX512BW-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2404; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2405; AVX512BW-NEXT: retq
2406;
2407; AVX512VBMI2-LABEL: constant_funnnel_v4i32:
2408; AVX512VBMI2: # %bb.0:
2409; AVX512VBMI2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2410; AVX512VBMI2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2411; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
2412; AVX512VBMI2-NEXT: retq
2413;
2414; AVX512VLBW-LABEL: constant_funnnel_v4i32:
2415; AVX512VLBW: # %bb.0:
2416; AVX512VLBW-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2417; AVX512VLBW-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2418; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2419; AVX512VLBW-NEXT: retq
2420;
2421; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32:
2422; AVX512VLVBMI2: # %bb.0:
2423; AVX512VLVBMI2-NEXT: vpshrdvd {{.*}}(%rip), %xmm0, %xmm1
2424; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
2425; AVX512VLVBMI2-NEXT: retq
2426;
2427; XOPAVX1-LABEL: constant_funnnel_v4i32:
2428; XOPAVX1: # %bb.0:
2429; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm1, %xmm1
2430; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
2431; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
2432; XOPAVX1-NEXT: retq
2433;
2434; XOPAVX2-LABEL: constant_funnnel_v4i32:
2435; XOPAVX2: # %bb.0:
2436; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2437; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
2438; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2439; XOPAVX2-NEXT: retq
2440;
2441; X32-SSE-LABEL: constant_funnnel_v4i32:
2442; X32-SSE: # %bb.0:
2443; X32-SSE-NEXT: movdqa %xmm1, %xmm2
2444; X32-SSE-NEXT: psrld $7, %xmm2
2445; X32-SSE-NEXT: movdqa %xmm1, %xmm3
2446; X32-SSE-NEXT: psrld $6, %xmm3
2447; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
2448; X32-SSE-NEXT: movdqa %xmm1, %xmm2
2449; X32-SSE-NEXT: psrld $5, %xmm2
2450; X32-SSE-NEXT: psrld $4, %xmm1
2451; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2452; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
2453; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [268435456,134217728,67108864,33554432]
2454; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2455; X32-SSE-NEXT: pmuludq %xmm2, %xmm0
2456; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2457; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2458; X32-SSE-NEXT: pmuludq %xmm3, %xmm2
2459; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2460; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2461; X32-SSE-NEXT: por %xmm1, %xmm0
2462; X32-SSE-NEXT: retl
2463 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
2464 ret <4 x i32> %res
2465}
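;
; NOTE: SSE has no per-lane variable i32 shift, so the left shifts by
; <28, 27, 26, 25> (i.e. 32 - amt) are folded into one multiply, using
; x << k == x * 2^k:
;   [268435456, 134217728, 67108864, 33554432] = [2^28, 2^27, 2^26, 2^25]
; while the right shifts by <4, 5, 6, 7> are built from immediate psrld
; plus shuffles/blends.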
2466
2467define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
2468; SSE2-LABEL: constant_funnnel_v8i16:
2469; SSE2: # %bb.0:
2470; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
2471; SSE2-NEXT: movdqa %xmm2, %xmm3
2472; SSE2-NEXT: pandn %xmm1, %xmm3
2473; SSE2-NEXT: movdqa {{.*#+}} xmm4 = <u,32768,16384,8192,4096,2048,1024,512>
2474; SSE2-NEXT: pmulhuw %xmm4, %xmm1
2475; SSE2-NEXT: pand %xmm2, %xmm1
2476; SSE2-NEXT: pmullw %xmm4, %xmm0
2477; SSE2-NEXT: por %xmm3, %xmm0
2478; SSE2-NEXT: por %xmm1, %xmm0
2479; SSE2-NEXT: pand %xmm2, %xmm0
2480; SSE2-NEXT: por %xmm3, %xmm0
2481; SSE2-NEXT: retq
2482;
2483; SSE41-LABEL: constant_funnnel_v8i16:
2484; SSE41: # %bb.0:
2485; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2486; SSE41-NEXT: movdqa %xmm1, %xmm3
2487; SSE41-NEXT: pmulhuw %xmm2, %xmm3
2488; SSE41-NEXT: pmullw %xmm2, %xmm0
2489; SSE41-NEXT: por %xmm3, %xmm0
2490; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2491; SSE41-NEXT: retq
2492;
2493; AVX-LABEL: constant_funnnel_v8i16:
2494; AVX: # %bb.0:
2495; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2496; AVX-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
2497; AVX-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2498; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
2499; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2500; AVX-NEXT: retq
2501;
2502; AVX512F-LABEL: constant_funnnel_v8i16:
2503; AVX512F: # %bb.0:
2504; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2505; AVX512F-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
2506; AVX512F-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2507; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
2508; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2509; AVX512F-NEXT: retq
2510;
2511; AVX512VL-LABEL: constant_funnnel_v8i16:
2512; AVX512VL: # %bb.0:
2513; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
2514; AVX512VL-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
2515; AVX512VL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
2516; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
2517; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2518; AVX512VL-NEXT: retq
2519;
2520; AVX512BW-LABEL: constant_funnnel_v8i16:
2521; AVX512BW: # %bb.0:
2522; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2523; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2524; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
2525; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm2
2526; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,15,14,13,12,11,10,9]
2527; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
2528; AVX512BW-NEXT: vpor %xmm2, %xmm0, %xmm0
2529; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2530; AVX512BW-NEXT: vzeroupper
2531; AVX512BW-NEXT: retq
2532;
2533; AVX512VBMI2-LABEL: constant_funnnel_v8i16:
2534; AVX512VBMI2: # %bb.0:
2535; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
2536; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2537; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
2538; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm2
2539; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,15,14,13,12,11,10,9]
2540; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
2541; AVX512VBMI2-NEXT: vpor %xmm2, %xmm0, %xmm0
2542; AVX512VBMI2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2543; AVX512VBMI2-NEXT: vzeroupper
2544; AVX512VBMI2-NEXT: retq
2545;
2546; AVX512VLBW-LABEL: constant_funnnel_v8i16:
2547; AVX512VLBW: # %bb.0:
2548; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %xmm1, %xmm2
2549; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
2550; AVX512VLBW-NEXT: vpor %xmm2, %xmm0, %xmm0
2551; AVX512VLBW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2552; AVX512VLBW-NEXT: retq
2553;
2554; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16:
2555; AVX512VLVBMI2: # %bb.0:
2556; AVX512VLVBMI2-NEXT: vpshrdvw {{.*}}(%rip), %xmm0, %xmm1
2557; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
2558; AVX512VLVBMI2-NEXT: retq
2559;
2560; XOP-LABEL: constant_funnnel_v8i16:
2561; XOP: # %bb.0:
2562; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm1, %xmm2
2563; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
2564; XOP-NEXT: vpor %xmm2, %xmm0, %xmm0
2565; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2566; XOP-NEXT: retq
2567;
2568; X32-SSE-LABEL: constant_funnnel_v8i16:
2569; X32-SSE: # %bb.0:
2570; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
2571; X32-SSE-NEXT: movdqa %xmm2, %xmm3
2572; X32-SSE-NEXT: pandn %xmm1, %xmm3
2573; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = <u,32768,16384,8192,4096,2048,1024,512>
2574; X32-SSE-NEXT: pmulhuw %xmm4, %xmm1
2575; X32-SSE-NEXT: pand %xmm2, %xmm1
2576; X32-SSE-NEXT: pmullw %xmm4, %xmm0
2577; X32-SSE-NEXT: por %xmm3, %xmm0
2578; X32-SSE-NEXT: por %xmm1, %xmm0
2579; X32-SSE-NEXT: pand %xmm2, %xmm0
2580; X32-SSE-NEXT: por %xmm3, %xmm0
2581; X32-SSE-NEXT: retl
2582 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
2583 ret <8 x i16> %res
2584}
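;
; NOTE: Lane 0 uses amount 0, for which fshr must return %y unchanged; hence
; the undef lane in the <u,32768,...> multiplier and the final blend/mask that
; restores xmm1's lane 0. For the nonzero lanes the shifts become multiplies:
; pmulhuw by 2^(16-n) gives y >> n and pmullw by the same constant gives
; x << (16-n), e.g. 32768 = 2^15 for n = 1 down to 512 = 2^9 for n = 7.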
2585
2586define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
2587; SSE2-LABEL: constant_funnnel_v16i8:
2588; SSE2: # %bb.0:
2589; SSE2-NEXT: pxor %xmm2, %xmm2
2590; SSE2-NEXT: movdqa %xmm1, %xmm3
2591; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2592; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3
2593; SSE2-NEXT: psrlw $8, %xmm3
2594; SSE2-NEXT: movdqa %xmm1, %xmm4
2595; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
2596; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm4
2597; SSE2-NEXT: psrlw $8, %xmm4
2598; SSE2-NEXT: packuswb %xmm3, %xmm4
2599; SSE2-NEXT: movdqa %xmm0, %xmm2
2600; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2601; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm2
2602; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2603; SSE2-NEXT: pand %xmm3, %xmm2
2604; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2605; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
2606; SSE2-NEXT: pand %xmm3, %xmm0
2607; SSE2-NEXT: packuswb %xmm2, %xmm0
2608; SSE2-NEXT: por %xmm4, %xmm0
2609; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2610; SSE2-NEXT: pand %xmm2, %xmm0
2611; SSE2-NEXT: pandn %xmm1, %xmm2
2612; SSE2-NEXT: por %xmm2, %xmm0
2613; SSE2-NEXT: retq
2614;
2615; SSE41-LABEL: constant_funnnel_v16i8:
2616; SSE41: # %bb.0:
2617; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2618; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2619; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
2620; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2621; SSE41-NEXT: pand %xmm3, %xmm0
2622; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm2
2623; SSE41-NEXT: pand %xmm3, %xmm2
2624; SSE41-NEXT: packuswb %xmm0, %xmm2
2625; SSE41-NEXT: pxor %xmm0, %xmm0
2626; SSE41-NEXT: movdqa %xmm1, %xmm3
2627; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2628; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm3
2629; SSE41-NEXT: psrlw $8, %xmm3
2630; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2631; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm4
2632; SSE41-NEXT: psrlw $8, %xmm4
2633; SSE41-NEXT: packuswb %xmm3, %xmm4
2634; SSE41-NEXT: por %xmm2, %xmm4
2635; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2636; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
2637; SSE41-NEXT: movdqa %xmm1, %xmm0
2638; SSE41-NEXT: retq
2639;
2640; AVX1-LABEL: constant_funnnel_v16i8:
2641; AVX1: # %bb.0:
2642; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2643; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
2644; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2645; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
2646; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2647; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2648; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
2649; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
2650; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
2651; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2652; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2
2653; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
2654; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2655; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3
2656; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
2657; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
2658; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
2659; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2660; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2661; AVX1-NEXT: retq
2662;
2663; AVX2-LABEL: constant_funnnel_v16i8:
2664; AVX2: # %bb.0:
2665; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2666; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
2667; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
2668; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2669; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
2670; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2671; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
2672; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
2673; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
2674; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
2675; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0
2676; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2677; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2678; AVX2-NEXT: vzeroupper
2679; AVX2-NEXT: retq
2680;
2681; AVX512F-LABEL: constant_funnnel_v16i8:
2682; AVX512F: # %bb.0:
2683; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2684; AVX512F-NEXT: vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
2685; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2686; AVX512F-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
2687; AVX512F-NEXT: vpord %zmm2, %zmm0, %zmm0
2688; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
2689; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2690; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2691; AVX512F-NEXT: vzeroupper
2692; AVX512F-NEXT: retq
2693;
2694; AVX512VL-LABEL: constant_funnnel_v16i8:
2695; AVX512VL: # %bb.0:
2696; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2697; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm2, %zmm2
2698; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2699; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
2700; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0
2701; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
2702; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2703; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2704; AVX512VL-NEXT: vzeroupper
2705; AVX512VL-NEXT: retq
2706;
2707; AVX512BW-LABEL: constant_funnnel_v16i8:
2708; AVX512BW: # %bb.0:
2709; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2710; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2711; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm3, %zmm2
2712; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [8,7,6,5,4,3,2,1,8,1,2,3,4,5,6,7]
2713; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2714; AVX512BW-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
2715; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0
2716; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
2717; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2718; AVX512BW-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2719; AVX512BW-NEXT: vzeroupper
2720; AVX512BW-NEXT: retq
2721;
2722; AVX512VBMI2-LABEL: constant_funnnel_v16i8:
2723; AVX512VBMI2: # %bb.0:
2724; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2725; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2726; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm3, %zmm2
2727; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [8,7,6,5,4,3,2,1,8,1,2,3,4,5,6,7]
2728; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2729; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm0
2730; AVX512VBMI2-NEXT: vpor %ymm2, %ymm0, %ymm0
2731; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
2732; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2733; AVX512VBMI2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2734; AVX512VBMI2-NEXT: vzeroupper
2735; AVX512VBMI2-NEXT: retq
2736;
2737; AVX512VLBW-LABEL: constant_funnnel_v16i8:
2738; AVX512VLBW: # %bb.0:
2739; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2740; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
2741; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2742; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
2743; AVX512VLBW-NEXT: vpor %ymm2, %ymm0, %ymm0
2744; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
2745; AVX512VLBW-NEXT: movw $257, %ax # imm = 0x101
2746; AVX512VLBW-NEXT: kmovd %eax, %k1
2747; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
2748; AVX512VLBW-NEXT: vzeroupper
2749; AVX512VLBW-NEXT: retq
2750;
2751; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8:
2752; AVX512VLVBMI2: # %bb.0:
2753; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2754; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
2755; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2756; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
2757; AVX512VLVBMI2-NEXT: vpor %ymm2, %ymm0, %ymm0
2758; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
2759; AVX512VLVBMI2-NEXT: movw $257, %ax # imm = 0x101
2760; AVX512VLVBMI2-NEXT: kmovd %eax, %k1
2761; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
2762; AVX512VLVBMI2-NEXT: vzeroupper
2763; AVX512VLVBMI2-NEXT: retq
2764;
2765; XOP-LABEL: constant_funnnel_v16i8:
2766; XOP: # %bb.0:
2767; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm2
2768; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
2769; XOP-NEXT: vpor %xmm2, %xmm0, %xmm0
2770; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2771; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
2772; XOP-NEXT: retq
2773;
2774; X32-SSE-LABEL: constant_funnnel_v16i8:
2775; X32-SSE: # %bb.0:
2776; X32-SSE-NEXT: pxor %xmm2, %xmm2
2777; X32-SSE-NEXT: movdqa %xmm1, %xmm3
2778; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2779; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm3
2780; X32-SSE-NEXT: psrlw $8, %xmm3
2781; X32-SSE-NEXT: movdqa %xmm1, %xmm4
2782; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
2783; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm4
2784; X32-SSE-NEXT: psrlw $8, %xmm4
2785; X32-SSE-NEXT: packuswb %xmm3, %xmm4
2786; X32-SSE-NEXT: movdqa %xmm0, %xmm2
2787; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2788; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm2
2789; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2790; X32-SSE-NEXT: pand %xmm3, %xmm2
2791; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2792; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
2793; X32-SSE-NEXT: pand %xmm3, %xmm0
2794; X32-SSE-NEXT: packuswb %xmm2, %xmm0
2795; X32-SSE-NEXT: por %xmm4, %xmm0
2796; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
2797; X32-SSE-NEXT: pand %xmm2, %xmm0
2798; X32-SSE-NEXT: pandn %xmm1, %xmm2
2799; X32-SSE-NEXT: por %xmm2, %xmm0
2800; X32-SSE-NEXT: retl
2801 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
2802 ret <16 x i8> %res
2803}
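;
; NOTE: The byte amounts <0,1,...,7,8,7,...,1> reduce modulo 8, so lanes 0 and
; 8 both have an effective amount of 0 and must return %y; that is the purpose
; of the [0,255,...] blend vectors and the movw $257 (0x101) write mask above.
; With no native byte shifts, the lanes are widened to i16 (or i32 on
; AVX512F/VL), shifted as multiplies or variable shifts, and narrowed back.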
2804
2805;
2806; Uniform Constant Shifts
2807;
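; For a uniform constant amount c with c % BW != 0, fshr needs no zero-amount
; guard and reduces per lane to r = (y >> c) | (x << (BW - c)).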
2808
2809define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
2810; SSE-LABEL: splatconstant_funnnel_v2i64:
2811; SSE: # %bb.0:
2812; SSE-NEXT: psrlq $14, %xmm1
2813; SSE-NEXT: psllq $50, %xmm0
2814; SSE-NEXT: por %xmm1, %xmm0
2815; SSE-NEXT: retq
2816;
2817; AVX-LABEL: splatconstant_funnnel_v2i64:
2818; AVX: # %bb.0:
2819; AVX-NEXT: vpsrlq $14, %xmm1, %xmm1
2820; AVX-NEXT: vpsllq $50, %xmm0, %xmm0
2821; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2822; AVX-NEXT: retq
2823;
2824; AVX512F-LABEL: splatconstant_funnnel_v2i64:
2825; AVX512F: # %bb.0:
2826; AVX512F-NEXT: vpsrlq $14, %xmm1, %xmm1
2827; AVX512F-NEXT: vpsllq $50, %xmm0, %xmm0
2828; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2829; AVX512F-NEXT: retq
2830;
2831; AVX512VL-LABEL: splatconstant_funnnel_v2i64:
2832; AVX512VL: # %bb.0:
2833; AVX512VL-NEXT: vpsrlq $14, %xmm1, %xmm1
2834; AVX512VL-NEXT: vpsllq $50, %xmm0, %xmm0
2835; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2836; AVX512VL-NEXT: retq
2837;
2838; AVX512BW-LABEL: splatconstant_funnnel_v2i64:
2839; AVX512BW: # %bb.0:
2840; AVX512BW-NEXT: vpsrlq $14, %xmm1, %xmm1
2841; AVX512BW-NEXT: vpsllq $50, %xmm0, %xmm0
2842; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2843; AVX512BW-NEXT: retq
2844;
2845; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64:
2846; AVX512VBMI2: # %bb.0:
2847; AVX512VBMI2-NEXT: vpsrlq $14, %xmm1, %xmm1
2848; AVX512VBMI2-NEXT: vpsllq $50, %xmm0, %xmm0
2849; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
2850; AVX512VBMI2-NEXT: retq
2851;
2852; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64:
2853; AVX512VLBW: # %bb.0:
2854; AVX512VLBW-NEXT: vpsrlq $14, %xmm1, %xmm1
2855; AVX512VLBW-NEXT: vpsllq $50, %xmm0, %xmm0
2856; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2857; AVX512VLBW-NEXT: retq
2858;
2859; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64:
2860; AVX512VLVBMI2: # %bb.0:
2861; AVX512VLVBMI2-NEXT: vpshrdq $14, %xmm0, %xmm1, %xmm0
2862; AVX512VLVBMI2-NEXT: retq
2863;
2864; XOP-LABEL: splatconstant_funnnel_v2i64:
2865; XOP: # %bb.0:
2866; XOP-NEXT: vpsrlq $14, %xmm1, %xmm1
2867; XOP-NEXT: vpsllq $50, %xmm0, %xmm0
2868; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2869; XOP-NEXT: retq
2870;
2871; X32-SSE-LABEL: splatconstant_funnnel_v2i64:
2872; X32-SSE: # %bb.0:
2873; X32-SSE-NEXT: psrlq $14, %xmm1
2874; X32-SSE-NEXT: psllq $50, %xmm0
2875; X32-SSE-NEXT: por %xmm1, %xmm0
2876; X32-SSE-NEXT: retl
2877 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
2878 ret <2 x i64> %res
2879}
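;
; NOTE: AVX512VBMI2 provides native double-shifts, so the VL+VBMI2 run above
; collapses the whole funnel shift into a single vpshrdq $14.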
2880
2881define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
2882; SSE-LABEL: splatconstant_funnnel_v4i32:
2883; SSE: # %bb.0:
2884; SSE-NEXT: psrld $4, %xmm1
2885; SSE-NEXT: pslld $28, %xmm0
2886; SSE-NEXT: por %xmm1, %xmm0
2887; SSE-NEXT: retq
2888;
2889; AVX-LABEL: splatconstant_funnnel_v4i32:
2890; AVX: # %bb.0:
2891; AVX-NEXT: vpsrld $4, %xmm1, %xmm1
2892; AVX-NEXT: vpslld $28, %xmm0, %xmm0
2893; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2894; AVX-NEXT: retq
2895;
2896; AVX512F-LABEL: splatconstant_funnnel_v4i32:
2897; AVX512F: # %bb.0:
2898; AVX512F-NEXT: vpsrld $4, %xmm1, %xmm1
2899; AVX512F-NEXT: vpslld $28, %xmm0, %xmm0
2900; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2901; AVX512F-NEXT: retq
2902;
2903; AVX512VL-LABEL: splatconstant_funnnel_v4i32:
2904; AVX512VL: # %bb.0:
2905; AVX512VL-NEXT: vpsrld $4, %xmm1, %xmm1
2906; AVX512VL-NEXT: vpslld $28, %xmm0, %xmm0
2907; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2908; AVX512VL-NEXT: retq
2909;
2910; AVX512BW-LABEL: splatconstant_funnnel_v4i32:
2911; AVX512BW: # %bb.0:
2912; AVX512BW-NEXT: vpsrld $4, %xmm1, %xmm1
2913; AVX512BW-NEXT: vpslld $28, %xmm0, %xmm0
2914; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2915; AVX512BW-NEXT: retq
2916;
2917; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32:
2918; AVX512VBMI2: # %bb.0:
2919; AVX512VBMI2-NEXT: vpsrld $4, %xmm1, %xmm1
2920; AVX512VBMI2-NEXT: vpslld $28, %xmm0, %xmm0
2921; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
2922; AVX512VBMI2-NEXT: retq
2923;
2924; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32:
2925; AVX512VLBW: # %bb.0:
2926; AVX512VLBW-NEXT: vpsrld $4, %xmm1, %xmm1
2927; AVX512VLBW-NEXT: vpslld $28, %xmm0, %xmm0
2928; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
2929; AVX512VLBW-NEXT: retq
2930;
2931; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32:
2932; AVX512VLVBMI2: # %bb.0:
2933; AVX512VLVBMI2-NEXT: vpshrdd $4, %xmm0, %xmm1, %xmm0
2934; AVX512VLVBMI2-NEXT: retq
2935;
2936; XOP-LABEL: splatconstant_funnnel_v4i32:
2937; XOP: # %bb.0:
2938; XOP-NEXT: vpsrld $4, %xmm1, %xmm1
2939; XOP-NEXT: vpslld $28, %xmm0, %xmm0
2940; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
2941; XOP-NEXT: retq
2942;
2943; X32-SSE-LABEL: splatconstant_funnnel_v4i32:
2944; X32-SSE: # %bb.0:
2945; X32-SSE-NEXT: psrld $4, %xmm1
2946; X32-SSE-NEXT: pslld $28, %xmm0
2947; X32-SSE-NEXT: por %xmm1, %xmm0
2948; X32-SSE-NEXT: retl
2949 %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
2950 ret <4 x i32> %res
2951}
2952
2953define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
2954; SSE-LABEL: splatconstant_funnnel_v8i16:
2955; SSE: # %bb.0:
2956; SSE-NEXT: psrlw $7, %xmm1
2957; SSE-NEXT: psllw $9, %xmm0
2958; SSE-NEXT: por %xmm1, %xmm0
2959; SSE-NEXT: retq
2960;
2961; AVX-LABEL: splatconstant_funnnel_v8i16:
2962; AVX: # %bb.0:
2963; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1
2964; AVX-NEXT: vpsllw $9, %xmm0, %xmm0
2965; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2966; AVX-NEXT: retq
2967;
2968; AVX512F-LABEL: splatconstant_funnnel_v8i16:
2969; AVX512F: # %bb.0:
2970; AVX512F-NEXT: vpsrlw $7, %xmm1, %xmm1
2971; AVX512F-NEXT: vpsllw $9, %xmm0, %xmm0
2972; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
2973; AVX512F-NEXT: retq
2974;
2975; AVX512VL-LABEL: splatconstant_funnnel_v8i16:
2976; AVX512VL: # %bb.0:
2977; AVX512VL-NEXT: vpsrlw $7, %xmm1, %xmm1
2978; AVX512VL-NEXT: vpsllw $9, %xmm0, %xmm0
2979; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
2980; AVX512VL-NEXT: retq
2981;
2982; AVX512BW-LABEL: splatconstant_funnnel_v8i16:
2983; AVX512BW: # %bb.0:
2984; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
2985; AVX512BW-NEXT: vpsllw $9, %xmm0, %xmm0
2986; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
2987; AVX512BW-NEXT: retq
2988;
2989; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i16:
2990; AVX512VBMI2: # %bb.0:
2991; AVX512VBMI2-NEXT: vpsrlw $7, %xmm1, %xmm1
2992; AVX512VBMI2-NEXT: vpsllw $9, %xmm0, %xmm0
2993; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
2994; AVX512VBMI2-NEXT: retq
2995;
2996; AVX512VLBW-LABEL: splatconstant_funnnel_v8i16:
2997; AVX512VLBW: # %bb.0:
2998; AVX512VLBW-NEXT: vpsrlw $7, %xmm1, %xmm1
2999; AVX512VLBW-NEXT: vpsllw $9, %xmm0, %xmm0
3000; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
3001; AVX512VLBW-NEXT: retq
3002;
3003; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i16:
3004; AVX512VLVBMI2: # %bb.0:
3005; AVX512VLVBMI2-NEXT: vpshrdw $7, %xmm0, %xmm1, %xmm0
3006; AVX512VLVBMI2-NEXT: retq
3007; XOP-LABEL: splatconstant_funnnel_v8i16:
3008; XOP-LABEL: splatconstant_funnnel_v8i16:
3009; XOP: # %bb.0:
3010; XOP-NEXT: vpsrlw $7, %xmm1, %xmm1
3011; XOP-NEXT: vpsllw $9, %xmm0, %xmm0
3012; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
3013; XOP-NEXT: retq
3014;
3015; X32-SSE-LABEL: splatconstant_funnnel_v8i16:
3016; X32-SSE: # %bb.0:
3017; X32-SSE-NEXT: psrlw $7, %xmm1
3018; X32-SSE-NEXT: psllw $9, %xmm0
3019; X32-SSE-NEXT: por %xmm1, %xmm0
3020; X32-SSE-NEXT: retl
3021 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
3022 ret <8 x i16> %res
3023}
3024
3025define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
3026; SSE-LABEL: splatconstant_funnnel_v16i8:
3027; SSE: # %bb.0:
3028; SSE-NEXT: psrlw $4, %xmm1
3029; SSE-NEXT: pand {{.*}}(%rip), %xmm1
3030; SSE-NEXT: psllw $4, %xmm0
3031; SSE-NEXT: pand {{.*}}(%rip), %xmm0
3032; SSE-NEXT: por %xmm1, %xmm0
3033; SSE-NEXT: retq
3034;
3035; AVX-LABEL: splatconstant_funnnel_v16i8:
3036; AVX: # %bb.0:
3037; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1
3038; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
3039; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
3040; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3041; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
3042; AVX-NEXT: retq
3043;
3044; AVX512-LABEL: splatconstant_funnnel_v16i8:
3045; AVX512: # %bb.0:
3046; AVX512-NEXT: vpsrlw $4, %xmm1, %xmm1
3047; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
3048; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0
3049; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
3050; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
3051; AVX512-NEXT: retq
3052;
3053; XOP-LABEL: splatconstant_funnnel_v16i8:
3054; XOP: # %bb.0:
3055; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm1, %xmm1
3056; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
3057; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
3058; XOP-NEXT: retq
3059;
3060; X32-SSE-LABEL: splatconstant_funnnel_v16i8:
3061; X32-SSE: # %bb.0:
3062; X32-SSE-NEXT: psrlw $4, %xmm1
3063; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
3064; X32-SSE-NEXT: psllw $4, %xmm0
3065; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
3066; X32-SSE-NEXT: por %xmm1, %xmm0
3067; X32-SSE-NEXT: retl
3068 %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
3069 ret <16 x i8> %res
3070}
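;
; NOTE: There is no 8-bit psrl/psll on x86, so the byte lanes above use the
; 16-bit shifts and clear the bits that crossed byte boundaries with a mask:
; per byte, (y >> 4) & 0x0f and (x << 4) & 0xf0, then por. XOP instead uses
; vpshlb, whose per-byte signed shift count handles both directions directly.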