; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F

;
; General cases - packing of vector comparison to legal vector result types
;

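; Each test below materializes a vector select from a comparison mask via the
; (mask & %a2) | (~mask & %a3) idiom; the point of interest is how the wide
; compare result gets packed down to the legal v16i8 mask type.

; v16i16 equality mask: one PCMPEQW per 128-bit half, folded to bytes with a
; single PACKSSWB (pre-AVX2 targets split the 256-bit inputs first).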
define <16 x i8> @vselect_packss_v16i16(<16 x i16> %a0, <16 x i16> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE2-LABEL: vselect_packss_v16i16:
; SSE2: # BB#0:
; SSE2-NEXT: pcmpeqw %xmm3, %xmm1
; SSE2-NEXT: pcmpeqw %xmm2, %xmm0
; SSE2-NEXT: packsswb %xmm1, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm4
; SSE2-NEXT: pandn %xmm5, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: vselect_packss_v16i16:
; SSE42: # BB#0:
; SSE42-NEXT: pcmpeqw %xmm3, %xmm1
; SSE42-NEXT: pcmpeqw %xmm2, %xmm0
; SSE42-NEXT: packsswb %xmm1, %xmm0
; SSE42-NEXT: pblendvb %xmm0, %xmm4, %xmm5
; SSE42-NEXT: movdqa %xmm5, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: vselect_packss_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: vselect_packss_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: vselect_packss_v16i16:
; AVX512: # BB#0:
; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm1
; AVX512-NEXT: vpandn %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = icmp eq <16 x i16> %a0, %a1
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = and <16 x i8> %2, %a2
  %4 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %5 = and <16 x i8> %4, %a3
  %6 = or <16 x i8> %3, %5
  ret <16 x i8> %6
}

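; v16i32 equality mask: the dword compare results are folded down to bytes
; through two stages of PACKSSWB (safe here because the lanes are all-ones or
; all-zeros); AVX512F compares straight into a mask register and truncates the
; rematerialized mask with VPMOVDB.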
define <16 x i8> @vselect_packss_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE-LABEL: vselect_packss_v16i32:
; SSE: # BB#0:
; SSE-NEXT: pcmpeqd %xmm7, %xmm3
; SSE-NEXT: pcmpeqd %xmm6, %xmm2
; SSE-NEXT: packsswb %xmm3, %xmm2
; SSE-NEXT: pcmpeqd %xmm5, %xmm1
; SSE-NEXT: pcmpeqd %xmm4, %xmm0
; SSE-NEXT: packsswb %xmm1, %xmm0
; SSE-NEXT: packsswb %xmm2, %xmm0
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: vselect_packss_v16i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpacksswb %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1
; AVX1-NEXT: vpandn %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: vselect_packss_v16i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm1
; AVX2-NEXT: vpandn %xmm5, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: vselect_packss_v16i32:
; AVX512: # BB#0:
; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm1
; AVX512-NEXT: vpandn %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = icmp eq <16 x i32> %a0, %a1
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = and <16 x i8> %2, %a2
  %4 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %5 = and <16 x i8> %4, %a3
  %6 = or <16 x i8> %3, %5
  ret <16 x i8> %6
}

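; v16i64 equality mask. SSE2 has no PCMPEQQ, so each 64-bit compare is emulated
; with PCMPEQD plus a PSHUFD/PAND of the swapped dword halves. Note the AVX512F
; lowering as of these autogenerated checks: the qword compare is fully
; scalarized through GPRs (CMPQ/CMOVEQ) and repacked, rather than compared in a
; mask register.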
define <16 x i8> @vselect_packss_v16i64(<16 x i64> %a0, <16 x i64> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE2-LABEL: vselect_packss_v16i64:
; SSE2: # BB#0:
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,0,3,2]
; SSE2-NEXT: pand %xmm7, %xmm8
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,0,3,2]
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: packsswb %xmm8, %xmm7
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2]
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: packsswb %xmm6, %xmm5
; SSE2-NEXT: packsswb %xmm7, %xmm5
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,0,3,2]
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: packsswb %xmm4, %xmm3
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: packsswb %xmm2, %xmm1
; SSE2-NEXT: packsswb %xmm3, %xmm1
; SSE2-NEXT: packsswb %xmm5, %xmm1
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: vselect_packss_v16i64:
; SSE42: # BB#0:
; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm7
; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm6
; SSE42-NEXT: packsswb %xmm7, %xmm6
; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm5
; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm4
; SSE42-NEXT: packsswb %xmm5, %xmm4
; SSE42-NEXT: packsswb %xmm6, %xmm4
; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm3
; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm2
; SSE42-NEXT: packsswb %xmm3, %xmm2
; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm1
; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm0
; SSE42-NEXT: packsswb %xmm1, %xmm0
; SSE42-NEXT: packsswb %xmm2, %xmm0
; SSE42-NEXT: packsswb %xmm4, %xmm0
; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
; SSE42-NEXT: pand %xmm0, %xmm1
; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0
; SSE42-NEXT: por %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: vselect_packss_v16i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
; AVX1-NEXT: vpcmpeqq %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpcmpeqq %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpacksswb %xmm8, %xmm3, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpacksswb %xmm8, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; AVX1-NEXT: vpcmpeqq %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
; AVX1-NEXT: vpandn {{[0-9]+}}(%rsp), %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: vselect_packss_v16i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpcmpeqq %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpcmpeqq %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpacksswb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpcmpeqq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpacksswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
; AVX2-NEXT: vpandn {{[0-9]+}}(%rsp), %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: vselect_packss_v16i64:
; AVX512: # BB#0:
; AVX512-NEXT: vextracti32x4 $3, %zmm2, %xmm6
; AVX512-NEXT: vpextrq $1, %xmm6, %rcx
; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm7
; AVX512-NEXT: vpextrq $1, %xmm7, %rdx
; AVX512-NEXT: xorl %eax, %eax
; AVX512-NEXT: cmpq %rcx, %rdx
; AVX512-NEXT: movq $-1, %rcx
; AVX512-NEXT: movl $0, %edx
; AVX512-NEXT: cmoveq %rcx, %rdx
; AVX512-NEXT: vmovq %rdx, %xmm8
; AVX512-NEXT: vmovq %xmm6, %rdx
; AVX512-NEXT: vmovq %xmm7, %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
; AVX512-NEXT: movl $0, %edx
; AVX512-NEXT: cmoveq %rcx, %rdx
; AVX512-NEXT: vmovq %rdx, %xmm6
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm6[0],xmm8[0]
; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm7
; AVX512-NEXT: vpextrq $1, %xmm7, %rdx
; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm6
; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
; AVX512-NEXT: movl $0, %edx
; AVX512-NEXT: cmoveq %rcx, %rdx
; AVX512-NEXT: vmovq %rdx, %xmm9
; AVX512-NEXT: vmovq %xmm7, %rdx
; AVX512-NEXT: vmovq %xmm6, %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
; AVX512-NEXT: movl $0, %edx
; AVX512-NEXT: cmoveq %rcx, %rdx
; AVX512-NEXT: vmovq %rdx, %xmm6
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm9[0]
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm8
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm7
; AVX512-NEXT: vpextrq $1, %xmm7, %rdx
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
; AVX512-NEXT: movl $0, %edx
; AVX512-NEXT: cmoveq %rcx, %rdx
; AVX512-NEXT: vmovq %rdx, %xmm9
; AVX512-NEXT: vmovq %xmm7, %rdx
; AVX512-NEXT: vmovq %xmm6, %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
; AVX512-NEXT: movl $0, %edx
; AVX512-NEXT: cmoveq %rcx, %rdx
; AVX512-NEXT: vmovq %rdx, %xmm6
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm9[0]
; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
; AVX512-NEXT: movl $0, %edx
; AVX512-NEXT: cmoveq %rcx, %rdx
; AVX512-NEXT: vmovq %rdx, %xmm7
; AVX512-NEXT: vmovq %xmm2, %rdx
; AVX512-NEXT: vmovq %xmm0, %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
; AVX512-NEXT: movl $0, %edx
; AVX512-NEXT: cmoveq %rcx, %rdx
; AVX512-NEXT: vmovq %rdx, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm8
; AVX512-NEXT: vextracti32x4 $3, %zmm3, %xmm2
; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm6
; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
; AVX512-NEXT: movl $0, %edx
; AVX512-NEXT: cmoveq %rcx, %rdx
; AVX512-NEXT: vmovq %rdx, %xmm7
; AVX512-NEXT: vmovq %xmm2, %rdx
; AVX512-NEXT: vmovq %xmm6, %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
; AVX512-NEXT: movl $0, %edx
; AVX512-NEXT: cmoveq %rcx, %rdx
; AVX512-NEXT: vmovq %rdx, %xmm2
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0]
; AVX512-NEXT: vextracti32x4 $2, %zmm3, %xmm6
; AVX512-NEXT: vpextrq $1, %xmm6, %rdx
; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm7
; AVX512-NEXT: vpextrq $1, %xmm7, %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
; AVX512-NEXT: movl $0, %edx
; AVX512-NEXT: cmoveq %rcx, %rdx
; AVX512-NEXT: vmovq %rdx, %xmm0
; AVX512-NEXT: vmovq %xmm6, %rdx
; AVX512-NEXT: vmovq %xmm7, %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
; AVX512-NEXT: movl $0, %edx
; AVX512-NEXT: cmoveq %rcx, %rdx
; AVX512-NEXT: vmovq %rdx, %xmm6
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm0
; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm6
; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
; AVX512-NEXT: movl $0, %edx
; AVX512-NEXT: cmoveq %rcx, %rdx
; AVX512-NEXT: vmovq %rdx, %xmm7
; AVX512-NEXT: vmovq %xmm0, %rdx
; AVX512-NEXT: vmovq %xmm6, %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
; AVX512-NEXT: movl $0, %edx
; AVX512-NEXT: cmoveq %rcx, %rdx
; AVX512-NEXT: vmovq %rdx, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
; AVX512-NEXT: vpextrq $1, %xmm3, %rdx
; AVX512-NEXT: vpextrq $1, %xmm1, %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
; AVX512-NEXT: movl $0, %edx
; AVX512-NEXT: cmoveq %rcx, %rdx
; AVX512-NEXT: vmovq %rdx, %xmm6
; AVX512-NEXT: vmovq %xmm3, %rdx
; AVX512-NEXT: vmovq %xmm1, %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
; AVX512-NEXT: cmoveq %rcx, %rax
; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpand %xmm4, %xmm0, %xmm1
; AVX512-NEXT: vpandn %xmm5, %xmm0, %xmm0
; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = icmp eq <16 x i64> %a0, %a1
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = and <16 x i8> %2, %a2
  %4 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %5 = and <16 x i8> %4, %a3
  %6 = or <16 x i8> %3, %5
  ret <16 x i8> %6
}

;
; PACKSS case
;

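; Same select idiom as above, but here the pack is written explicitly: the
; sign-extended compare result is split into its two v8i16 halves and narrowed
; with the llvm.x86.sse2.packsswb.128 intrinsic, so the backend should see the
; PACKSSWB directly.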
define <16 x i8> @vselect_packss(<16 x i16> %a0, <16 x i16> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE2-LABEL: vselect_packss:
; SSE2: # BB#0:
; SSE2-NEXT: pcmpeqw %xmm3, %xmm1
; SSE2-NEXT: pcmpeqw %xmm2, %xmm0
; SSE2-NEXT: packsswb %xmm1, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm4
; SSE2-NEXT: pandn %xmm5, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: vselect_packss:
; SSE42: # BB#0:
; SSE42-NEXT: pcmpeqw %xmm3, %xmm1
; SSE42-NEXT: pcmpeqw %xmm2, %xmm0
; SSE42-NEXT: packsswb %xmm1, %xmm0
; SSE42-NEXT: pblendvb %xmm0, %xmm4, %xmm5
; SSE42-NEXT: movdqa %xmm5, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: vselect_packss:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: vselect_packss:
; AVX2: # BB#0:
; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: vselect_packss:
; AVX512: # BB#0:
; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = icmp eq <16 x i16> %a0, %a1
  %2 = sext <16 x i1> %1 to <16 x i16>
  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = shufflevector <16 x i16> %2, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %5 = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %3, <8 x i16> %4)
  %6 = and <16 x i8> %5, %a2
  %7 = xor <16 x i8> %5, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %8 = and <16 x i8> %7, %a3
  %9 = or <16 x i8> %6, %8
  ret <16 x i8> %9
}
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)